In [40]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline

import os
# Show what is in the working directory so the reader can confirm that the
# input CSV files are present before they are loaded.
working_dir_entries = os.listdir(r".")
print(working_dir_entries)



['classification-algorithms.ipynb', 'Generator', 'requirements.txt', 'Training.csv', 'zoomlink.csv']


In [42]:
# Path to the labelled training set, relative to the working directory.
TRAIN_PATH = os.path.join(r".", "Training.csv")

# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=TRAIN_PATH)
df.head(n=5)

Unnamed: 0,ArticleId,Text,Category
0,1,https://berkeley.zoom.us/j/41810383029,zoomlink
1,2,https://berkeley.zoom.us/j/23269085011,zoomlink
2,3,https://berkeley.zoom.us/j/47580519432,zoomlink
3,4,https://berkeley.zoom.us/j/83750991624,zoomlink
4,5,https://berkeley.zoom.us/j/83562172419,zoomlink


In [43]:
# Encode every distinct Category string as an integer id. factorize() returns
# a (codes, uniques) pair; element [0] is the per-row integer code.
df['category_id'] = df['Category'].factorize()[0]

# Preview the first 20 rows including the new column. (A bare expression that
# is not the last line of a cell is evaluated and thrown away, so the preview
# is the only expression kept here.)
df.head(20)

Unnamed: 0,ArticleId,Text,Category,category_id
0,1,https://berkeley.zoom.us/j/41810383029,zoomlink,0
1,2,https://berkeley.zoom.us/j/23269085011,zoomlink,0
2,3,https://berkeley.zoom.us/j/47580519432,zoomlink,0
3,4,https://berkeley.zoom.us/j/83750991624,zoomlink,0
4,5,https://berkeley.zoom.us/j/83562172419,zoomlink,0
5,6,https://berkeley.zoom.us/j/11597374826,zoomlink,0
6,7,https://berkeley.zoom.us/j/23136749058,zoomlink,0
7,8,https://berkeley.zoom.us/j/05437112866,zoomlink,0
8,9,https://berkeley.zoom.us/j/91301078356,zoomlink,0
9,10,https://berkeley.zoom.us/j/75320904571,zoomlink,0


In [44]:
# Reduce the frame to one row per distinct (Category, category_id) pair and
# order those rows by the numeric id.
category_pairs = df[['Category', 'category_id']]
category_id_df = category_pairs.drop_duplicates().sort_values(by='category_id')
category_id_df

Unnamed: 0,Category,category_id
0,zoomlink,0
101,business,1
104,tech,2
106,politics,3
107,others,4
108,entertainment,5


In [45]:
# Two lookup tables built from the (Category, category_id) rows:
#   category_to_id : category name -> integer id
#   id_to_category : integer id    -> category name
category_to_id = {name: cat_id for name, cat_id in category_id_df.values}
id_to_category = {cat_id: name for name, cat_id in category_id_df.values}
id_to_category


{0: 'zoomlink',
 1: 'business',
 2: 'tech',
 3: 'politics',
 4: 'others',
 5: 'entertainment'}

In [46]:
# Display the category name -> integer id lookup.
category_to_id

{'zoomlink': 0,
 'business': 1,
 'tech': 2,
 'politics': 3,
 'others': 4,
 'entertainment': 5}

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams with English stop words removed.
# min_df=5 is the document-frequency cutoff for keeping a term, and norm='l2'
# selects L2 normalisation of each document vector.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Learn the vocabulary from the Text column, then densify the sparse result
# into a plain ndarray with one row per document.
text_matrix = tfidf.fit_transform(df.Text)
features = text_matrix.toarray()

# Integer class label for each row of `features`.
labels = df.category_id


In [48]:
# Shape of the dense TF-IDF matrix: (number of documents, number of kept terms).
features.shape 


(1591, 9932)

In [49]:
# View the lookup as (category name, id) pairs in dictionary insertion order.
category_to_id.items()


dict_items([('zoomlink', 0), ('business', 1), ('tech', 2), ('politics', 3), ('others', 4), ('entertainment', 5)])

In [50]:
# The same pairs sorted alphabetically by category name (tuples compare on
# their first element first); this is the iteration order used further down.
sorted(category_to_id.items())

[('business', 1),
 ('entertainment', 5),
 ('others', 4),
 ('politics', 3),
 ('tech', 2),
 ('zoomlink', 0)]

In [51]:
from sklearn.feature_selection import chi2

N = 3  # how many top-scoring terms to print per n-gram size

# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Fall back to the
# old name only on installs that predate the new one. The vocabulary does not
# change between categories, so the array is built once, outside the loop.
if hasattr(tfidf, "get_feature_names_out"):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.asarray(tfidf.get_feature_names())

for Category, category_id in sorted(category_to_id.items()):
  # One-vs-rest test: the target is True for rows of this category only.
  features_chi2 = chi2(features, labels == category_id)
  # Ascending sort on the first returned array (the chi-squared statistics),
  # so the strongest terms end up at the tail.
  indices = np.argsort(features_chi2[0])
  feature_names = all_feature_names[indices]
  # Split by n-gram size: a term with no space is a unigram, one space a bigram.
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(Category))
  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))




# 'business':
  . Most correlated unigrams:
       . growth
       . bank
       . shares
  . Most correlated bigrams:
       . analysts said
       . berkeley zoom
       . https berkeley
# 'entertainment':
  . Most correlated unigrams:
       . singer
       . actor
       . film
  . Most correlated bigrams:
       . won best
       . los angeles
       . box office
# 'others':
  . Most correlated unigrams:
       . match
       . coach
       . cup
  . Most correlated bigrams:
       . https berkeley
       . champions league
       . australian open
# 'politics':
  . Most correlated unigrams:
       . blair
       . election
       . labour
  . Most correlated bigrams:
       . prime minister
       . tony blair
       . mr blair
# 'tech':
  . Most correlated unigrams:
       . technology
       . software
       . users
  . Most correlated bigrams:
       . anti virus
       . mobile phones
       . mobile phone
# 'zoomlink':
  . Most correlated unigrams:
       . zoom
       . be



In [52]:
# NOTE(review): `features_chi2` is whatever the final iteration of the chi2
# loop left behind, i.e. the result for the category that sorts last
# alphabetically, not a combined result across all categories.
features_chi2

(array([1.12957856e+00, 3.36660754e-02, 6.21357272e-02, ...,
        6.60311144e+02, 4.90372229e-02, 3.81890915e-02]),
 array([2.87865251e-001, 8.54418919e-001, 8.03151634e-001, ...,
        1.27840164e-145, 8.24747126e-001, 8.45063930e-001]))

In [53]:
from sklearn.manifold import TSNE

# Project a random 30% subset of the documents to 2-D; the seed and
# random_state are fixed so the sample and the embedding are repeatable.
n_documents = len(features)
SAMPLE_SIZE = int(n_documents * 0.3)

np.random.seed(0)
# Row positions of the sampled documents, drawn without replacement.
indices = np.random.choice(range(n_documents), size=SAMPLE_SIZE, replace=False)

tsne = TSNE(n_components=2, random_state=0)
projected_features = tsne.fit_transform(features[indices])
projected_features.shape




(477, 2)

In [54]:
# Category id whose projected points should be shown.
my_id = 0

# Boolean mask over the sampled rows: True where the sampled document's label
# equals `my_id`. `.values` strips the pandas index so the mask lines up
# positionally with `projected_features`.
is_my_category = (labels[indices] == my_id).values
projected_features[is_my_category]


array([[ 56.466904, -22.881916],
       [ 52.27793 , -21.63747 ],
       [ 55.35656 , -18.410301],
       [ 52.39015 , -25.02835 ],
       [ 55.197838, -22.986044],
       [ 52.47178 , -22.959036],
       [ 53.92324 , -18.479593],
       [ 53.72657 , -21.109371],
       [ 51.484203, -23.943062],
       [ 53.899048, -19.868244],
       [ 50.96417 , -21.235039],
       [ 53.726154, -25.485126],
       [ 56.800587, -21.643253],
       [ 53.34045 , -22.18226 ],
       [ 56.599224, -19.15022 ],
       [ 56.48196 , -25.198023],
       [ 54.031216, -23.1078  ],
       [ 51.046597, -22.622475],
       [ 57.44992 , -20.18878 ],
       [ 51.54936 , -19.93885 ],
       [ 52.58808 , -18.952946],
       [ 55.144886, -19.762875],
       [ 53.24129 , -23.953127],
       [ 55.643757, -21.851334],
       [ 54.51444 , -22.019596],
       [ 58.181194, -21.40362 ],
       [ 54.531483, -24.210386],
       [ 55.096775, -25.511183],
       [ 57.382797, -24.065126],
       [ 52.763485, -20.486076],
       [ 5

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_val_score

# Candidate classifiers to compare with cross-validation. The order of this
# list is the order in which they are evaluated.
random_forest = RandomForestClassifier(n_estimators=200, max_depth=100, random_state=0)
naive_bayes = MultinomialNB()
logistic_regression = LogisticRegression(random_state=30)

models = [random_forest, naive_bayes, logistic_regression]


In [56]:
# Number of cross-validation folds used for every model.
CV = 5

# Accumulator for (model_name, fold_idx, accuracy) records. The results frame
# is built directly from this list once scoring is done, so no placeholder
# DataFrame is pre-allocated here.
entries = []




In [58]:
# Start from an empty list inside this cell so that re-running it replaces the
# previous scores instead of appending a second copy of every fold.
entries = []
for model in models:
  model_name = model.__class__.__name__
  # cross_val_score returns one accuracy value per fold.
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))


In [59]:
# Tabulate the per-fold scores: one row per (model, fold) record.
CV_COLUMNS = ['model_name', 'fold_idx', 'accuracy']
cv_df = pd.DataFrame(data=entries, columns=CV_COLUMNS)


In [60]:
# Average the fold accuracies for each model.
accuracy_by_model = cv_df.groupby('model_name')['accuracy']
accuracy_by_model.mean()


model_name
LogisticRegression        0.976745
MultinomialNB             0.973605
RandomForestClassifier    0.962925
Name: accuracy, dtype: float64

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. df.index is split alongside the
# features and labels so each split row can be traced back to its source row.
split = train_test_split(
    features,
    labels,
    df.index,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test, indices_train, indices_test = split

# Fit the classifier on the training portion only.
model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)

# Class probabilities and hard class predictions for the held-out rows.
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)


In [62]:
# Free-form inputs to classify with the fitted model.
# NOTE(review): the output saved with this cell lists three different texts,
# so it was produced by an earlier version of `texts`; re-run to refresh it.
texts = ["https://bcourses.berkeley.edu/courses/1516807"]

# transform() (not fit_transform()) reuses the already-fitted vectorizer.
text_features = tfidf.transform(texts)
predictions = model.predict(text_features)

for text, predicted in zip(texts, predictions):
  predicted_label = id_to_category[predicted]
  print('"{}"'.format(text))
  print("  - Predicted as: '{}'".format(predicted_label))
  print("")



"https://berkeley.zoom.us/j/95496378974"
  - Predicted as: 'zoomlink'

"Apple bought Samsung for 100billion dollars and their stock went up"
  - Predicted as: 'business'

"Team korea scored 7 goals against team japan in soccer game"
  - Predicted as: 'others'

