In [7]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline

import os
# List the files in the project folder to confirm the CSVs are where we expect.
project_dir = r"C:\Users\Surface Book\OneDrive\Desktop\project"
print(os.listdir(project_dir))


['BBC News Sample Solution.csv', 'BBC News Test.csv', 'BBC News Train.csv', 'p1.ipynb']


In [10]:
# Folder that holds the BBC News CSV files, and the training file inside it.
DATA_DIR = r"C:\Users\Surface Book\OneDrive\Desktop\project"
TRAIN_FILE = "BBC News Train.csv"
TRAIN_PATH = os.path.join(DATA_DIR, TRAIN_FILE)

# Read the training data into the DataFrame `df` and preview the first rows.
df = pd.read_csv(TRAIN_PATH)
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [13]:
# Encode each category name as an integer id. factorize() numbers the
# categories in order of first appearance in the 'Category' column.
# Note: this adds the 'category_id' column to `df` in place.
df['category_id'] = df['Category'].factorize()[0]

# Preview the rows together with the new column. The former bare
# `df['category_id'][0:10]` expression was removed: it was not the last
# expression of the cell, so it was evaluated but never displayed.
df.head(20)

Unnamed: 0,ArticleId,Text,Category,category_id
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0
5,1582,howard truanted to play snooker conservative...,politics,2
6,651,wales silent on grand slam talk rhys williams ...,sport,3
7,1797,french honour for director parker british film...,entertainment,4
8,2034,car giant hit by mercedes slump a slump in pro...,business,0
9,1866,fockers fuel festive film chart comedy meet th...,entertainment,4


In [15]:
# One row per distinct (Category, category_id) pair, ordered by id.
category_id_df = (
    df.loc[:, ['Category', 'category_id']]
    .drop_duplicates()
    .sort_values(by='category_id')
)
category_id_df

Unnamed: 0,Category,category_id
0,business,0
3,tech,1
5,politics,2
6,sport,3
7,entertainment,4


In [16]:
# Build the two lookup tables from the de-duplicated category frame:
#   category_to_id : category name -> integer id
#   id_to_category : integer id    -> category name
category_names = category_id_df['Category']
category_ids = category_id_df['category_id']
category_to_id = dict(zip(category_names, category_ids))
id_to_category = dict(zip(category_ids, category_names))
id_to_category


{0: 'business', 1: 'tech', 2: 'politics', 3: 'sport', 4: 'entertainment'}

In [17]:
# Display the category-name -> integer-id mapping.
category_to_id

{'business': 0, 'tech': 1, 'politics': 2, 'sport': 3, 'entertainment': 4}

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams: sublinear term-frequency scaling, terms
# kept only if they occur in at least 5 documents, L2-normalised rows,
# English stop words removed.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit on the article text, then densify the sparse result into a plain ndarray.
tfidf_matrix = tfidf.fit_transform(df['Text'])
features = tfidf_matrix.toarray()

# Integer class label for every article.
labels = df['category_id']


In [19]:
# Shape of the dense TF-IDF matrix: one row per article, one column per kept term.
features.shape 


(1490, 9927)

In [20]:
# Peek at the (category name, id) pairs stored in the mapping.
category_to_id.items()


dict_items([('business', 0), ('tech', 1), ('politics', 2), ('sport', 3), ('entertainment', 4)])

In [21]:
# The same pairs sorted by category name (tuples sort on their first element).
sorted(category_to_id.items())

[('business', 0),
 ('entertainment', 4),
 ('politics', 2),
 ('sport', 3),
 ('tech', 1)]

In [22]:
from sklearn.feature_selection import chi2
# Number of top-ranked unigrams / bigrams to print for each category.
N = 3

# The vocabulary does not change between categories, so fetch it once instead
# of rebuilding the array on every loop iteration.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for Category, category_id in sorted(category_to_id.items()):
    # One-vs-rest test: chi2 of every term against "article is in this category".
    # chi2() returns a (statistics, p-values) pair; `features_chi2` keeps the
    # pair from the last category processed and is inspected in a later cell.
    features_chi2 = chi2(features, labels == category_id)
    # argsort is ascending, so the most strongly associated terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    # Terms are space-joined n-grams: one token = unigram, two tokens = bigram.
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Category))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))




# 'business':
  . Most correlated unigrams:
       . growth
       . bank
       . shares
  . Most correlated bigrams:
       . stock market
       . economic growth
       . analysts said
# 'entertainment':
  . Most correlated unigrams:
       . singer
       . actor
       . film
  . Most correlated bigrams:
       . won best
       . los angeles
       . box office




# 'politics':
  . Most correlated unigrams:
       . blair
       . election
       . labour
  . Most correlated bigrams:
       . prime minister
       . tony blair
       . mr blair
# 'sport':
  . Most correlated unigrams:
       . match
       . coach
       . cup
  . Most correlated bigrams:
       . grand slam
       . champions league
       . australian open
# 'tech':
  . Most correlated unigrams:
       . technology
       . software
       . users
  . Most correlated bigrams:
       . anti virus
       . mobile phones
       . mobile phone




In [23]:
# Inspect the (chi2 statistics, p-values) pair left over from the last
# iteration of the per-category loop (the final category in sorted order).
features_chi2

(array([0.13345481, 0.01563765, 0.00108776, ..., 0.00812363, 0.15434356,
        0.120306  ]),
 array([0.71487652, 0.9004835 , 0.97368962, ..., 0.92818297, 0.69441858,
        0.72870284]))

In [24]:
from sklearn.manifold import TSNE

# Project a reproducible 30% random sample of the articles to 2-D with t-SNE.
np.random.seed(0)
SAMPLE_SIZE = int(len(features) * 0.3)

# Row positions of the sampled articles, drawn without replacement.
all_rows = np.arange(len(features))
indices = np.random.choice(all_rows, size=SAMPLE_SIZE, replace=False)

tsne = TSNE(n_components=2, random_state=0)
projected_features = tsne.fit_transform(features[indices])
projected_features.shape




(447, 2)

In [25]:
# Pick the 2-D points of the sampled articles whose label equals `my_id`.
my_id = 0
sampled_labels = labels[indices]
is_selected = (sampled_labels == my_id).values
projected_features[is_selected]


array([[-4.13286686e+00, -9.20892525e+00],
       [ 4.44985104e+00,  1.30575931e+00],
       [ 7.39399910e+00, -1.22784491e+01],
       [ 1.59464378e+01, -4.11092138e+00],
       [ 1.44705992e+01, -2.87031727e+01],
       [ 1.26260519e+01, -2.34371829e+00],
       [-4.94337034e+00,  4.33154964e+00],
       [ 1.27926292e+01, -2.40895197e-01],
       [ 2.11887150e+01, -4.63770106e-02],
       [ 9.60490513e+00, -4.14878511e+00],
       [ 6.92501143e-02, -1.80034428e+01],
       [ 5.69708633e+00, -7.60759878e+00],
       [ 2.77500877e+01,  1.80671978e+01],
       [ 1.65667713e+00, -4.33841705e+00],
       [ 1.82398529e+01, -5.45124149e+00],
       [ 5.64697361e+00, -6.05977535e+00],
       [ 5.02676773e+00,  5.62201440e-02],
       [ 1.46825180e+01, -1.95173347e+00],
       [ 5.08124173e-01, -8.35781860e+00],
       [-1.57865810e+00,  4.48549318e+00],
       [ 1.05475798e+01,  7.59943342e+00],
       [-2.92681146e+00, -2.31089449e+00],
       [ 7.57700491e+00, -1.17490501e+01],
       [ 3.

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_val_score


# Candidate classifiers to compare with cross-validation.
random_forest = RandomForestClassifier(n_estimators=200, max_depth=100, random_state=0)
naive_bayes = MultinomialNB()
logistic_regression = LogisticRegression(random_state=30)

models = [random_forest, naive_bayes, logistic_regression]


In [27]:
# Number of cross-validation folds used for every model.
CV = 5

# Accumulator for (model_name, fold_idx, accuracy) rows.
# The former placeholder `cv_df = pd.DataFrame(index=...)` was dropped: it was
# never read, and `cv_df` is built from `entries` once the scores are collected.
entries = []


In [28]:
# Start from an empty list on every run, so re-executing this cell does not
# append a second copy of each model's fold scores.
entries = []

# Score every candidate model with CV-fold cross-validation and record one
# (model_name, fold_idx, accuracy) row per fold.
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))


In [29]:
# Turn the collected (model_name, fold_idx, accuracy) tuples into a table.
cv_columns = ['model_name', 'fold_idx', 'accuracy']
cv_df = pd.DataFrame(data=entries, columns=cv_columns)


In [30]:
# Mean cross-validated accuracy for each model.
cv_df.groupby('model_name')['accuracy'].mean()


model_name
LogisticRegression        0.975168
MultinomialNB             0.972483
RandomForestClassifier    0.955705
Name: accuracy, dtype: float64

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the articles; the DataFrame index is split alongside so the
# rows of each partition can be traced back to `df`.
# NOTE(review): `features` was vectorized on the full corpus before this split,
# so the held-out rows already influenced the TF-IDF vocabulary and weights.
split = train_test_split(features, labels, df.index, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test, indices_train, indices_test = split

# Fit the classifier on the training partition (fit() returns the estimator).
model = LogisticRegression(random_state=0).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the held-out rows.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)


In [33]:
# Try the fitted model on a few hand-written headlines.
texts = [
    "Tesla made new computer",
    "Bitcoin is 30000 dollar",
    "UC Berkeley beat Stanford in soccer game",
]

# Reuse the already-fitted vectorizer so the columns match the training features.
text_features = tfidf.transform(texts)
predictions = model.predict(text_features)

for text, predicted in zip(texts, predictions):
    category_name = id_to_category[predicted]
    print('"{}"'.format(text))
    print("  - Predicted as: '{}'".format(category_name))
    print("")


"Tesla made new computer"
  - Predicted as: 'tech'

"Bitcoin is 30000 dollar"
  - Predicted as: 'business'

"UC Berkeley beat Stanford in soccer game"
  - Predicted as: 'sport'

