In [1]:
from pprint import pprint

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import acquire as a
import prepare as p

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [2]:
df = p.prep_article(a.get_news_articles())
df = df[['clean','category']]

In [3]:
df.head()

Unnamed: 0,clean,category
0,rbi thursday kept repo rate unchang five cut y...,business
1,former financ minist p chidambaram thursday sa...,business
2,googl cofound sergey brin larri page ad combin...,business
3,googl yearold indiaborn ceo sundar pichai rece...,business
4,softbank founder ceo masayoshi son discuss ali...,business


In [4]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.clean)
y = df.category

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

In [6]:
y_train.shape

(80,)

In [7]:
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

In [8]:
lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)



# tfidf regression train

In [9]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.50%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business             19              0       0           1
entertainment         0             20       0           0
sports                0              0      20           0
technology            1              0       0          19
---
               precision    recall  f1-score   support

     business       0.95      0.95      0.95        20
entertainment       1.00      1.00      1.00        20
       sports       1.00      1.00      1.00        20
   technology       0.95      0.95      0.95        20

     accuracy                           0.97        80
    macro avg       0.98      0.98      0.98        80
 weighted avg       0.97      0.97      0.97        80



# tfidf regression train

In [10]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 75.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business              2              0       0           1
entertainment         0              4       0           0
sports                1              0       5           0
technology            2              1       0           4
---
               precision    recall  f1-score   support

     business       0.67      0.40      0.50         5
entertainment       1.00      0.80      0.89         5
       sports       0.83      1.00      0.91         5
   technology       0.57      0.80      0.67         5

     accuracy                           0.75        20
    macro avg       0.77      0.75      0.74        20
 weighted avg       0.77      0.75      0.74        20



In [11]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.clean)
y = df.category

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

In [12]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [13]:
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

# tfidf knn train

In [14]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.89


In [15]:
print(classification_report(y_train, y_pred))

               precision    recall  f1-score   support

     business       0.74      1.00      0.85        20
entertainment       0.95      0.95      0.95        20
       sports       0.95      0.90      0.92        20
   technology       1.00      0.70      0.82        20

     accuracy                           0.89        80
    macro avg       0.91      0.89      0.89        80
 weighted avg       0.91      0.89      0.89        80



# tfidf knn train

In [16]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of KNN classifier on test set: 0.80


In [17]:
raw_count = pd.Series(" ".join(df.clean).split()).value_counts()

df_tf = (pd.DataFrame({'raw_count': raw_count}))

In [18]:
df_tf.head()

Unnamed: 0,raw_count
said,74
ad,45
india,31
googl,18
ceo,18


In [19]:
count = CountVectorizer()
X = count.fit_transform(df.clean)
y = df.category

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))


knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [21]:
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

In [22]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.68


In [23]:
print(classification_report(y_train, y_pred))

               precision    recall  f1-score   support

     business       1.00      0.60      0.75        20
entertainment       0.61      0.85      0.71        20
       sports       0.61      1.00      0.75        20
   technology       0.71      0.25      0.37        20

     accuracy                           0.68        80
    macro avg       0.73      0.68      0.65        80
 weighted avg       0.73      0.68      0.65        80



In [24]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of KNN classifier on test set: 0.50


In [25]:
count = CountVectorizer()
X = count.fit_transform(df.clean)
y = df.category

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)




In [26]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.50%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business             18              0       0           0
entertainment         0             20       0           0
sports                0              0      20           0
technology            2              0       0          20
---
               precision    recall  f1-score   support

     business       1.00      0.90      0.95        20
entertainment       1.00      1.00      1.00        20
       sports       1.00      1.00      1.00        20
   technology       0.91      1.00      0.95        20

     accuracy                           0.97        80
    macro avg       0.98      0.97      0.97        80
 weighted avg       0.98      0.97      0.97        80



In [27]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 80.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business              4              0       1           0
entertainment         0              5       0           2
sports                0              0       4           0
technology            1              0       0           3
---
               precision    recall  f1-score   support

     business       0.80      0.80      0.80         5
entertainment       0.71      1.00      0.83         5
       sports       1.00      0.80      0.89         5
   technology       0.75      0.60      0.67         5

     accuracy                           0.80        20
    macro avg       0.82      0.80      0.80        20
 weighted avg       0.82      0.80      0.80        20

