In [145]:
import pandas as pd
import spacy
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
%matplotlib inline

import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
#!pip install spacy

In [None]:
# importing libraries for NLP
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
import re

In [39]:
def make_confusion_matrix(model,x_actual,y_actual,labels=[1, 0], path=''):
    '''
    model : classifier to predict values of X
    y_actual : ground truth  
    
    '''
    y_predict = model.predict(x_actual)
    cm=metrics.confusion_matrix( y_actual, y_predict, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index = [i for i in ["Actual - No","Actual - Yes"]],
                  columns = [i for i in ['Predicted - No','Predicted - Yes']])
    group_counts = ["{0:0.0f}".format(value) for value in
                cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in
                         cm.flatten()/np.sum(cm)]
    # labels = [f"{v1}\n{v2}" for v1, v2 in
    #           zip(group_counts,group_percentages)]
    # labels = np.asarray(labels).reshape(2,2)
    plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, fmt='')#,annot=labels)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(path, transparent=True)

# Importing Finished DataFrame for modeling

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
train_val = pd.read_parquet('/content/drive/MyDrive/EiT/Week_6/Presentation_6/Datasets/train_val_df.parquet', engine='pyarrow')
# train_val.drop(columns=['Unnamed: 0'], inplace=True)
# test = pd.read_csv('/content/drive/MyDrive/EiT/Week_6/Presentation_6/Datasets/test_df.csv')

In [None]:
train_val.info()

In [4]:
train_val.labels.value_counts()

Entertainment    200
Health           200
Economy          200
Environment      200
Technology       200
Sports           200
Education        200
Name: labels, dtype: int64

In [44]:
train_val.columns

Index(['clean', 'complete', 'comp_stem', 'comp_lem', 'summary', 'sum_stem',
       'sum_lem', 'labels'],
      dtype='object')

## Splitting Data and building Random_Forest Model with Pipeline

In [185]:
X_sum = train_val['sum_lem']
X_alp = train_val['comp_lem']
y_sum = train_val['labels']
y_alp = train_val['labels']

X_train_sum, X_test_sum, y_train_sum, y_test_sum = train_test_split(X_sum, y_sum, test_size=0.25, random_state=2)
X_train_alp, X_test_alp, y_train_alp, y_test_alp = train_test_split(X_alp, y_alp, test_size=0.25, random_state=2)

In [11]:
tfidvec = TfidfVectorizer(ngram_range=(1,2), max_features=50000)
tfid = tfidvec.fit_transform(X_alp)
tfid_ = tfid.toarray()
tfid_df = pd.DataFrame(tfid_, columns=tfidvec.get_feature_names())
print(tfid_df.shape)
tfid_df

(1400, 50000)


Unnamed: 0,aaron,aaron bolton,aaron die,aaronson,aarp,aarp foundat,abaco,abaco island,abandon,abat,abat opioid,abba,abbi,abbott,abbrevi,abc,abdi,abdul,abdulhadi,abdullah,abeeha,abel,abel prize,abernathi,abha,abha bhattarai,abhorr,abid,abigail,abigail barlow,abigail censki,abil,abil deliv,abil get,abil initi,abil make,abil pay,abil peopl,abil produc,abilen,...,zhan beleniuk,zhang,zhang puyuan,zhang yime,zhao,zhengzhou,zhou,zhou enlai,ziad,ziad buchh,zialcita,zigzag,zillow,zima,zimbabw,zimbalist,zip,zip code,ziprecruit,zitzlsperg,zobrist,zoloth,zombi,zombi invas,zombi movi,zomorodi,zone,zoo,zoom,zoom call,zoom meet,zoë,zubaydah,zucker,zuckerberg,zuckerberg testifi,zuckerberg told,zverev,zwart,zwart piet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098959,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


## complete

In [46]:
text_clf_alp = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(n_jobs=-1, random_state=2))], verbose=True)

text_clf_alp.fit(X_train_alp, y_train_alp)  

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.5s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   1.3s


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [47]:
predictions = text_clf_alp.predict(X_test_alp)

print(confusion_matrix(y_test_alp,predictions))
print()
print(classification_report(y_test_alp, predictions))
print()
print(accuracy_score(y_test_alp, predictions))

[[29  3  3  1  1  2  6]
 [ 4 46  2  1  1  1  0]
 [ 1  2 42  1  1  2  3]
 [ 0  0  2 40  0  0  0]
 [ 2  6  4  1 32  3  2]
 [ 1  3  3  0  0 49  1]
 [ 9  3  6  0  2  2 27]]

               precision    recall  f1-score   support

      Economy       0.63      0.64      0.64        45
    Education       0.73      0.84      0.78        55
Entertainment       0.68      0.81      0.74        52
  Environment       0.91      0.95      0.93        42
       Health       0.86      0.64      0.74        50
       Sports       0.83      0.86      0.84        57
   Technology       0.69      0.55      0.61        49

     accuracy                           0.76       350
    macro avg       0.76      0.76      0.75       350
 weighted avg       0.76      0.76      0.75       350


0.7571428571428571


## summary

In [48]:
text_clf_sum = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(n_jobs=-1, random_state=2))], verbose=True)

text_clf_sum.fit(X_train_sum, y_train_sum)  

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.2s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.9s


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [49]:
predictions = text_clf_sum.predict(X_test_sum)

print(confusion_matrix(y_test_sum,predictions))
print()
print(classification_report(y_test_sum, predictions))
print()
print(accuracy_score(y_test_sum, predictions))

[[34  1  5  2  0  0  3]
 [ 2 45  4  1  2  1  0]
 [ 3  3 39  2  0  3  2]
 [ 1  0  1 39  1  0  0]
 [ 2  5  4  3 32  3  1]
 [ 0  2  2  3  0 50  0]
 [ 9  2  4  3  3  2 26]]

               precision    recall  f1-score   support

      Economy       0.67      0.76      0.71        45
    Education       0.78      0.82      0.80        55
Entertainment       0.66      0.75      0.70        52
  Environment       0.74      0.93      0.82        42
       Health       0.84      0.64      0.73        50
       Sports       0.85      0.88      0.86        57
   Technology       0.81      0.53      0.64        49

     accuracy                           0.76       350
    macro avg       0.76      0.76      0.75       350
 weighted avg       0.77      0.76      0.75       350


0.7571428571428571


## comp_stem

### Test cases

In [207]:
text_clf_alp = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(max_depth= 8, n_estimators= 1250, n_jobs=-1, random_state=2))])

text_clf_alp.fit(X_train_alp, y_train_alp)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=8, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
 

In [208]:
predictions = text_clf_alp.predict(X_test_alp)

print(confusion_matrix(y_test_alp,predictions))
print()
print(classification_report(y_test_alp, predictions))
print()
print(accuracy_score(y_test_alp, predictions))

[[35  1  1  2  0  2  4]
 [ 2 47  3  1  1  1  0]
 [ 2  1 44  1  0  1  3]
 [ 0  0  1 40  1  0  0]
 [ 1  3  4  1 36  4  1]
 [ 0  0  3  3  0 51  0]
 [ 9  0  6  3  2  2 27]]

               precision    recall  f1-score   support

      Economy       0.71      0.78      0.74        45
    Education       0.90      0.85      0.88        55
Entertainment       0.71      0.85      0.77        52
  Environment       0.78      0.95      0.86        42
       Health       0.90      0.72      0.80        50
       Sports       0.84      0.89      0.86        57
   Technology       0.77      0.55      0.64        49

     accuracy                           0.80       350
    macro avg       0.80      0.80      0.79       350
 weighted avg       0.81      0.80      0.80       350


0.8


### Best Case Params

In [None]:
#  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
#                                         class_weight=None, criterion='gini',
#                                         max_depth=10, max_features='auto',
#                                         max_leaf_nodes=None, max_samples=None,
#                                         min_impurity_decrease=0.0,
#                                         min_impurity_split=None,
#                                         min_samples_leaf=2, min_samples_split=3,
#                                         min_weight_fraction_leaf=0.0,
#                                         n_estimators=250, n_jobs=-1,
#                                         oob_score=False, random_state=2,
#                                         verbose=0, warm_start=False))]

In [95]:
predictions = best_grid.predict(X_test_alp)

print(confusion_matrix(y_test_alp,predictions))
print()
print(classification_report(y_test_alp, predictions))
print()
print(accuracy_score(y_test_alp, predictions))

[[32  1  0  0  0  0  0  0]
 [ 2 23  0  0  0  2  0  2]
 [ 1  1 31  0  1  0  1  0]
 [ 1  0  0 24  0  2  0  1]
 [ 0  2  1  2 25  0  0  0]
 [ 0  2  2  1  1 27  0  0]
 [ 0  1  1  0  0  0 38  0]
 [ 0  2  8  2  1  1  0 22]]

               precision    recall  f1-score   support

     business       0.89      0.97      0.93        33
    education       0.72      0.79      0.75        29
entertainment       0.72      0.89      0.79        35
  environment       0.83      0.86      0.84        28
       health       0.89      0.83      0.86        30
     politics       0.84      0.82      0.83        33
       sports       0.97      0.95      0.96        40
         tech       0.88      0.61      0.72        36

     accuracy                           0.84       264
    macro avg       0.84      0.84      0.84       264
 weighted avg       0.85      0.84      0.84       264


0.8409090909090909


### Base Case

In [32]:
text_clf_alp = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(n_jobs=-1, random_state=2))])

text_clf_alp.fit(X_train_alp, y_train_alp)  

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [33]:
predictions = text_clf_alp.predict(X_test_alp)

print(confusion_matrix(y_test_alp,predictions))
print()
print(classification_report(y_test_alp, predictions))
print()
print(accuracy_score(y_test_alp, predictions))

[[39  0  1  1  0  1  3]
 [ 1 47  4  0  1  2  0]
 [ 2  1 41  1  1  3  3]
 [ 0  0  0 40  2  0  0]
 [ 1  2  4  2 38  2  1]
 [ 1  1  3  0  1 50  1]
 [ 8  1  5  2  3  3 27]]

               precision    recall  f1-score   support

      Economy       0.75      0.87      0.80        45
    Education       0.90      0.85      0.88        55
Entertainment       0.71      0.79      0.75        52
  Environment       0.87      0.95      0.91        42
       Health       0.83      0.76      0.79        50
       Sports       0.82      0.88      0.85        57
   Technology       0.77      0.55      0.64        49

     accuracy                           0.81       350
    macro avg       0.81      0.81      0.80       350
 weighted avg       0.81      0.81      0.80       350


0.8057142857142857


In [None]:
predictions = text_clf_alp.predict(X_test_alp)
labs = ['Econ','Edu','Ent','Envir','Heal','Sports','Tech']
array = confusion_matrix(y_test_alp,predictions)
df_cm = pd.DataFrame(array, index = labs, columns = labs)
plt.figure(figsize=(20,20))
sns.heatmap(df_cm, annot=True, cmap='PiYG', annot_kws={"size": 30})
plt.title('Confusion Matrix Heatmap', fontsize=48)
plt.xticks(fontsize=30, rotation=30)
plt.yticks(fontsize=30, rotation=30)
plt.ylabel('Predicted', fontsize=22)
plt.xlabel('Actual', fontsize=22)
plt.savefig('/content/drive/MyDrive/EiT/Week_6/Presentation_6/Visualizations/Confusion_Matrix', transparent=True)
plt.show()

## sum_stem

In [53]:
text_clf_sum = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(n_jobs=-1, random_state=2))], verbose=True)

text_clf_sum.fit(X_train_sum, y_train_sum)  

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.2s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.7s


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [54]:
predictions = text_clf_sum.predict(X_test_sum)

print(confusion_matrix(y_test_sum,predictions))
print()
print(classification_report(y_test_sum, predictions))
print()
print(accuracy_score(y_test_sum, predictions))

[[30  1  5  2  0  0  7]
 [ 3 47  1  1  2  1  0]
 [ 3  3 41  2  1  0  2]
 [ 0  0  2 37  1  0  2]
 [ 2  6  4  2 31  4  1]
 [ 2  2  2  0  0 50  1]
 [ 7  1  6  4  3  3 25]]

               precision    recall  f1-score   support

      Economy       0.64      0.67      0.65        45
    Education       0.78      0.85      0.82        55
Entertainment       0.67      0.79      0.73        52
  Environment       0.77      0.88      0.82        42
       Health       0.82      0.62      0.70        50
       Sports       0.86      0.88      0.87        57
   Technology       0.66      0.51      0.57        49

     accuracy                           0.75       350
    macro avg       0.74      0.74      0.74       350
 weighted avg       0.75      0.75      0.74       350


0.7457142857142857


## comp_lem

In [138]:
text_clf_alp = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(max_depth= 6, n_estimators= 5000, n_jobs=-1, random_state=2))], verbose=True)

text_clf_alp.fit(X_train_alp, y_train_alp)  

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.5s
[Pipeline] ............... (step 2 of 2) Processing clf, total=  15.5s


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=6, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
 

In [140]:
predictions = text_clf_alp.predict(X_test_alp)
preds = text_clf_alp.predict(X_train_alp)

print(confusion_matrix(y_test_alp,predictions))
print()
# print(classification_report(y_test_alp, predictions))
print()
print('Train acc: ',accuracy_score(y_train_alp, preds)) 
print('Val acc: ',accuracy_score(y_test_alp, predictions))

[[35  1  1  2  0  1  5]
 [ 2 47  3  1  1  1  0]
 [ 2  1 45  1  0  0  3]
 [ 0  0  1 39  2  0  0]
 [ 1  4  4  1 35  4  1]
 [ 0  0  3  3  0 51  0]
 [ 8  0  5  3  2  3 28]]


Train acc:  0.9285714285714286
Val acc:  0.8


In [177]:
text = 'Beyond individual developer preferences, we also have to accommodate the fact that station API users can see a lot of story content, photos and audio assets that public API users can\'t.'

In [182]:
text = '''Beyond individual developer preferences, we also have to 
          accommodate the fact that station API users can see a lot 
          of story content, photos and audio assets that public API users can\'t.'''

nlp = spacy.load('en_core_web_sm')
doc = nlp(str(text))
lemmed = ''
for sent in doc.sents:
  temp = ''
  for word in sent:
    temp += ' '+ word.lemma_
  lemmed += ' '+ temp
lemmed = [lemmed]

In [183]:
text_clf_alp.predict(lemmed)

array(['Technology'], dtype=object)

## sum_lem

In [205]:
text_clf_sum = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(max_depth= 7, n_estimators= 500,n_jobs=-1, random_state=2))], verbose=True)

text_clf_sum.fit(X_train_sum, y_train_sum)  

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.2s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   1.5s


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=7, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
 

In [206]:
predictions = text_clf_sum.predict(X_test_sum)
preds = text_clf_alp.predict(X_train_sum)

print(confusion_matrix(y_test_sum,predictions))
print()
print(classification_report(y_test_sum, predictions))
print()
print('Train acc: ',accuracy_score(y_train_sum, preds)) 
print('Val acc: ',accuracy_score(y_test_sum, predictions))

[[31  1  5  2  0  0  6]
 [ 4 47  1  1  2  0  0]
 [ 3  2 40  0  0  3  4]
 [ 0  0  1 40  1  0  0]
 [ 3  5  2  4 32  3  1]
 [ 1  1  0  3  0 51  1]
 [10  1  5  4  3  2 24]]

               precision    recall  f1-score   support

      Economy       0.60      0.69      0.64        45
    Education       0.82      0.85      0.84        55
Entertainment       0.74      0.77      0.75        52
  Environment       0.74      0.95      0.83        42
       Health       0.84      0.64      0.73        50
       Sports       0.86      0.89      0.88        57
   Technology       0.67      0.49      0.56        49

     accuracy                           0.76       350
    macro avg       0.75      0.76      0.75       350
 weighted avg       0.76      0.76      0.75       350


Train acc:  0.8533333333333334
Val acc:  0.7571428571428571
