# Dataset Comparison : Body


In [1]:
### packages 

import numpy as np 
import pandas as pd

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

## for explainer
from lime import lime_text
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
import string
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import warnings
warnings.simplefilter("ignore")
from nltk import word_tokenize, corpus
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from nltk.corpus import stopwords
stopwords.words('english')
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
### Load data

# Read the preprocessed posts and drop every row with a missing value in any column.
df = pd.read_csv("processed_data.csv").dropna()

### label encoding
# Integer codes for the three sentiment classes.
labels = {"Negative": 0, "Neutral": 1, "Positive": 2}
# Encode only the target column: a frame-wide replace would also rewrite any
# text cell whose entire value happens to equal one of the label names.
# The integer cast fails fast if a label outside `labels` is still present.
df["Sentiment"] = df["Sentiment"].replace(labels).astype(int)
df

Unnamed: 0,Sentiment,lemma_meaningful,processed
0,1,"['money', 'sending', 'message']",money sending message
1,0,"['math', 'professor', 'scott', 'steiner', 'say...",math professor scott steiner say number spell ...
2,1,"['exit', 'system']",exit system
3,0,"['new', 'sec', 'filing', 'gme', 'someone', 'le...",new sec filing gme someone le retarded please ...
4,2,"['distract', 'gme', 'thought', 'amc', 'brother...",distract gme thought amc brother aware
...,...,...,...
57663,0,"['got', 'fuck', 'wife', 'morning', 'moon']",got fuck wife morning moon
57664,2,"['rkt', 'tech', 'platform', 'real', 'estate', ...",rkt tech platform real estate doubled auto dou...
57665,2,"['vonage', 'largest', 'ucaas', 'unified', 'com...",vonage largest ucaas unified communication ser...
57666,2,"['know', 'ape', 'autist', 'retard', 'sub', 'ye...",know ape autist retard sub year first exposure...


# Train/Test Split

In [7]:
## split dataset: hold out 20% of the rows, stratified on the sentiment label
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    stratify=df['Sentiment'],
    random_state=14,
)

## get text (model input) and target (label) arrays for each partition
X_train, y_train = train_data["processed"].values, train_data["Sentiment"].values
X_test, y_test = test_data["processed"].values, test_data["Sentiment"].values


# Classification and evaluation

In [5]:
## make pipeline: TF-IDF unigram features feeding a multinomial naive Bayes model
mnb_steps = [
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 1), min_df=5, max_df=0.5, max_features=5000)),
    ('classifier', MultinomialNB(fit_prior=True, alpha=0.1)),
]
mnb = Pipeline(mnb_steps)

## train classifier
mnb.fit(X_train, y_train)

## test classifier: hard labels plus per-class probabilities (the latter feed the AUC)
y_pred = mnb.predict(X_test)
predicted_prob = mnb.predict_proba(X_test)

## Accuracy, one-vs-rest AUC and the per-class precision/recall/F1 report
accuracy = metrics.accuracy_score(y_test, y_pred)
auc = metrics.roc_auc_score(y_test, predicted_prob, multi_class="ovr")
report = metrics.classification_report(y_test, y_pred)

print("Accuracy:", round(accuracy, 2))
print("Auc:", round(auc, 2))
print("Detail:")
print(report)


Accuracy: 0.83
Auc: 0.92
Detail:
              precision    recall  f1-score   support

           0       0.80      0.70      0.75      1876
           1       0.87      0.92      0.90      3504
           2       0.78      0.80      0.79      2293

    accuracy                           0.83      7673
   macro avg       0.82      0.81      0.81      7673
weighted avg       0.83      0.83      0.83      7673



In [8]:
## make pipeline: TF-IDF unigram features feeding an L2-regularised logistic regression
logreg_vectorizer = TfidfVectorizer(
    max_df=0.6,
    max_features=5000,
    min_df=4,
    ngram_range=(1, 1),
)
logreg_classifier = LogisticRegression(
    random_state=0,
    C=0.9,
    penalty='l2',
    solver='newton-cg',
)
logreg = Pipeline(steps=[
    ('vectorizer', logreg_vectorizer),
    ('classifier', logreg_classifier),
])

## train classifier
logreg.fit(X_train, y_train)

## test classifier: predicted labels and class-membership probabilities
y_pred = logreg.predict(X_test)
predicted_prob = logreg.predict_proba(X_test)

## Accuracy, one-vs-rest AUC, then the per-class report
accuracy = metrics.accuracy_score(y_test, y_pred)
auc = metrics.roc_auc_score(y_test, predicted_prob, multi_class="ovr")
for caption, value in (("Accuracy:", accuracy), ("Auc:", auc)):
    print(caption, round(value, 2))
print("Detail:")
print(metrics.classification_report(y_test, y_pred))


Accuracy: 0.87
Auc: 0.96
Detail:
              precision    recall  f1-score   support

           0       0.87      0.75      0.81      3138
           1       0.85      0.98      0.91      4057
           2       0.88      0.85      0.87      4235

    accuracy                           0.87     11430
   macro avg       0.87      0.86      0.86     11430
weighted avg       0.87      0.87      0.87     11430



In [9]:
### Define all the classifiers and get evaluation metrics

# Every model is a two-step pipeline: a TfidfVectorizer that turns the processed
# text into features, followed by the classifier (with the parameters chosen earlier).
mnb = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.5, max_features=5000, min_df=5, ngram_range=(1,1))),
    ('classifier', MultinomialNB(alpha=0.1,fit_prior=True)),])

logreg = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.6, max_features=5000, min_df=4, ngram_range=(1,1))),
    ('classifier', LogisticRegression(random_state=0, C=0.9, penalty='l2', solver='newton-cg')),])

knn = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.6, max_features=5000, min_df=4,use_idf=False)), #max_df?
    ('classifier', KNeighborsClassifier(n_neighbors=1,weights='uniform')),])

rf = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.7, max_features=5000, min_df=5,use_idf=False)),
    ('classifier', RandomForestClassifier(random_state=14,bootstrap=False,max_depth=5,max_features='sqrt',
                                          min_samples_split=3,n_estimators=110)),])

svm = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.5, max_features=5000, min_df=4,use_idf=True )),
    ('classifier', SVC(random_state=2, gamma='scale',kernel='rbf')),])

# NOTE(review): `use_label_encoder` is reported as deprecated by recent XGBoost
# releases -- confirm against the installed version before relying on it.
xgb = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.7, min_df=4,use_idf=False)),
    ('classifier', XGBClassifier(random_state=14, verbosity=1, booster='dart',learning_rate=0.05,
                                 max_depth=8,use_label_encoder=True)),]) #max_features=5000

bag = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.5, max_features=5000, min_df=4,use_idf=False )),
    ('classifier', BaggingClassifier(random_state=14, n_estimators=100)),])


##### put in all models
# (name, pipeline) pairs; the name is what appears in the score listing.
all_models = [
    ("mnb", mnb),
    ("knn", knn),
    ("rf", rf),
    ("svm", svm),
    ("xgb", xgb),
    ("bag", bag),
    ("logreg", logreg),]  # fixed: was the invalid token sequence `logr eg`


# Mean 2-fold cross-validation score on the training split for each model,
# then ranked from highest to lowest.
unsorted_scores = [(name, cross_val_score(model, X_train, y_train, cv=2).mean()) for name, model in all_models]
scores = sorted(unsorted_scores, key=lambda x: -x[1])



In [10]:
## rank the (name, mean CV score) pairs from best to worst and show them
scores = sorted(
    unsorted_scores,
    key=lambda name_and_score: -name_and_score[1],
)
print(scores)




[('svm', 0.849974848664281), ('logreg', 0.8435880149589114), ('bag', 0.8194405424758275), ('mnb', 0.7483540619914308), ('xgb', 0.7234192596960858), ('rf', 0.5435596180426263), ('knn', 0.49016817580421984)]


In [11]:
### XGBoost Classifier
### build a pipeline
# The vectorizer is left at its defaults here; min_df and use_idf are tuned by the search below.
xgb2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', XGBClassifier(random_state=14, verbosity=1)),])


### define parameters to be tested using k-fold CV
# Keys follow the Pipeline convention `<step name>__<parameter>`.
# XGBoost parameter reference: https://xgboost.readthedocs.io/en/stable/parameter.html
# NOTE(review): `use_label_encoder` is reported as deprecated by recent XGBoost
# releases -- confirm it is still worth a grid axis for the installed version.
xgb2_params = {'classifier__learning_rate': [0.03, 0.05, 0.07],
              'classifier__booster': ['dart', 'gbtree'],
              'classifier__max_depth' : [8,9,10],
              'vectorizer__min_df': [3,4],
              'classifier__use_label_encoder' : [True, False],
              'vectorizer__use_idf': [True, False],}


# Successive-halving grid search: 5-fold CV, the candidate set is halved each
# iteration (factor=2) and computation is dispatched on all the CPUs (n_jobs=-1).
# random_state is fixed so that repeated runs of the search are reproducible.
xgb_gs2 = HalvingGridSearchCV(xgb2, xgb2_params, cv=5, n_jobs=-1, verbose=1, factor=2,
                              random_state=14)
xgb_gs2 = xgb_gs2.fit(X_train, y_train)

print('Done')

n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 357
max_resources_: 45719
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 144
n_resources: 357
Fitting 5 folds for each of 144 candidates, totalling 720 fits
----------
iter: 1
n_candidates: 72
n_resources: 714
Fitting 5 folds for each of 72 candidates, totalling 360 fits
----------
iter: 2
n_candidates: 36
n_resources: 1428
Fitting 5 folds for each of 36 candidates, totalling 180 fits
----------
iter: 3
n_candidates: 18
n_resources: 2856
Fitting 5 folds for each of 18 candidates, totalling 90 fits
----------
iter: 4
n_candidates: 9
n_resources: 5712
Fitting 5 folds for each of 9 candidates, totalling 45 fits
----------
iter: 5
n_candidates: 5
n_resources: 11424
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 6
n_candidates: 3
n_resources: 22848
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 7
n_candidates: 2
n_resources: 

In [12]:
## Best model and parameters

### score of the fitted search object on both partitions
train_score = xgb_gs2.score(X_train, y_train)
print('Best score on training data:', train_score)

test_score = xgb_gs2.score(X_test, y_test)
print('Best score on testing data:', test_score)

### best cross-validated score and the parameter combination that produced it
print('Best score', xgb_gs2.best_score_)
print('Best parameters', xgb_gs2.best_params_)


Best score on training data: 0.8048951201907303
Best score on testing data: 0.7566054243219598
Best score 0.7574297771311974
Best parameters {'classifier__booster': 'dart', 'classifier__learning_rate': 0.07, 'classifier__max_depth': 10, 'classifier__use_label_encoder': True, 'vectorizer__min_df': 4, 'vectorizer__use_idf': True}


In [13]:
#### Accuracy, AUC and per-class precision / recall for the tuned XGBoost search

## test classifier (no fit call here: xgb_gs2 is used as it stands)
y_predt = xgb_gs2.predict(X_test)
predicted_probt = xgb_gs2.predict_proba(X_test)

## overall accuracy and one-vs-rest AUC computed from the class probabilities
accuracy = metrics.accuracy_score(y_test, y_predt)
auc = metrics.roc_auc_score(y_test, predicted_probt, multi_class="ovr")
detail_report = metrics.classification_report(y_test, y_predt)

print("Accuracy:", round(accuracy, 2))
print("Auc:", round(auc, 2))
print("Detail:")
print(detail_report)

Accuracy: 0.76
Auc: 0.9
Detail:
              precision    recall  f1-score   support

           0       0.82      0.50      0.62      3138
           1       0.70      0.98      0.81      4057
           2       0.81      0.74      0.77      4235

    accuracy                           0.76     11430
   macro avg       0.78      0.74      0.74     11430
weighted avg       0.77      0.76      0.75     11430

