In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('labelled.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Preprocessing of dataset

* Encode the 'label' column to integers

* Concatenate summaryClean and reviewCleanLemm to include information on both summary and the actual review itself

* Split dataset into train and test

In [3]:
le = LabelEncoder()
df['label_cat'] = le.fit_transform(df['label'])

In [4]:
df['summaryReview'] = df.summaryClean + ' ' + df.reviewCleanLemm

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df[['summaryReview']], df['label_cat'], shuffle=True)

## Tuning parameters of TfidfVectorizer

Setup an sklearn pipeline, and use LogisticRegression as a basis of comparison for the different sets of hyperparameters. 

LR is used as it gave the best performance with default parameters when used on TfidfVectorizer with default parameters as well

In [None]:
pipeline = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('logreg', LogisticRegression())
])

pipeline.fit(X_train.summaryReview, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('tfidf', TfidfVectorizer()), ('logreg', LogisticRegression())])

In [None]:
metrics.accuracy_score(y_test, pipeline.predict(X_test.summaryReview))

0.88408

In [None]:
parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': np.linspace(0.00, 0.2, 11)[1:],
    'tfidf__min_df': range(3, 11),
#     'tfidf__max_features': np.linspace(3000, 10000, 8)
}

In [None]:
%%time

clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=5)
clf.fit(X_train.summaryReview, y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits
CPU times: user 53.9 s, sys: 22 s, total: 1min 15s
Wall time: 13min 3s


GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('logreg', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'tfidf__max_df': array([0.02, 0.04, 0.06, 0.08, 0.1 , 0.12, 0.14, 0.16, 0.18, 0.2 ]),
                         'tfidf__min_df': range(3, 11),
                         'tfidf__ngram_range': [(1, 1), (1, 2)]},
             verbose=5)

### Tune again with an updated set of parameter space to give more conclusive results

In [None]:
parameters_updated = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1,3)],
    'tfidf__max_df': np.linspace(0.16, 0.21, 6),
    'tfidf__min_df': range(7, 15),
#     'tfidf__max_features': np.linspace(3000, 10000, 8)
}

In [None]:
clf_updated = GridSearchCV(pipeline, parameters_updated, n_jobs=-1, verbose=5)
clf_updated.fit(X_train.summaryReview, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('logreg', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'tfidf__max_df': array([0.16, 0.17, 0.18, 0.19, 0.2 , 0.21]),
                         'tfidf__min_df': range(7, 15),
                         'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]},
             verbose=5)

In [None]:
clf.best_params_

{'tfidf__max_df': 0.18, 'tfidf__min_df': 9, 'tfidf__ngram_range': (1, 2)}

# Using this parameters, vectorize summaryReview column

In [7]:
tfidf_vect = TfidfVectorizer(max_df=0.18, min_df=9, ngram_range=(1, 2))

X_train_dtm = tfidf_vect.fit_transform(X_train.summaryReview)
X_test_dtm = tfidf_vect.transform(X_test.summaryReview)

### Tune Logistic Regression model

In [None]:
pipeline_logreg = Pipeline([
                     ('tfidf', TfidfVectorizer(max_df=0.18, min_df=9, ngram_range=(1, 2))),
                     ('logreg', LogisticRegression())
])

pipeline_logreg.fit(X_train.summaryReview, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.18, min_df=9, ngram_range=(1, 2))),
                ('logreg', LogisticRegression())])

In [None]:
parameters_logreg = {
    'logreg__penalty': ['l1', 'l2'],
    'logreg__C': [100, 10, 1.0, 0.1, 0.01]
}

In [None]:
%%time

clf_logreg = GridSearchCV(pipeline_logreg, parameters_logreg, n_jobs=-1, verbose=5)
clf_logreg.fit(X_train.summaryReview, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


        nan 0.822             nan 0.75938667]


CPU times: user 15.6 s, sys: 19.8 s, total: 35.4 s
Wall time: 1min 4s


GridSearchCV(estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(max_df=0.18, min_df=9,
                                                        ngram_range=(1, 2))),
                                       ('logreg', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'logreg__C': [100, 10, 1.0, 0.1, 0.01],
                         'logreg__penalty': ['l1', 'l2']},
             verbose=5)

In [None]:
clf_logreg.best_params_

{'logreg__C': 1.0, 'logreg__penalty': 'l2'}

In [8]:
lr = LogisticRegression(C=1, penalty='l2')

lr.fit(X_train_dtm, y_train)

metrics.accuracy_score(y_test, lr.predict(X_test_dtm))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.87592

### Tune SVM model

In [None]:
%%time

svm_params = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01]}

sv = svm.SVC()

svm_clf = GridSearchCV(sv, svm_params, verbose=5, n_jobs=-1)
svm_clf.fit(X_train_dtm, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 20min 15s, sys: 254 ms, total: 20min 15s
Wall time: 2h 4min 11s


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01]},
             verbose=5)

In [None]:
svm_clf.best_params_

{'C': 100, 'gamma': 1}

In [9]:
sv = svm.SVC(C=100, gamma=1)

sv.fit(X_train_dtm, y_train)

SVC(C=100, gamma=1)

In [None]:
metrics.accuracy_score(y_test, sv.predict(X_test_dtm))

0.892

### Tune Naive Bayes model

In [None]:
nb = MultinomialNB()

nb.fit(X_train_dtm, y_train)

metrics.accuracy_score(y_test, nb.predict(X_test_dtm))

0.86344

In [None]:
nb_params = {
    'alpha': [1000,500,100,50,10,7,6,5,4,2,1,0.5,0.1,0.05,0.01,0.005,0.001]
}

clf_nb = GridSearchCV(nb, nb_params, verbose=5, n_jobs=-1)
clf_nb.fit(X_train_dtm, y_train)

Fitting 5 folds for each of 17 candidates, totalling 85 fits


GridSearchCV(estimator=MultinomialNB(), n_jobs=-1,
             param_grid={'alpha': [1000, 500, 100, 50, 10, 7, 6, 5, 4, 2, 1,
                                   0.5, 0.1, 0.05, 0.01, 0.005, 0.001]},
             verbose=5)

In [None]:
clf_nb.best_params_

{'alpha': 0.1}

In [10]:
nb = MultinomialNB(alpha=0.1)

nb.fit(X_train_dtm, y_train)

metrics.accuracy_score(y_test, nb.predict(X_test_dtm))

0.866

### Tune XGBoost model

In [None]:
%%time

xg_params = {'max_depth': range(7,17,2), 
             'min_child_weight': range(3,13,2)
            }

xg = XGBClassifier()

xg_clf = GridSearchCV(xg, xg_params, verbose=5, n_jobs=-1)
xg_clf.fit(X_train_dtm, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits




CPU times: user 2min 2s, sys: 337 ms, total: 2min 2s
Wall time: 58min 36s


GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameter

In [None]:
xg_clf.best_params_

{'max_depth': 15, 'min_child_weight': 7}

In [None]:
%%time

xg_params_updated = {'max_depth': range(15,23,2), 
             'min_child_weight': [7]
            }

xg = XGBClassifier()

xg_clf_updated = GridSearchCV(xg, xg_params_updated, verbose=5, n_jobs=-1)
xg_clf_updated.fit(X_train_dtm, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




CPU times: user 2min 13s, sys: 262 ms, total: 2min 14s
Wall time: 14min 40s


GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameter

In [None]:
xg_clf_updated.best_params_

{'max_depth': 17, 'min_child_weight': 7}

In [11]:
xg = XGBClassifier(max_depth=17, min_child_weight=7)

xg.fit(X_train_dtm, y_train)

metrics.accuracy_score(y_test, xg.predict(X_test_dtm))





0.86808

### Tuning Decision Tree model

In [None]:
%%time

dt_params = {'criterion': ['gini', 'entropy'], 
             'splitter': ['best', 'random'], 
             'max_depth': [2, 3, 4, 5, 10, 15, 20, 25, 30], 
             'min_samples_split': [2, 3, 5, 7, 9, 15, 30, 50, 100],
             'min_samples_leaf': [1, 2, 5, 10],
            }

dt = tree.DecisionTreeClassifier()

dt_clf = GridSearchCV(dt, dt_params, verbose=5, n_jobs=-1)
dt_clf.fit(X_train_dtm, y_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
CPU times: user 38.7 s, sys: 6.19 s, total: 44.9 s
Wall time: 39min 1s


GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 10, 15, 20, 25, 30],
                         'min_samples_leaf': [1, 2, 5, 10],
                         'min_samples_split': [2, 3, 5, 7, 9, 15, 30, 50, 100],
                         'splitter': ['best', 'random']},
             verbose=5)

In [None]:
dt_clf.best_params_

{'criterion': 'gini',
 'max_depth': 30,
 'min_samples_leaf': 5,
 'min_samples_split': 100,
 'splitter': 'random'}

In [12]:
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_leaf=5, min_samples_split=100, splitter='random')

dt.fit(X_train_dtm, y_train)

metrics.accuracy_score(y_test, dt.predict(X_test_dtm))

0.82752

### Tuning KNN model

In [None]:
%%time

knn_params = {'n_neighbors': np.arange(1,100,2)}

knn = KNeighborsClassifier()

knn_clf = GridSearchCV(knn, knn_params, verbose=5, n_jobs=-1)
knn_clf.fit(X_train_dtm, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
CPU times: user 1.7 s, sys: 469 ms, total: 2.17 s
Wall time: 7min 19s


GridSearchCV(estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33,
       35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
       69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99])},
             verbose=5)

In [None]:
knn_clf.best_params_

{'n_neighbors': 17}

In [13]:
knn = KNeighborsClassifier(n_neighbors=17)

knn.fit(X_train_dtm, y_train)

metrics.accuracy_score(y_test, knn.predict(X_test_dtm))

0.8212

# Stacking models

Stack the base models, and develop a meta learner to improve performance

In [14]:
base_models = [
    ('lr', lr),
    ('sv', sv),
    ('nb', nb),
    ('xg', xg),
#     ('dt', dt),
#     ('knn', knn)
]
meta_model = LogisticRegression(max_iter=1000)
# meta_model = XGBClassifier()
# cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=4222)

# stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=cv, n_jobs=-1, verbose=5)

In [15]:
%%time

stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, n_jobs=-1, verbose=5)
stacked_model.fit(X_train_dtm, y_train)

CPU times: user 266 ms, sys: 292 ms, total: 558 ms
Wall time: 35min 44s


StackingClassifier(estimators=[('lr', LogisticRegression(C=1)),
                               ('sv', SVC(C=100, gamma=1)),
                               ('nb', MultinomialNB(alpha=0.1)),
                               ('xg',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1, gamma=0,
                                              gpu_id=-1, importance_type='gain',
                                              interaction_constraints='',
                                              learning_rate=0.300000012,
                                              max_delta_step=0, max_depth=17,
                                              min_child_weight=7, missing=nan,
                                              monotone_constraints='()',
                           

In [18]:
stacked_model.score(X_test_dtm, y_test)

0.88704

In [14]:
%%time

stacked_model_svm = StackingClassifier(estimators=base_models, final_estimator=svm.SVC(), n_jobs=-1, verbose=5)
stacked_model_svm.fit(X_train_dtm, y_train)

CPU times: user 20.4 s, sys: 461 ms, total: 20.9 s
Wall time: 31min 58s


StackingClassifier(estimators=[('lr', LogisticRegression(C=1)),
                               ('sv', SVC(C=100, gamma=1)),
                               ('nb', MultinomialNB(alpha=0.1)),
                               ('xg',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1, gamma=0,
                                              gpu_id=-1, importance_type='gain',
                                              interaction_constraints='',
                                              learning_rate=0.300000012,
                                              max_delta_step=0, max_depth=17,
                                              min_child_weight=7, missing=nan,
                                              monotone_constraints='()',
                           

In [15]:
stacked_model_svm.score(X_test_dtm, y_test)

0.88704

In [19]:
def model_performance(models, model_names):
    #create empty df with col names
    df = pd.DataFrame(columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score', 'ROC AUC'])
    
    for n, model in enumerate(models):

        y_pred = model.predict(X_test_dtm)

        name = model_names[n]
        
        acc = metrics.accuracy_score(y_test, y_pred)
        prec = metrics.precision_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)
        roc_auc = metrics.roc_auc_score(y_test, y_pred)

        #append row to df
        df = df.append(
            {
                'Model' : name,
                'Accuracy': acc,
                'Precision': prec,
                'Recall': recall,
                'F1 score': f1,
                'ROC AUC': roc_auc
            }, ignore_index = True)
            
    return df.set_index('Model').transpose()

In [20]:
model_performance([lr, sv, nb, xg, dt, knn, stacked_model], ['lr', 'sv', 'nb', 'xg', 'dt', 'knn', 'stack'])

Model,lr,sv,nb,xg,dt,knn,stack
Accuracy,0.87592,0.88344,0.866,0.86808,0.82752,0.8212,0.8828
Precision,0.885764,0.898831,0.885737,0.888548,0.847499,0.844383,0.90608
Recall,0.960766,0.954139,0.945829,0.945093,0.942884,0.93773,0.943726
F1 score,0.921742,0.925659,0.914797,0.915949,0.892651,0.888612,0.92452
ROC AUC,0.78359,0.806505,0.779129,0.784274,0.70198,0.694391,0.8165
