# Importing Libraries

In [161]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
import sklearn

# Display helpers
from IPython.display import display_html

# Model Building
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# Metrics
from sklearn.metrics import fbeta_score

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Title - Metadata & Sentiment Features

In [103]:
# Load the pre-computed feature table; the path is relative to the notebook's working directory.
news = pd.read_csv("news_final.csv")
# Preview the first rows to check the columns that were loaded.
news.head()

Unnamed: 0,title,text,subject,date,fake,all_text,char_count,word_count,sent_count,capital_word_count,...,stopword_count.1,unique_word_count.1,punct_count.1,avg_wordlength.1,avg_sentlength.1,unique_vs_words.1,stopwords_vs_words.1,noun_count.1,adverb_count.1,sentiment_score.1
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,Donald Trump Sends Out Embarrassing New Year’...,2973,507,26,5,...,2,12,1,6.583333,12.0,1.0,0.166667,5,0,-0.7096
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,Drunk Bragging Trump Staffer Started Russian ...,1968,313,11,3,...,0,8,0,8.625,8.0,1.0,0.0,5,0,-0.34
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,Sheriff David Clarke Becomes An Internet Joke...,3688,595,25,42,...,0,15,0,6.0,15.0,1.0,0.0,7,0,-0.296
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,Trump Is So Obsessed He Even Has Obama’s Name...,2853,458,15,6,...,1,14,2,5.571429,14.0,1.0,0.071429,3,2,-0.3052
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,Pope Francis Just Called Out Donald Trump Dur...,2417,431,19,0,...,0,11,0,6.363636,11.0,1.0,0.0,5,1,0.0


In [104]:
# Binary target column.
y_title = news.loc[:, 'fake']
# Title-level features, selected by position (column 22 to the end).
# NOTE(review): presumably these are the '.1'-suffixed title columns; the slice skips
# column 21 -- confirm against news.columns that no title feature is dropped.
X_title = news.iloc[:, 22:]

## Train Test Split 

In [105]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the split reproducible.
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(
    X_title,
    y_title,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

## Model Building

### 1.1 Logistic Regression

In [107]:
%%time
lr_clf_title = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=1))
])

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=1)
lr_cv_results_title = cross_validate(lr_clf_title, X_train_title, y_train_title, cv=cv, return_train_score=True)

CPU times: user 4.33 s, sys: 183 ms, total: 4.51 s
Wall time: 1.88 s


In [108]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
lr_cv_results_title

{'fit_time': array([0.19818497, 0.15242815, 0.16256189, 0.26305985, 0.33280683,
        0.09227586, 0.124336  , 0.10667014, 0.17816925, 0.10858631]),
 'score_time': array([0.00529027, 0.00230789, 0.0067451 , 0.00626421, 0.00278187,
        0.00219011, 0.002738  , 0.00507784, 0.00373602, 0.003335  ]),
 'test_score': array([0.94002586, 0.94794698, 0.94406725, 0.94148076, 0.94180407,
        0.94358228, 0.94115745, 0.94228904, 0.94083414, 0.9442289 ]),
 'train_score': array([0.9434519 , 0.94135004, 0.94232013, 0.94320938, 0.94288601,
        0.94211803, 0.94316896, 0.94292643, 0.94292643, 0.94236055])}

In [109]:
# Average every cross_validate array over the splits before reporting.
lr_title_means = {metric: values.mean() for metric, values in lr_cv_results_title.items()}
print(f'Time for fitting classifier on the train set: {lr_title_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {lr_title_means["score_time"]}')
print(f'Accuracy of Train: {lr_title_means["train_score"]}')
print(f'Accuracy of Validation: {lr_title_means["test_score"]}')

Time for fitting classifier on the train set: 0.17190792560577392
Time for scoring classifier on the validation set: 0.004046630859375
Accuracy of Train: 0.9426717865804365
Accuracy of Validation: 0.9427416747494343


### 1.2 Naive Bayes

In [113]:
%%time
gnb_clf_title = Pipeline([
    ('scale', Normalizer()),
    ('clf', GaussianNB())
])

gnb_cv_results_title = cross_validate(gnb_clf_title, X_train_title, y_train_title, cv=cv, return_train_score=True)

CPU times: user 234 ms, sys: 37.3 ms, total: 272 ms
Wall time: 282 ms


In [114]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
gnb_cv_results_title

{'fit_time': array([0.0185039 , 0.01550007, 0.01448488, 0.01420498, 0.01976585,
        0.01460099, 0.01365304, 0.01381922, 0.013942  , 0.0138061 ]),
 'score_time': array([0.00384998, 0.00280905, 0.00282192, 0.00310493, 0.00515103,
        0.00278401, 0.00283098, 0.00281978, 0.00280094, 0.00280285]),
 'test_score': array([0.92935661, 0.93598448, 0.93404462, 0.92870999, 0.92709344,
        0.93452958, 0.93178144, 0.93129648, 0.93388296, 0.93614614]),
 'train_score': array([0.93298302, 0.93168957, 0.93136621, 0.9327405 , 0.92914309,
        0.93168957, 0.93221504, 0.93367017, 0.93229588, 0.93080032])}

In [115]:
# Average every cross_validate array over the splits before reporting.
gnb_title_means = {metric: values.mean() for metric, values in gnb_cv_results_title.items()}
print(f'Time for fitting classifier on the train set: {gnb_title_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {gnb_title_means["score_time"]}')
print(f'Accuracy of Train: {gnb_title_means["train_score"]}')
print(f'Accuracy of Validation: {gnb_title_means["test_score"]}')

Time for fitting classifier on the train set: 0.01522810459136963
Time for scoring classifier on the validation set: 0.0031775474548339845
Accuracy of Train: 0.9318593371059013
Accuracy of Validation: 0.9322825735531847


### 1.3 SVM

In [116]:
%%time

svm_clf_title = Pipeline([
    ('scale', StandardScaler()),
    ('clf', SVC(random_state=1))
])

svm_cv_results_title = cross_validate(svm_clf_title, X_train_title, y_train_title, cv=cv, return_train_score=True)

CPU times: user 2min 41s, sys: 3.63 s, total: 2min 44s
Wall time: 2min 51s


In [117]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
svm_cv_results_title

{'fit_time': array([5.4379921 , 5.67425704, 5.26771498, 4.96598077, 5.93168688,
        5.40598679, 4.94716382, 5.27366686, 6.62897682, 5.41495299]),
 'score_time': array([2.8310039 , 2.37703395, 2.3134501 , 2.29794216, 2.37787485,
        2.40980601, 2.25872207, 2.27631903, 2.43772912, 2.26801991]),
 'test_score': array([0.94746201, 0.95457485, 0.95489816, 0.95376657, 0.95134174,
        0.95279664, 0.9500485 , 0.95085677, 0.95279664, 0.9547365 ]),
 'train_score': array([0.95533549, 0.95408246, 0.95347615, 0.95363783, 0.95468876,
        0.95412288, 0.95464834, 0.95485044, 0.95444624, 0.95359741])}

In [118]:
# Average every cross_validate array over the splits before reporting.
svm_title_means = {metric: values.mean() for metric, values in svm_cv_results_title.items()}
print(f'Time for fitting classifier on the train set: {svm_title_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {svm_title_means["score_time"]}')
print(f'Accuracy of Train: {svm_title_means["train_score"]}')
print(f'Accuracy of Validation: {svm_title_means["test_score"]}')

Time for fitting classifier on the train set: 5.494837903976441
Time for scoring classifier on the validation set: 2.3847901105880736
Accuracy of Train: 0.9542886014551334
Accuracy of Validation: 0.9523278370514063


### 1.4 Random Forest

In [119]:
%%time

rf_clf_title = Pipeline([
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=1))
])

rf_cv_results_title = cross_validate(rf_clf_title, X_train_title, y_train_title, cv=cv, return_train_score=True)

CPU times: user 20.3 s, sys: 629 ms, total: 20.9 s
Wall time: 22.3 s


In [120]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
rf_cv_results_title

{'fit_time': array([1.86879277, 1.55097795, 1.70199513, 2.24268794, 1.8272028 ,
        1.92721915, 1.861022  , 1.67886686, 1.77366495, 1.53000903]),
 'score_time': array([0.09291911, 0.08585405, 0.08606267, 0.09484386, 0.12158513,
        0.11577702, 0.08738303, 0.09752011, 0.08693719, 0.08548093]),
 'test_score': array([0.94374394, 0.94827029, 0.94616877, 0.94697704, 0.9409958 ,
        0.94746201, 0.94390559, 0.94390559, 0.94633042, 0.94616877]),
 'train_score': array([0.99882781, 0.99890865, 0.99907033, 0.99882781, 0.99894907,
        0.99890865, 0.99866613, 0.99894907, 0.99894907, 0.99882781])}

In [121]:
# Average every cross_validate array over the splits before reporting.
rf_title_means = {metric: values.mean() for metric, values in rf_cv_results_title.items()}
print(f'Time for fitting classifier on the train set: {rf_title_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {rf_title_means["score_time"]}')
print(f'Accuracy of Train: {rf_title_means["train_score"]}')
print(f'Accuracy of Validation: {rf_title_means["test_score"]}')

Time for fitting classifier on the train set: 1.7962438583374023
Time for scoring classifier on the validation set: 0.09543631076812745
Accuracy of Train: 0.9988884397736459
Accuracy of Validation: 0.945392822502425


### 1.5 XGBoost

In [122]:
%%time

xgboost_clf_title = Pipeline([
    ('scale', StandardScaler()),
    ('clf', XGBClassifier(random_state=1))
])

xgboost_cv_results_title = cross_validate(xgboost_clf_title, X_train_title, y_train_title, cv=cv, return_train_score=True)

CPU times: user 10.9 s, sys: 219 ms, total: 11.2 s
Wall time: 11.5 s


In [123]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
xgboost_cv_results_title

{'fit_time': array([1.13135266, 1.09480977, 1.05672097, 1.00600076, 1.0791533 ,
        1.10272479, 1.03688908, 1.01619005, 1.11051106, 1.00568104]),
 'score_time': array([0.01792622, 0.01956701, 0.01663494, 0.01667833, 0.01923513,
        0.01669621, 0.01694012, 0.01639891, 0.01827383, 0.01706004]),
 'test_score': array([0.94519884, 0.95085677, 0.95118008, 0.94988684, 0.94600711,
        0.95101843, 0.94843194, 0.94681539, 0.94827029, 0.95053346]),
 'train_score': array([0.9525869 , 0.95133387, 0.95161681, 0.95226354, 0.95323363,
        0.95153597, 0.95291027, 0.95181892, 0.95157639, 0.95185934])}

In [124]:
# Average every cross_validate array over the splits before reporting.
xgboost_title_means = {metric: values.mean() for metric, values in xgboost_cv_results_title.items()}
print(f'Time for fitting classifier on the train set: {xgboost_title_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {xgboost_title_means["score_time"]}')
print(f'Accuracy of Train: {xgboost_title_means["train_score"]}')
print(f'Accuracy of Validation: {xgboost_title_means["test_score"]}')

Time for fitting classifier on the train set: 1.064003348350525
Time for scoring classifier on the validation set: 0.01754107475280762
Accuracy of Train: 0.9520735650767987
Accuracy of Validation: 0.9488199159392176


## Storing Title Results

In [125]:
# Cross-validation output of every title model, keyed by the row label used in the summary.
cv_runs_title = {
    'Logistic Regression': lr_cv_results_title,
    'Gaussian Naive Bayes': gnb_cv_results_title,
    'Support Vector Machine': svm_cv_results_title,
    'Random Forest': rf_cv_results_title,
    'XGBoost': xgboost_cv_results_title,
}

# One row per model; each cell holds the mean over the cross-validation splits.
results_title = pd.DataFrame(index=list(cv_runs_title))
for model_name, cv_run in cv_runs_title.items():
    results_title.loc[model_name, 'train_acc'] = cv_run["train_score"].mean()
    results_title.loc[model_name, 'val_acc'] = cv_run["test_score"].mean()
    results_title.loc[model_name, 'fit_time'] = cv_run["fit_time"].mean()

In [126]:
# Display the per-model summary (mean train accuracy, validation accuracy and fit time) for the title features.
results_title

Unnamed: 0,train_acc,val_acc,fit_time
Logistic Regression,0.942672,0.942742,0.171908
Gaussian Naive Bayes,0.931859,0.932283,0.015228
Support Vector Machine,0.954289,0.952328,5.494838
Random Forest,0.998888,0.945393,1.796244
XGBoost,0.952074,0.94882,1.064003


# 2. All Text - Metadata & Sentiment Features

In [127]:
# Binary target column.
y = news.loc[:, 'fake']
# Article-level features, selected by position (columns 6 through 20).
# NOTE(review): positional slicing breaks silently if the CSV column order changes;
# confirm the range against news.columns.
X = news.iloc[:, 6:21]

In [128]:
# Preview the selected feature columns.
X.head()

Unnamed: 0,char_count,word_count,sent_count,capital_word_count,quoted_word_count,stopword_count,unique_word_count,punct_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words,noun_count,adverb_count,sentiment_score
0,2973,507,26,5,0,195,282,122,5.863905,19.5,0.556213,0.384615,116,36,-0.9139
1,1968,313,11,3,0,120,209,39,6.28754,28.454545,0.667732,0.383387,94,10,-0.7685
2,3688,595,25,42,0,220,344,148,6.198319,23.8,0.578151,0.369748,167,20,-0.9955
3,2853,458,15,6,0,164,278,120,6.229258,30.533333,0.606987,0.358079,143,24,-0.9269
4,2417,431,19,0,0,197,244,40,5.607889,22.684211,0.566125,0.457077,79,16,0.3134


## Train Test Split

In [129]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

## Model Building

### 2.1 Logistic Regression

In [130]:
%%time 

lr_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=1))
])

lr_cv_results = cross_validate(lr_clf, X_train, y_train, cv=cv, return_train_score=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 6.56 s, sys: 219 ms, total: 6.78 s
Wall time: 2.03 s


In [131]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
lr_cv_results

{'fit_time': array([0.19882393, 0.1548872 , 0.22434378, 0.19513083, 0.187222  ,
        0.26130104, 0.16943002, 0.16552305, 0.17698097, 0.17061496]),
 'score_time': array([0.00555301, 0.00338888, 0.00374508, 0.00244403, 0.00262189,
        0.00405407, 0.0030098 , 0.00254273, 0.00236011, 0.00219703]),
 'test_score': array([0.86129971, 0.85693501, 0.8533786 , 0.86663434, 0.86420951,
        0.8580666 , 0.85984481, 0.86744261, 0.85305529, 0.86372454]),
 'train_score': array([0.86046888, 0.85877122, 0.86176233, 0.8589329 , 0.85856912,
        0.86046888, 0.85957963, 0.85711399, 0.86228779, 0.85877122])}

In [132]:
# Average every cross_validate array over the splits before reporting.
lr_means = {metric: values.mean() for metric, values in lr_cv_results.items()}
print(f'Time for fitting classifier on the train set: {lr_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {lr_means["score_time"]}')
print(f'Accuracy of Train: {lr_means["train_score"]}')
print(f'Accuracy of Validation: {lr_means["test_score"]}')

Time for fitting classifier on the train set: 0.19042577743530273
Time for scoring classifier on the validation set: 0.0031916618347167967
Accuracy of Train: 0.8596725949878741
Accuracy of Validation: 0.8604591011962496


### 2.2 Naive Bayes

In [133]:
%%time

gnb_clf = Pipeline([
    ('scale', Normalizer()),
    ('clf', GaussianNB())
])

gnb_cv_results = cross_validate(gnb_clf, X_train, y_train, cv=cv, return_train_score=True)

In [134]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
gnb_cv_results

{'fit_time': array([0.01579094, 0.01647902, 0.01483417, 0.01516294, 0.01523423,
        0.01939082, 0.02032304, 0.02065516, 0.02066493, 0.02154994]),
 'score_time': array([0.00309801, 0.00283813, 0.00304008, 0.00274181, 0.00557375,
        0.00451994, 0.00408816, 0.00418901, 0.00599885, 0.00462389]),
 'test_score': array([0.69915939, 0.70400905, 0.68913676, 0.68461041, 0.6933398 ,
        0.68962173, 0.69689622, 0.6962496 , 0.6871969 , 0.69608794]),
 'train_score': array([0.69122878, 0.70145513, 0.68823767, 0.68864188, 0.68569119,
        0.69409863, 0.69090542, 0.69664511, 0.69353274, 0.68755053])}

In [135]:
# Average every cross_validate array over the splits before reporting.
gnb_means = {metric: values.mean() for metric, values in gnb_cv_results.items()}
print(f'Time for fitting classifier on the train set: {gnb_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {gnb_means["score_time"]}')
print(f'Accuracy of Train: {gnb_means["train_score"]}')
print(f'Accuracy of Validation: {gnb_means["test_score"]}')

Time for fitting classifier on the train set: 0.01800851821899414
Time for scoring classifier on the validation set: 0.004071164131164551
Accuracy of Train: 0.6917987065481003
Accuracy of Validation: 0.6936307791787908


### 2.3 SVM

In [137]:
%%time

svm_clf = Pipeline([
    ('scale', StandardScaler()),
    ('clf', SVC(random_state=1))
])

svm_cv_results = cross_validate(svm_clf, X_train, y_train, cv=cv, return_train_score=True)

CPU times: user 5min 18s, sys: 5.4 s, total: 5min 23s
Wall time: 5min 33s


In [138]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
svm_cv_results

{'fit_time': array([10.93141985, 11.12288809, 10.55821705, 10.92484903,  9.91726708,
         9.72398472, 10.40651608,  9.47267985, 11.34606314,  9.85449076]),
 'score_time': array([4.72791195, 4.61257195, 4.35795712, 5.11404991, 4.34168077,
        4.59718513, 4.25743198, 4.2073741 , 4.33628392, 4.55799222]),
 'test_score': array([0.90413838, 0.90171355, 0.90268348, 0.90656321, 0.90753314,
        0.90462334, 0.90413838, 0.909473  , 0.89896541, 0.91189783]),
 'train_score': array([0.90909458, 0.90901374, 0.90788197, 0.90743735, 0.90800323,
        0.90763945, 0.90747777, 0.90739693, 0.90978173, 0.90541633])}

In [139]:
# Average every cross_validate array over the splits before reporting.
svm_means = {metric: values.mean() for metric, values in svm_cv_results.items()}
print(f'Time for fitting classifier on the train set: {svm_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {svm_means["score_time"]}')
print(f'Accuracy of Train: {svm_means["train_score"]}')
print(f'Accuracy of Validation: {svm_means["test_score"]}')

Time for fitting classifier on the train set: 10.425837564468384
Time for scoring classifier on the validation set: 4.511043906211853
Accuracy of Train: 0.907914308811641
Accuracy of Validation: 0.9051729712253476


### 2.4 Random Forest

In [140]:
%%time

rf_clf = Pipeline([
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=1))
])

rf_cv_results = cross_validate(rf_clf, X_train, y_train, cv=cv, return_train_score=True)

CPU times: user 48.4 s, sys: 1.04 s, total: 49.4 s
Wall time: 51.4 s


In [141]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
rf_cv_results

{'fit_time': array([4.63230085, 4.35264325, 4.43624878, 5.25108194, 5.13699007,
        4.96061206, 4.58648562, 4.197474  , 4.17424273, 4.39387488]),
 'score_time': array([0.12995505, 0.10323787, 0.11905313, 0.14497495, 0.16381001,
        0.14874196, 0.10498619, 0.09998298, 0.10900521, 0.09955287]),
 'test_score': array([0.90139024, 0.90058196, 0.90203686, 0.90446169, 0.904785  ,
        0.90397672, 0.90543162, 0.90575493, 0.89993534, 0.91319108]),
 'train_score': array([1.        , 0.99995958, 0.99995958, 1.        , 0.99995958,
        0.99995958, 1.        , 1.        , 1.        , 1.        ])}

In [142]:
# Average every cross_validate array over the splits before reporting.
rf_means = {metric: values.mean() for metric, values in rf_cv_results.items()}
print(f'Time for fitting classifier on the train set: {rf_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {rf_means["score_time"]}')
print(f'Accuracy of Train: {rf_means["train_score"]}')
print(f'Accuracy of Validation: {rf_means["test_score"]}')

Time for fitting classifier on the train set: 4.612195420265198
Time for scoring classifier on the validation set: 0.12233002185821533
Accuracy of Train: 0.9999838318512531
Accuracy of Validation: 0.9041545425153572


### 2.5 XGBoost

In [143]:
%%time

xgboost_clf = Pipeline([
    ('scale', StandardScaler()),
    ('clf', XGBClassifier(random_state=1))
])

xgboost_cv_results = cross_validate(xgboost_clf, X_train, y_train, cv=cv, return_train_score=True)

CPU times: user 15.2 s, sys: 287 ms, total: 15.5 s
Wall time: 15.9 s


In [144]:
# Show the raw cross_validate output: per-split fit/score times and train/validation accuracies.
xgboost_cv_results

{'fit_time': array([1.579983  , 1.50591302, 1.44315314, 1.435112  , 1.51270604,
        1.47728205, 1.54843497, 1.49269199, 1.46346712, 1.47735286]),
 'score_time': array([0.020926  , 0.01819706, 0.01838613, 0.0181911 , 0.01838422,
        0.01802874, 0.01831388, 0.01866221, 0.01803184, 0.01886487]),
 'test_score': array([0.89104429, 0.89185257, 0.89395409, 0.89945037, 0.8942774 ,
        0.89460071, 0.89476237, 0.90074361, 0.89314581, 0.90607824]),
 'train_score': array([0.90626516, 0.90505255, 0.90412288, 0.90307195, 0.90493129,
        0.90323363, 0.90388036, 0.90254648, 0.9041633 , 0.90194018])}

In [145]:
# Average every cross_validate array over the splits before reporting.
xgboost_means = {metric: values.mean() for metric, values in xgboost_cv_results.items()}
print(f'Time for fitting classifier on the train set: {xgboost_means["fit_time"]}')
print(f'Time for scoring classifier on the validation set: {xgboost_means["score_time"]}')
print(f'Accuracy of Train: {xgboost_means["train_score"]}')
print(f'Accuracy of Validation: {xgboost_means["test_score"]}')

Time for fitting classifier on the train set: 1.493609619140625
Time for scoring classifier on the validation set: 0.018598604202270507
Accuracy of Train: 0.9039207760711401
Accuracy of Validation: 0.8959909473003556


## Storing All Text Results

In [146]:
# Cross-validation output of every all-text model, keyed by the row label used in the summary.
cv_runs_all_text = {
    'Logistic Regression': lr_cv_results,
    'Gaussian Naive Bayes': gnb_cv_results,
    'Support Vector Machine': svm_cv_results,
    'Random Forest': rf_cv_results,
    'XGBoost': xgboost_cv_results,
}

# One row per model; each cell holds the mean over the cross-validation splits.
results_all_text = pd.DataFrame(index=list(cv_runs_all_text))
for model_name, cv_run in cv_runs_all_text.items():
    results_all_text.loc[model_name, 'train_acc'] = cv_run["train_score"].mean()
    results_all_text.loc[model_name, 'val_acc'] = cv_run["test_score"].mean()
    results_all_text.loc[model_name, 'fit_time'] = cv_run["fit_time"].mean()

In [147]:
# Display the per-model summary (mean train accuracy, validation accuracy and fit time) for the all-text features.
results_all_text

Unnamed: 0,train_acc,val_acc,fit_time
Logistic Regression,0.859673,0.860459,0.190426
Gaussian Naive Bayes,0.691799,0.693631,0.018009
Support Vector Machine,0.907914,0.905173,10.425838
Random Forest,0.999984,0.904155,4.612195
XGBoost,0.903921,0.895991,1.49361


# 3. Comparing Validation Accuracy and Fit Time

In [148]:
# Render the two validation summaries side by side: each Styler is emitted as an inline
# HTML table, and the concatenated markup is displayed as raw HTML.
results_title_style = (
    results_title.style
    .set_table_attributes("style='display:inline; margin-right:20px;'")
    .set_caption("Title Only")
)
results_all_text_style = (
    results_all_text.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Title & Text")
)

display_html(results_title_style._repr_html_() + results_all_text_style._repr_html_(), raw=True)

Unnamed: 0,train_acc,val_acc,fit_time
Logistic Regression,0.942672,0.942742,0.171908
Gaussian Naive Bayes,0.931859,0.932283,0.015228
Support Vector Machine,0.954289,0.952328,5.494838
Random Forest,0.998888,0.945393,1.796244
XGBoost,0.952074,0.94882,1.064003

Unnamed: 0,train_acc,val_acc,fit_time
Logistic Regression,0.859673,0.860459,0.190426
Gaussian Naive Bayes,0.691799,0.693631,0.018009
Support Vector Machine,0.907914,0.905173,10.425838
Random Forest,0.999984,0.904155,4.612195
XGBoost,0.903921,0.895991,1.49361


1. SVM takes the longest to fit across both experiments
2. For title only, although SVM performed the best, XGBoost wasn't far off in terms of validation accuracy and took roughly 5 times less time to fit (about 1.06 s versus 5.49 s per split)
3. For all text, although SVM performed the best, its long fitting time dissuades us from using it; XGBoost or even Random Forest would be a better choice
4. On average, validation accuracy is higher for the title features than for the all-text features, which suggests that the title alone is enough to predict whether an article is real or fake

# 4. Title - Test Accuracy

In [149]:
# Empty results table with one row per model; the metric columns are filled in by the next cell.
results_title_final = pd.DataFrame(
    index=[
        'Logistic Regression',
        'Gaussian Naive Bayes',
        'Support Vector Machine',
        'Random Forest',
        'XGBoost',
    ]
)

In [162]:
def fit_and_record(pipeline, row, results, X_fit, y_fit, X_holdout, y_holdout):
    """Refit an unfitted copy of ``pipeline`` and record its hold-out metrics.

    Defined inside this cell so the cell does not depend on helpers from other cells.

    Parameters
    ----------
    pipeline : estimator
        Pipeline to copy with ``sklearn.base.clone``; the object passed in is left unfitted.
    row : str
        Row label in ``results`` that receives the metrics.
    results : pandas.DataFrame
        Table updated in place with 'train_acc', 'test_acc', 'F1-Weighted' and 'F2-Weighted'.
    X_fit, y_fit
        Data the copy is fitted on; also used for the training accuracy.
    X_holdout, y_holdout
        Held-out data used for the test accuracy and the weighted F-beta scores.

    Returns
    -------
    tuple
        ``(fitted_pipeline, train_acc, test_acc, y_pred)`` where ``y_pred`` holds the
        predictions for ``X_holdout``.
    """
    fitted = sklearn.base.clone(pipeline)
    fitted.fit(X_fit, y_fit)
    train_acc = fitted.score(X_fit, y_fit)
    test_acc = fitted.score(X_holdout, y_holdout)
    y_pred = fitted.predict(X_holdout)
    results.loc[row, 'train_acc'] = train_acc
    results.loc[row, 'test_acc'] = test_acc
    results.loc[row, 'F1-Weighted'] = fbeta_score(y_holdout, y_pred, beta=1, average="weighted")
    results.loc[row, 'F2-Weighted'] = fbeta_score(y_holdout, y_pred, beta=2, average="weighted")
    return fitted, train_acc, test_acc, y_pred


# Train/test data shared by every title model.
title_split = (X_train_title, y_train_title, X_test_title, y_test_title)

# Logistic Regression
lr_clf_title, lr_clf_title_train_acc, lr_clf_title_test_acc, lr_y_pred_title = fit_and_record(
    lr_clf_title, 'Logistic Regression', results_title_final, *title_split)

# Naive Bayes
gnb_clf_title, gnb_clf_title_train_acc, gnb_clf_title_test_acc, gnb_y_pred_title = fit_and_record(
    gnb_clf_title, 'Gaussian Naive Bayes', results_title_final, *title_split)

# SVM
svm_clf_title, svm_clf_title_train_acc, svm_clf_title_test_acc, svm_y_pred_title = fit_and_record(
    svm_clf_title, 'Support Vector Machine', results_title_final, *title_split)

# Random Forest
rf_clf_title, rf_clf_title_train_acc, rf_clf_title_test_acc, rf_y_pred_title = fit_and_record(
    rf_clf_title, 'Random Forest', results_title_final, *title_split)

# XGBoost
xgboost_clf_title, xgboost_clf_title_train_acc, xgboost_clf_title_test_acc, xgboost_y_pred_title = fit_and_record(
    xgboost_clf_title, 'XGBoost', results_title_final, *title_split)

results_title_final


Unnamed: 0,train_acc,test_acc,F1-Weighted,F2-Weighted
Logistic Regression,0.942896,0.945292,0.945169,0.945144
Gaussian Naive Bayes,0.931773,0.930031,0.929666,0.929516
Support Vector Machine,0.954116,0.95344,0.953305,0.953235
Random Forest,0.998739,0.945163,0.945058,0.945046
XGBoost,0.951303,0.949302,0.949171,0.949125


# 5. All Text - Test Accuracy

In [163]:
# Empty results table with one row per model; the metric columns are filled in by the next cell.
results_all_final = pd.DataFrame(
    index=[
        'Logistic Regression',
        'Gaussian Naive Bayes',
        'Support Vector Machine',
        'Random Forest',
        'XGBoost',
    ]
)

In [164]:
# Linear Regression
lr_clf = sklearn.base.clone(lr_clf)
lr_clf.fit(X_train, y_train)
lr_all_train_acc = lr_clf.score(X_train, y_train)
lr_all_test_acc = lr_clf.score(X_test, y_test)
results_all_final.loc['Logistic Regression', 'train_acc'] = lr_all_train_acc
results_all_final.loc['Logistic Regression', 'test_acc'] = lr_all_test_acc
lr_y_pred = lr_clf.predict(X_test)
results_all_final.loc['Logistic Regression', 'F1-Weighted'] = fbeta_score(y_test, lr_y_pred, beta=1, average="weighted")
results_all_final.loc['Logistic Regression', 'F2-Weighted'] = fbeta_score(y_test, lr_y_pred, beta=2, average="weighted")

# Naive Bayes
gnb_clf = sklearn.base.clone(gnb_clf)
gnb_clf.fit(X_train, y_train)
gnb_all_train_acc = gnb_clf.score(X_train, y_train)
gnb_all_test_acc = gnb_clf.score(X_test, y_test)
results_all_final.loc['Gaussian Naive Bayes', 'train_acc'] = gnb_all_train_acc
results_all_final.loc['Gaussian Naive Bayes', 'test_acc'] = gnb_all_test_acc
gnb_y_pred = gnb_clf.predict(X_test)
results_all_final.loc['Gaussian Naive Bayes', 'F1-Weighted'] = fbeta_score(y_test, gnb_y_pred, beta=1, average="weighted")
results_all_final.loc['Gaussian Naive Bayes', 'F2-Weighted'] = fbeta_score(y_test, gnb_y_pred, beta=2, average="weighted")

# SVM
svm_clf = sklearn.base.clone(svm_clf)
svm_clf.fit(X_train, y_train)
svm_all_train_acc = svm_clf.score(X_train, y_train)
svm_all_test_acc = svm_clf.score(X_test, y_test)
results_all_final.loc['Support Vector Machine', 'train_acc'] = svm_all_train_acc
results_all_final.loc['Support Vector Machine', 'test_acc'] = svm_all_test_acc
svm_y_pred = svm_clf.predict(X_test)
results_all_final.loc['Support Vector Machine', 'F1-Weighted'] = fbeta_score(y_test, svm_y_pred, beta=1, average="weighted")
results_all_final.loc['Support Vector Machine', 'F2-Weighted'] = fbeta_score(y_test, svm_y_pred, beta=2, average="weighted")

# Random Forest
rf_clf = sklearn.base.clone(rf_clf)
rf_clf.fit(X_train, y_train)
rf_all_train_acc = rf_clf.score(X_train, y_train)
rf_all_test_acc = rf_clf.score(X_test, y_test)
results_all_final.loc['Random Forest', 'train_acc'] = rf_all_train_acc
results_all_final.loc['Random Forest', 'test_acc'] = rf_all_test_acc
rf_y_pred = rf_clf.predict(X_test)
results_all_final.loc['Random Forest', 'F1-Weighted'] = fbeta_score(y_test, rf_y_pred, beta=1, average="weighted")
results_all_final.loc['Random Forest', 'F2-Weighted'] = fbeta_score(y_test, rf_y_pred, beta=2, average="weighted")

# XGBoost
# clone() gives an unfitted estimator with the same settings; fit it on the current split.
xgboost_clf = sklearn.base.clone(xgboost_clf).fit(X_train, y_train)
xgboost_y_pred = xgboost_clf.predict(X_test)

# score() on each split is recorded under train_acc / test_acc.
xgboost_all_train_acc = xgboost_clf.score(X_train, y_train)
xgboost_all_test_acc = xgboost_clf.score(X_test, y_test)
results_all_final.loc['XGBoost', 'train_acc'] = xgboost_all_train_acc
results_all_final.loc['XGBoost', 'test_acc'] = xgboost_all_test_acc

# Weighted-average F-beta (beta = 1 and beta = 2) of the test-set predictions.
results_all_final.loc['XGBoost', 'F1-Weighted'] = fbeta_score(
    y_test, xgboost_y_pred, beta=1, average="weighted"
)
results_all_final.loc['XGBoost', 'F2-Weighted'] = fbeta_score(
    y_test, xgboost_y_pred, beta=2, average="weighted"
)

# Last expression of the cell: render the full comparison table.
results_all_final

Unnamed: 0,train_acc,test_acc,F1-Weighted,F2-Weighted
Logistic Regression,0.859891,0.850362,0.849962,0.850085
Gaussian Naive Bayes,0.689614,0.696198,0.672898,0.678883
Support Vector Machine,0.908491,0.896793,0.896624,0.896671
Random Forest,1.0,0.900802,0.900746,0.900771
XGBoost,0.903673,0.886834,0.886547,0.886607


In [165]:
# Render both score tables on one row: each Styler gets an inline CSS display
# attribute and a caption, then the two HTML fragments are concatenated.
# NOTE(review): display_html is not among the imports in the first cell —
# presumably `from IPython.display import display_html` runs elsewhere; confirm.
results_title_style = (
    results_title_final.style
    .set_table_attributes("style='display:inline; margin-right:20px;'")
    .set_caption("Title Only")
)
results_all_text_style = (
    results_all_final.style
    .set_table_attributes("style='display:inline'")
    .set_caption("Title & Text")
)

display_html(
    "".join(
        styled._repr_html_()
        for styled in (results_title_style, results_all_text_style)
    ),
    raw=True,
)

Unnamed: 0,train_acc,test_acc,F1-Weighted,F2-Weighted
Logistic Regression,0.942896,0.945292,0.945169,0.945144
Gaussian Naive Bayes,0.931773,0.930031,0.929666,0.929516
Support Vector Machine,0.954116,0.95344,0.953305,0.953235
Random Forest,0.998739,0.945163,0.945058,0.945046
XGBoost,0.951303,0.949302,0.949171,0.949125

Unnamed: 0,train_acc,test_acc,F1-Weighted,F2-Weighted
Logistic Regression,0.859891,0.850362,0.849962,0.850085
Gaussian Naive Bayes,0.689614,0.696198,0.672898,0.678883
Support Vector Machine,0.908491,0.896793,0.896624,0.896671
Random Forest,1.0,0.900802,0.900746,0.900771
XGBoost,0.903673,0.886834,0.886547,0.886607


# 6. TF-IDF

## 6.1 All Text

In [55]:
# Input is the `all_text` column; target is the `fake` label column.
X, y = news['all_text'], news['fake']

In [56]:
# Preview the first rows of the `all_text` Series selected above.
X.head()

0     Donald Trump Sends Out Embarrassing New Year’...
1     Drunk Bragging Trump Staffer Started Russian ...
2     Sheriff David Clarke Becomes An Internet Joke...
3     Trump Is So Obsessed He Even Has Obama’s Name...
4     Pope Francis Just Called Out Donald Trump Dur...
Name: all_text, dtype: object

### 6.1.1 Train Test Split

In [57]:
# Hold out 20% of the rows for testing; shuffling uses a fixed seed.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the classifier
# cells earlier in the notebook also read. From here on they hold raw text, so
# re-running those earlier cells after this one would feed them strings.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

### 6.1.2 Transforming All Text to Vectors

In [58]:
# TF-IDF with default settings. The vectorizer is fitted on the training split
# only; the test split is transformed with that already-fitted vectorizer, so
# nothing is learned from the test documents.
vectorization = TfidfVectorizer()
X_train_vec = vectorization.fit_transform(X_train)
X_test_vec = vectorization.transform(X_test)

### 6.1.3 Model Building

In [59]:
# Collects scores for the all-text models: the cells below store
# [train score, test score] under a short model key ('LR', 'NB', ...).
nlp_results = {}

#### 6.1.3.1 Logistic Regression

In [61]:
# Logistic regression with default settings on the TF-IDF features.
LR = LogisticRegression().fit(X_train_vec, y_train)

# Stored as [train score, test score].
nlp_results['LR'] = [
    LR.score(features, labels)
    for features, labels in ((X_train_vec, y_train), (X_test_vec, y_test))
]

#### 6.1.3.2 Naive Bayes

In [62]:
# Multinomial Naive Bayes with default settings on the TF-IDF features.
NB = MultinomialNB()
NB.fit(X_train_vec, y_train)

# Stored as [train score, test score].
nlp_results['NB'] = [
    NB.score(features, labels)
    for features, labels in ((X_train_vec, y_train), (X_test_vec, y_test))
]

#### 6.1.3.3 SVM

In [63]:
# RBF-kernel support vector classifier on the TF-IDF features.
SVM = SVC(kernel='rbf', gamma='scale', random_state=1).fit(X_train_vec, y_train)

# Stored as [train score, test score].
nlp_results['SVM'] = [
    SVM.score(features, labels)
    for features, labels in ((X_train_vec, y_train), (X_test_vec, y_test))
]

#### 6.1.3.4 Random Forest

In [64]:
# Random forest (seeded) on the TF-IDF features.
RFC = RandomForestClassifier(random_state=1)
RFC.fit(X_train_vec, y_train)

# Stored as [train score, test score].
nlp_results['RFC'] = [RFC.score(X_train_vec, y_train), RFC.score(X_test_vec, y_test)]

# Display the scores collected so far. The saved output of this cell shows the
# dictionary, but without a trailing expression a re-run would print nothing.
nlp_results

{'LR': [0.9884563150746944, 0.9811174340403518], 'NB': [0.9457737825777662, 0.9372736678737713], 'SVM': [0.99899760719136, 0.987325400931195], 'RFC': [1.0, 0.9714174857734093]}


#### 6.1.3.5 XGBoost

In [3]:
# Gradient-boosted trees (seeded, otherwise default settings) on the TF-IDF features.
XGB = XGBClassifier(random_state=1).fit(X_train_vec, y_train)

# Stored as [train score, test score].
nlp_results['XGB'] = [
    XGB.score(features, labels)
    for features, labels in ((X_train_vec, y_train), (X_test_vec, y_test))
]

In [4]:
# Dump every [train score, test score] pair gathered for the all-text models.
print(nlp_results)

{'RFC': [1.0, 0.9714174857734093], 'XGB': [1.0, 0.987842731505432], 'LR': [0.9884563150746944, 0.9811174340403518], 'NB': [0.9457737825777662, 0.9372736678737713], 'SVM': [0.99899760719136, 0.987325400931195]}


### 6.1.4 Storing All Text TF-IDF Results

In [5]:
import pandas as pd

# Build the summary table in one constructor call instead of enlarging an empty
# frame cell by cell. Each nlp_results entry is a [train score, test score]
# list, so it maps directly onto one row; dict insertion order fixes row order.
tfidf_results_all_text = pd.DataFrame.from_dict(
    {
        'Logistic Regression': nlp_results['LR'],
        'Multinomial Naive Bayes': nlp_results['NB'],
        'Support Vector Machine': nlp_results['SVM'],
        'Random Forest': nlp_results['RFC'],
        'XGBoost': nlp_results['XGB'],
    },
    orient='index',
    columns=['train_acc', 'test_acc'],
)
tfidf_results_all_text

Unnamed: 0,train_acc,test_acc
Logistic Regression,0.988456,0.981117
Multinomial Naive Bayes,0.945774,0.937274
Support Vector Machine,0.998998,0.987325
Random Forest,1.0,0.971417
XGBoost,1.0,0.987843


## 6.2 Title

In [70]:
# Input is the `title` column only; target is the same `fake` label column.
X2, y2 = news['title'], news['fake']

In [71]:
# Preview the first rows of the `title` Series selected above.
X2.head()

0     Donald Trump Sends Out Embarrassing New Year’...
1     Drunk Bragging Trump Staffer Started Russian ...
2     Sheriff David Clarke Becomes An Internet Joke...
3     Trump Is So Obsessed He Even Has Obama’s Name...
4     Pope Francis Just Called Out Donald Trump Dur...
Name: title, dtype: object

### 6.2.1 Train Test Split

In [72]:
# Hold out 20% of the titles for testing; shuffling uses a fixed seed.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

### 6.2.2 Transforming Title to Vectors

In [73]:
# Convert the titles to TF-IDF vectors.

# NOTE(review): TfidfVectorizer is already imported in the notebook's first cell;
# this repeat import is redundant there and could live only at the top.
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training titles only; the test titles are
# transformed with that already-fitted vectorizer.
vectorization2 = TfidfVectorizer()
X_train_vec2 = vectorization2.fit_transform(X_train2)
X_test_vec2 = vectorization2.transform(X_test2)

### 6.2.3 Model Building

In [74]:
# Collects scores for the title-only models: the cells below store
# [train score, test score] under a short model key ('LR', 'NB', ...).
nlp_results_title = {}

#### 6.2.3.1 Logistic Regression

In [75]:
# Logistic regression with default settings on the title TF-IDF features.
LR2 = LogisticRegression().fit(X_train_vec2, y_train2)

# Stored as [train score, test score].
nlp_results_title['LR'] = [
    LR2.score(features, labels)
    for features, labels in ((X_train_vec2, y_train2), (X_test_vec2, y_test2))
]

#### 6.2.3.2 Naive Bayes

In [None]:
# Multinomial Naive Bayes with default settings on the title TF-IDF features.
NB2 = MultinomialNB()
NB2.fit(X_train_vec2, y_train2)

# Stored as [train score, test score].
nlp_results_title['NB'] = [
    NB2.score(features, labels)
    for features, labels in ((X_train_vec2, y_train2), (X_test_vec2, y_test2))
]

#### 6.2.3.3 SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier on the title TF-IDF features.
SVM2 = SVC(kernel='rbf', gamma='scale', random_state=1).fit(X_train_vec2, y_train2)

# Stored as [train score, test score].
nlp_results_title['SVM'] = [
    SVM2.score(features, labels)
    for features, labels in ((X_train_vec2, y_train2), (X_test_vec2, y_test2))
]

#### 6.2.3.4 Random Forest

In [76]:
# Random forest (seeded) on the title TF-IDF features.
RFC2 = RandomForestClassifier(random_state=1).fit(X_train_vec2, y_train2)

# Stored as [train score, test score].
nlp_results_title['RFC'] = [
    RFC2.score(features, labels)
    for features, labels in ((X_train_vec2, y_train2), (X_test_vec2, y_test2))
]

#### 6.2.3.5 XGBoost

In [10]:
# Gradient-boosted trees (seeded, otherwise default settings) on the title TF-IDF features.
XGB2 = XGBClassifier(random_state=1).fit(X_train_vec2, y_train2)

# Stored as [train score, test score].
nlp_results_title['XGB'] = [
    XGB2.score(features, labels)
    for features, labels in ((X_train_vec2, y_train2), (X_test_vec2, y_test2))
]

In [11]:
# Dump every [train score, test score] pair gathered for the title-only models.
print(nlp_results_title)

{'LR': [0.9650779279570588, 0.9443869632695292], 'NB': [0.957964172540904, 0.9408949818934299], 'RFC': [1.0, 0.9425763062596999], 'XGB': [0.9545366358403932, 0.9335230212105535], 'SVM': [0.9959904287654401, 0.9511122607346094]}


### 6.2.4 Storing Title TF-IDF Results

In [12]:
import pandas as pd

# Build the summary table in one constructor call instead of enlarging an empty
# frame cell by cell. Each nlp_results_title entry is a [train score, test score]
# list, so it maps directly onto one row; dict insertion order fixes row order.
tfidf_results_title = pd.DataFrame.from_dict(
    {
        'Logistic Regression': nlp_results_title['LR'],
        'Multinomial Naive Bayes': nlp_results_title['NB'],
        'Support Vector Machine': nlp_results_title['SVM'],
        'Random Forest': nlp_results_title['RFC'],
        'XGBoost': nlp_results_title['XGB'],
    },
    orient='index',
    columns=['train_acc', 'test_acc'],
)
tfidf_results_title

Unnamed: 0,train_acc,test_acc
Logistic Regression,0.965078,0.944387
Multinomial Naive Bayes,0.957964,0.940895
Support Vector Machine,0.99599,0.951112
Random Forest,1.0,0.942576
XGBoost,0.954537,0.933523


# Hyperparameter tuning XGBoost

In [7]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the random search. The three sampling ratios each use
# np.arange(0.5, 1.0, 0.1); because of the float step, a candidate can come out
# as 0.7999999999999999 rather than exactly 0.8.
params = {
    'max_depth': [3, 6, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
    'subsample': np.arange(0.5, 1.0, 0.1),
    'colsample_bytree': np.arange(0.5, 1.0, 0.1),
    'colsample_bylevel': np.arange(0.5, 1.0, 0.1),
}

# Seeded random search scored by accuracy on the all-text training features.
# The number of sampled settings and the CV scheme are left at scikit-learn's defaults.
search = RandomizedSearchCV(
    XGB,
    params,
    scoring='accuracy',
    n_jobs=2,
    random_state=1,
).fit(X_train_vec, y_train)

print(search.best_score_)
search.best_params_

0.987033547209766


{'subsample': 0.7999999999999999,
 'max_depth': 15,
 'learning_rate': 0.3,
 'colsample_bytree': 0.7,
 'colsample_bylevel': 0.7}

In [9]:
# Refit on the whole training split using the best configuration found by the search.
XGB_tuned = XGBClassifier(random_state=1, **search.best_params_).fit(X_train_vec, y_train)

# Stored as [train score, test score], like the other nlp_results entries.
nlp_results['XGB Tuned'] = [
    XGB_tuned.score(features, labels)
    for features, labels in ((X_train_vec, y_train), (X_test_vec, y_test))
]
print("Training Accuracy:", nlp_results['XGB Tuned'][0])
print("Testing Accuracy:", nlp_results['XGB Tuned'][1])

Training Accuracy: 1.0
Testing Accuracy: 0.9879720641489912


Test accuracy went up by only about 0.01 percentage points (0.98784 → 0.98797) after tuning the hyperparameters. Hence, hyperparameter tuning may not be worth the effort.

In [17]:
# ROC AUC of the default and the tuned XGBoost models on the held-out split.
# Column 1 of predict_proba is used as the score for the class labelled 1 in
# `fake` (assumes the two classes are encoded 0/1 — confirm against the data).
y_pred_prob = XGB.predict_proba(X_test_vec)[:, 1]
y_pred_prob2 = XGB_tuned.predict_proba(X_test_vec)[:, 1]

# Go through the `sklearn` package: the notebook's import cell binds `sklearn`
# and `fbeta_score` but no bare `metrics` name, so `metrics.roc_auc_score`
# depends on an import that is not part of that cell.
print('XGB ROC_AUC:', sklearn.metrics.roc_auc_score(y_test, y_pred_prob))
print('XGB Tuned ROC_AUC:', sklearn.metrics.roc_auc_score(y_test, y_pred_prob2))

XGB ROC_AUC: 0.999339845224613
XGB Tuned ROC_AUC: 0.9992480316022218
