In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, precision_score

import xgboost as xgb
from xgboost.sklearn import XGBClassifier




In [2]:
df = pd.read_csv('../fake_finder_df_full.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,title,label,dc_score,sub,tokens,text_length,lemma,clean_text,vs
0,0,U.S. Secretary of State John F. Kerry said Mon...,Kerry to go to Paris in gesture of sympathy,0,27.93,0.348775,"['U', 'S', 'Secretary', 'of', 'State', 'John',...",439,U S Secretary of State John F Kerry said Monda...,U S Secretary of State John F Kerry said Monda...,0.833
1,1,It's primary day in New York and front-runners...,The Battle of New York: Why This Primary Matters,0,22.23,0.44809,"['It', 's', 'primary', 'day', 'in', 'New', 'Yo...",335,It s primary day in New York and front runner ...,It s primary day in New York and front runner ...,0.745
2,2,A Czech stockbroker who saved more than 650 Je...,‘Britain’s Schindler’ Dies at 106,0,12.74,0.31875,"['A', 'Czech', 'stockbroker', 'who', 'saved', ...",125,A Czech stockbroker who saved more than 650 Je...,A Czech stockbroker who saved more than 650 Je...,0.863
3,3,Hillary Clinton and Donald Trump made some ina...,Fact check: Trump and Clinton at the 'commande...,0,125.54,0.375304,"['Hillary', 'Clinton', 'and', 'Donald', 'Trump...",2424,Hillary Clinton and Donald Trump made some ina...,Hillary Clinton and Donald Trump made some ina...,0.771
4,4,Iranian negotiators reportedly have made a las...,Iran reportedly makes new push for uranium con...,0,42.7,0.407948,"['Iranian', 'negotiators', 'reportedly', 'have...",732,Iranian negotiator reportedly have made a last...,Iranian negotiator reportedly have made a last...,0.815


In [4]:
df.isna().mean()

Unnamed: 0     0.000000
text           0.000000
title          0.000000
label          0.000000
dc_score       0.000000
sub            0.000000
tokens         0.000000
text_length    0.000000
lemma          0.004604
clean_text     0.004604
vs             0.000000
dtype: float64

In [5]:
df.shape

(24107, 11)

In [6]:
df.dropna(inplace=True)

In [9]:
tf = TfidfVectorizer(max_df=0.8, min_df = 3, stop_words = 'english', ngram_range=(1,2))

X = tf.fit_transform(df['clean_text'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [13]:
xgb_0 = xgb.XGBClassifier(max_depth=6, n_jobs = -1, verbosity=1)

xgb_0.fit(X_train, y_train)

print(f'Training score is {xgb_0.score(X_train, y_train)}')
print(f'Test score is {xgb_0.score(X_test, y_test)}')

[02:19:45] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[02:19:46] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 66 extra nodes, 0 pruned nodes, max_depth=6
[02:19:48] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 68 extra nodes, 0 pruned nodes, max_depth=6
[02:19:49] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=6
[02:19:50] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=6
[02:19:52] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 0 pruned nodes, max_depth=6
[02:

[02:20:55] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 72 extra nodes, 0 pruned nodes, max_depth=6
[02:20:56] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=6
[02:20:58] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=6
[02:20:59] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=6
[02:21:00] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=6
[02:21:02] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=6
[02:

Training score is 0.966494415735956
Test score is 0.9393232205367561


In [14]:
xgb_1 = xgb.XGBClassifier(max_depth=8, n_jobs = -1, verbosity=1)

xgb_1.fit(X_train, y_train)



Training score is 0.966494415735956
Test score is 0.9393232205367561


In [15]:
print(f'Training score is {xgb_1.score(X_train, y_train)}')
print(f'Test score is {xgb_1.score(X_test, y_test)}')

Training score is 0.9821636939489915
Test score is 0.9433238873145524


(23996, 546199)

In [16]:
preds = xgb_1.predict_proba(X)

In [23]:
preds.shape

(23996, 2)

In [24]:
xgb_1.classes_

array([0, 1], dtype=int64)

In [29]:
preds[23995]

array([0.03572702, 0.964273  ], dtype=float32)

In [30]:
prob_true = []
prob_fake = []

for i in range(len(df.index)):
    prob_true.append(preds[i][0])
    prob_fake.append(preds[i][1])

df['prob_true'] = prob_true
df['prob_fake'] = prob_fake
    

In [32]:
pred_label = xgb_1.predict(X)

In [33]:
df['pred'] = pred_label

In [34]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,text,title,label,dc_score,sub,tokens,text_length,lemma,clean_text,vs,prob_true,prob_fake,pred
16809,18066,If a gaffe is when a politician speaks the tr...,KRAUTHAMMER Skewers John Kerry On Comment Abou...,1,12.25,0.473765,"['If', 'a', 'gaffe', 'is', 'when', 'a', 'polit...",110,If a gaffe is when a politician speaks the tru...,If a gaffe is when a politician speaks the tru...,0.787,0.003694,0.996306,1
9582,9649,Region: Russia in the World More bang for the ...,More Bang for the Buck | New Eastern Outlook,1,70.66,0.372431,"['Region', 'Russia', 'in', 'the', 'World', 'Mo...",1290,Region Russia in the World More bang for the b...,Region Russia in the World More bang for the b...,0.84,0.080092,0.919908,1
12006,12410,Search form Search From Bad to Worse : Obamas...,From Bad to Worse: Obamas Ransom Payment to I...,1,65.38,0.449996,"['Search', 'form', 'Search', 'From', 'Bad', 't...",1182,Search form Search From Bad to Worse Obama s R...,Search form Search From Bad to Worse Obama s R...,0.802,0.007979,0.992021,1
13126,13764,Print \nPremiums will go up sharply next year...,"Obamacare shrinks employment, drives up premiu...",1,34.53,0.415057,"['Print', 'Premiums', 'will', 'go', 'up', 'sha...",563,Print Premiums will go up sharply next year un...,Print Premiums will go up sharply next year un...,0.797,0.244122,0.755878,1
3605,3610,Here's a look at some recent cases of foreign ...,Detained Americans Fast Facts - CNN.com,0,152.65,0.26135,"['Here', 's', 'a', 'look', 'at', 'some', 'rece...",2961,Here s a look at some recent case of foreign g...,Here s a look at some recent case of foreign g...,0.786,0.942047,0.057953,0
5391,5408,"As he considers a presidential run, Louisiana'...","In Common Core, a larger battle over the size ...",0,35.49,0.421839,"['As', 'he', 'considers', 'a', 'presidential',...",594,As he considers a presidential run Louisiana s...,As he considers a presidential run Louisiana s...,0.908,0.983861,0.016139,0
14997,16070,"\n My Name is Fate (@Destini41) October 29, 2...",Flashback: Clinton campaigns press secretary ...,1,28.37,0.458394,"['My', 'Name', 'is', 'Fate', 'Destini41', 'Oct...",445,My Name is Fate Destini41 October 29 2016 Clin...,My Name is Fate Destini41 October 29 2016 Clin...,0.903,0.015015,0.984985,1
1741,1744,"William’s wife, Patricia, turned the maisonett...",William F. Buckley would have loathed Fox News...,0,148.51,0.467201,"['William', 's', 'wife', 'Patricia', 'turned',...",2859,William s wife Patricia turned the maisonette ...,William s wife Patricia turned the maisonette ...,0.733,0.970454,0.029546,0
16848,18106,Hillarys been coughing to the point of almost...,HILLARYS PHYSICIAN CLAIMS She Has PneumoniaD...,1,16.69,0.440741,"['Hillary', 's', 'been', 'coughing', 'to', 'th...",214,Hillary s been coughing to the point of almost...,Hillary s been coughing to the point of almost...,0.856,0.00228,0.99772,1
13025,13640,FBI Reviewing Clinton Emails Found While Inves...,FBI Reviewing Clinton Emails Found While Inves...,1,35.87,0.41501,"['FBI', 'Reviewing', 'Clinton', 'Emails', 'Fou...",596,FBI Reviewing Clinton Emails Found While Inves...,FBI Reviewing Clinton Emails Found While Inves...,0.923,0.006456,0.993544,1


In [35]:
df[['text', 'title', 'label', 'prob_true', 'prob_fake', 'pred']].to_csv('../datasets/xgb_preds_df.csv')

In [36]:
df[['text', 'title', 'label', 'prob_true', 'prob_fake', 'pred']]

Unnamed: 0,text,title,label,prob_true,prob_fake,pred
0,U.S. Secretary of State John F. Kerry said Mon...,Kerry to go to Paris in gesture of sympathy,0,0.988266,0.011734,0
1,It's primary day in New York and front-runners...,The Battle of New York: Why This Primary Matters,0,0.922406,0.077594,0
2,A Czech stockbroker who saved more than 650 Je...,‘Britain’s Schindler’ Dies at 106,0,0.157023,0.842977,1
3,Hillary Clinton and Donald Trump made some ina...,Fact check: Trump and Clinton at the 'commande...,0,0.910249,0.089751,0
4,Iranian negotiators reportedly have made a las...,Iran reportedly makes new push for uranium con...,0,0.971948,0.028052,0
...,...,...,...,...,...,...
24102,"Don't Diss The Dark Ages Oct 26, 2016 2:50 P...",Don't Diss The Dark Ages,1,0.045484,0.954516,1
24103,"Oct 26, 2016 4:26 PM 0 SHARES \nThere was a su...",Tesla Earnings Smash Expectations After Dramat...,1,0.026493,0.973507,1
24104,"The following video is a must watch, particula...",Rules For Rulers (Or How The World Really Works),1,0.071306,0.928694,1
24105,"YHC-FTSE Oct 26, 2016 5:14 PM \nWould have bee...",Caught On Tape: ISIS Destroys Iraqi Abrams Wit...,1,0.029730,0.970270,1


In [37]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [39]:
f1_score(df['label'], df['pred'])

0.9805993366792873

In [41]:
print(f'The f1 score is {f1_score(df["label"], df["pred"])}')

print(f'The precision score is {precision_score(df["label"], df["pred"])}')

print(f'The recall score is {recall_score(df["label"], df["pred"])}')

The f1 score is 0.9805993366792873
The precision score is 0.9698676265675801
The recall score is 0.9915711996201104
