In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, precision_score

import xgboost as xgb
from xgboost.sklearn import XGBClassifier




In [3]:
df = pd.read_csv('../../fake_finder_df_full.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,title,label,dc_score,sub,tokens,text_length,lemma,clean_text,vs
0,0,U.S. Secretary of State John F. Kerry said Mon...,Kerry to go to Paris in gesture of sympathy,0,27.93,0.348775,"['U', 'S', 'Secretary', 'of', 'State', 'John',...",439,U S Secretary of State John F Kerry said Monda...,U S Secretary of State John F Kerry said Monda...,0.833
1,1,It's primary day in New York and front-runners...,The Battle of New York: Why This Primary Matters,0,22.23,0.44809,"['It', 's', 'primary', 'day', 'in', 'New', 'Yo...",335,It s primary day in New York and front runner ...,It s primary day in New York and front runner ...,0.745
2,2,A Czech stockbroker who saved more than 650 Je...,‘Britain’s Schindler’ Dies at 106,0,12.74,0.31875,"['A', 'Czech', 'stockbroker', 'who', 'saved', ...",125,A Czech stockbroker who saved more than 650 Je...,A Czech stockbroker who saved more than 650 Je...,0.863
3,3,Hillary Clinton and Donald Trump made some ina...,Fact check: Trump and Clinton at the 'commande...,0,125.54,0.375304,"['Hillary', 'Clinton', 'and', 'Donald', 'Trump...",2424,Hillary Clinton and Donald Trump made some ina...,Hillary Clinton and Donald Trump made some ina...,0.771
4,4,Iranian negotiators reportedly have made a las...,Iran reportedly makes new push for uranium con...,0,42.7,0.407948,"['Iranian', 'negotiators', 'reportedly', 'have...",732,Iranian negotiator reportedly have made a last...,Iranian negotiator reportedly have made a last...,0.815


In [5]:
df.isna().mean()

Unnamed: 0     0.000000
text           0.000000
title          0.000000
label          0.000000
dc_score       0.000000
sub            0.000000
tokens         0.000000
text_length    0.000000
lemma          0.004604
clean_text     0.004604
vs             0.000000
dtype: float64

In [6]:
df.shape

(24107, 11)

In [7]:
df.dropna(inplace=True)

In [8]:
tf = TfidfVectorizer(max_df=0.8, min_df = 3, stop_words = 'english', ngram_range=(1,2))

X = tf.fit_transform(df['clean_text'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [9]:
# xgb_0 = xgb.XGBClassifier(max_depth=6, n_jobs = -1, verbosity=1)

# xgb_0.fit(X_train, y_train)

# print(f'Training score is {xgb_0.score(X_train, y_train)}')
# print(f'Test score is {xgb_0.score(X_test, y_test)}')

In [10]:
xgb_1 = xgb.XGBClassifier(max_depth=8, n_jobs = -1, verbosity=1)

xgb_1.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [11]:
print(f'Training score is {xgb_1.score(X_train, y_train)}')
print(f'Test score is {xgb_1.score(X_test, y_test)}')

Training score is 0.9821636939489915
Test score is 0.9433238873145524


In [16]:
preds = xgb_1.predict_proba(X)

preds.shape

xgb_1.classes_

preds[23995]

prob_true = []
prob_fake = []

for i in range(len(df.index)):
    prob_true.append(preds[i][0])
    prob_fake.append(preds[i][1])

df['prob_true'] = prob_true
df['prob_fake'] = prob_fake
    

pred_label = xgb_1.predict(X)

df['pred'] = pred_label

In [34]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,text,title,label,dc_score,sub,tokens,text_length,lemma,clean_text,vs,prob_true,prob_fake,pred
16809,18066,If a gaffe is when a politician speaks the tr...,KRAUTHAMMER Skewers John Kerry On Comment Abou...,1,12.25,0.473765,"['If', 'a', 'gaffe', 'is', 'when', 'a', 'polit...",110,If a gaffe is when a politician speaks the tru...,If a gaffe is when a politician speaks the tru...,0.787,0.003694,0.996306,1
9582,9649,Region: Russia in the World More bang for the ...,More Bang for the Buck | New Eastern Outlook,1,70.66,0.372431,"['Region', 'Russia', 'in', 'the', 'World', 'Mo...",1290,Region Russia in the World More bang for the b...,Region Russia in the World More bang for the b...,0.84,0.080092,0.919908,1
12006,12410,Search form Search From Bad to Worse : Obamas...,From Bad to Worse: Obamas Ransom Payment to I...,1,65.38,0.449996,"['Search', 'form', 'Search', 'From', 'Bad', 't...",1182,Search form Search From Bad to Worse Obama s R...,Search form Search From Bad to Worse Obama s R...,0.802,0.007979,0.992021,1
13126,13764,Print \nPremiums will go up sharply next year...,"Obamacare shrinks employment, drives up premiu...",1,34.53,0.415057,"['Print', 'Premiums', 'will', 'go', 'up', 'sha...",563,Print Premiums will go up sharply next year un...,Print Premiums will go up sharply next year un...,0.797,0.244122,0.755878,1
3605,3610,Here's a look at some recent cases of foreign ...,Detained Americans Fast Facts - CNN.com,0,152.65,0.26135,"['Here', 's', 'a', 'look', 'at', 'some', 'rece...",2961,Here s a look at some recent case of foreign g...,Here s a look at some recent case of foreign g...,0.786,0.942047,0.057953,0
5391,5408,"As he considers a presidential run, Louisiana'...","In Common Core, a larger battle over the size ...",0,35.49,0.421839,"['As', 'he', 'considers', 'a', 'presidential',...",594,As he considers a presidential run Louisiana s...,As he considers a presidential run Louisiana s...,0.908,0.983861,0.016139,0
14997,16070,"\n My Name is Fate (@Destini41) October 29, 2...",Flashback: Clinton campaigns press secretary ...,1,28.37,0.458394,"['My', 'Name', 'is', 'Fate', 'Destini41', 'Oct...",445,My Name is Fate Destini41 October 29 2016 Clin...,My Name is Fate Destini41 October 29 2016 Clin...,0.903,0.015015,0.984985,1
1741,1744,"William’s wife, Patricia, turned the maisonett...",William F. Buckley would have loathed Fox News...,0,148.51,0.467201,"['William', 's', 'wife', 'Patricia', 'turned',...",2859,William s wife Patricia turned the maisonette ...,William s wife Patricia turned the maisonette ...,0.733,0.970454,0.029546,0
16848,18106,Hillarys been coughing to the point of almost...,HILLARYS PHYSICIAN CLAIMS She Has PneumoniaD...,1,16.69,0.440741,"['Hillary', 's', 'been', 'coughing', 'to', 'th...",214,Hillary s been coughing to the point of almost...,Hillary s been coughing to the point of almost...,0.856,0.00228,0.99772,1
13025,13640,FBI Reviewing Clinton Emails Found While Inves...,FBI Reviewing Clinton Emails Found While Inves...,1,35.87,0.41501,"['FBI', 'Reviewing', 'Clinton', 'Emails', 'Fou...",596,FBI Reviewing Clinton Emails Found While Inves...,FBI Reviewing Clinton Emails Found While Inves...,0.923,0.006456,0.993544,1


In [35]:
df[['text', 'title', 'label', 'prob_true', 'prob_fake', 'pred']].to_csv('../datasets/xgb_preds_df.csv')

In [36]:
df[['text', 'title', 'label', 'prob_true', 'prob_fake', 'pred']]

Unnamed: 0,text,title,label,prob_true,prob_fake,pred
0,U.S. Secretary of State John F. Kerry said Mon...,Kerry to go to Paris in gesture of sympathy,0,0.988266,0.011734,0
1,It's primary day in New York and front-runners...,The Battle of New York: Why This Primary Matters,0,0.922406,0.077594,0
2,A Czech stockbroker who saved more than 650 Je...,‘Britain’s Schindler’ Dies at 106,0,0.157023,0.842977,1
3,Hillary Clinton and Donald Trump made some ina...,Fact check: Trump and Clinton at the 'commande...,0,0.910249,0.089751,0
4,Iranian negotiators reportedly have made a las...,Iran reportedly makes new push for uranium con...,0,0.971948,0.028052,0
...,...,...,...,...,...,...
24102,"Don't Diss The Dark Ages Oct 26, 2016 2:50 P...",Don't Diss The Dark Ages,1,0.045484,0.954516,1
24103,"Oct 26, 2016 4:26 PM 0 SHARES \nThere was a su...",Tesla Earnings Smash Expectations After Dramat...,1,0.026493,0.973507,1
24104,"The following video is a must watch, particula...",Rules For Rulers (Or How The World Really Works),1,0.071306,0.928694,1
24105,"YHC-FTSE Oct 26, 2016 5:14 PM \nWould have bee...",Caught On Tape: ISIS Destroys Iraqi Abrams Wit...,1,0.029730,0.970270,1


In [23]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [39]:
f1_score(df['label'], df['pred'])

0.9805993366792873

In [41]:
print(f'The f1 score is {f1_score(df["label"], df["pred"])}')

print(f'The precision score is {precision_score(df["label"], df["pred"])}')

print(f'The recall score is {recall_score(df["label"], df["pred"])}')

The f1 score is 0.9805993366792873
The precision score is 0.9698676265675801
The recall score is 0.9915711996201104


In [18]:
len(xgb_1.predict(X_test))

5999

In [19]:
len(X_test)

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [20]:
print(f'The test f1 score is {f1_score(y_test, xgb_1.predict(X_test))}')

print(f'The test precision score is {precision_score(y_test, xgb_1.predict(X_test))}')

print(f'The test trecall score is {recall_score(y_test, xgb_1.predict(X_test))}')

The test f1 score is 0.9603452297644041
The test precision score is 0.943833104080697
The test trecall score is 0.9774453941120608


In [24]:
confusion_matrix(y_test, xgb_1.predict(X_test))

array([[1542,  245],
       [  95, 4117]], dtype=int64)

In [26]:
sample_article = "ATFORD, England — President Trump, who has demeaned his domestic political rivals for being laughed at around the world, found himself the scorned child on the global playground at a NATO summit here Wednesday, as widely circulated video showed leaders gossiping about and mocking him. The video, captured at a Buckingham Palace reception Tuesday evening, appeared to show Canadian Prime Minister Justin Trudeau, French President Emmanuel Macron, British Prime Minister Boris Johnson and others laughing about Trump’s freewheeling news conferences earlier in the day. “I just watched his team’s jaws drop to the floor,” Trudeau told the others, dropping his hand toward the ground to dramatize his retelling. And so it was Wednesday morning that Trump presented a sulking, brooding president, as he slapped down Trudeau as “two-faced” and engaged with other foreign counterparts at a secluded estate here outside London. As the summit concluded, Trump abruptly canceled a planned news conference, saying that he had already answered so many questions from reporters in other settings during his visit to Britain. He took off for Washington as the House Judiciary Committee’s impeachment hearing was underway. As impeachment inquiry rages at home, Trump unsettles the world stage at NATO. Though his conduct here fit his pattern of disruption at international summits, Trump did not make the fiery threats that have punctuated previous gatherings. NATO leaders were almost giddy as they survived another encounter with Trump with their alliance intact. Trump’s canceled news conference — eliminating one last chance for him to take aim at them — was to many the departure gift. The day’s drama centered on Trump and Trudeau, who previously feuded at the Group of Seven summit in 2018. Asked Wednesday by journalists about Trudeau’s mockery, Trump fired back at the Canadian prime minister. Well, hes two-faced, Trump said of Trudeau. And honestly, with Trudeau, he’s a nice guy. I find him to be a very nice guy. But, you know, the truth is that I called him out on the fact that hes not paying 2 percent, and I guess hes not very happy about it.”"

In [33]:
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer

def tokenize(x):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize()

#sample_article = tokenize(sample_article)

In [34]:
len(sample_article)

354

In [35]:
def text_joiner(x):
    return ' '.join([word for word in x])

sample_article = text_joiner(sample_article)

In [36]:
sample_article

'ATFORD England President Trump who has demeaned his domestic political rivals for being laughed at around the world found himself the scorned child on the global playground at a NATO summit here Wednesday as widely circulated video showed leaders gossiping about and mocking him The video captured at a Buckingham Palace reception Tuesday evening appeared to show Canadian Prime Minister Justin Trudeau French President Emmanuel Macron British Prime Minister Boris Johnson and others laughing about Trump s freewheeling news conferences earlier in the day I just watched his team s jaws drop to the floor Trudeau told the others dropping his hand toward the ground to dramatize his retelling And so it was Wednesday morning that Trump presented a sulking brooding president as he slapped down Trudeau as two faced and engaged with other foreign counterparts at a secluded estate here outside London As the summit concluded Trump abruptly canceled a planned news conference saying that he had already

In [43]:
tf = TfidfVectorizer(stop_words='english')

x_list = [sample_article, 'I do not like Trump or Trudeau', 'Team America World Police', 'Something else Im tired']

x = tf.fit_transform(x_list)

In [46]:
df.columns

Index(['Unnamed: 0', 'text', 'title', 'label', 'dc_score', 'sub', 'tokens',
       'text_length', 'lemma', 'clean_text', 'vs'],
      dtype='object')

In [1]:
df.head()

NameError: name 'df' is not defined