In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

import xgboost
from xgboost.sklearn import XGBClassifier

In [2]:
df = pd.read_csv('../../datasets/Fake_finder/clean_text.csv')

df.drop(columns = 'Unnamed: 0', inplace = True)

df.isna().mean()

clean_text    0.004604
label         0.000000
dtype: float64

In [3]:
df.dropna(inplace = True)

In [4]:
tf = TfidfVectorizer(max_df=0.8, min_df = 3, stop_words = 'english', ngram_range=(1,2))

X = tf.fit_transform(df['clean_text'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [5]:
xgb = xgboost.XGBClassifier(max_depth=8, n_jobs = -1, verbosity=1)

xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [6]:
print(f'Training score is {xgb.score(X_train, y_train)}')
print(f'Test score is {xgb.score(X_test, y_test)}')

Training score is 0.983052731010724
Test score is 0.9444907484580763


In [7]:
probs = xgb.predict_proba(X)
pred_label = xgb.predict(X)

df['pred'] = pred_label


prob_true = []
prob_fake = []

for i in range(len(df.index)):
    prob_true.append(probs[i][0])
    prob_fake.append(probs[i][1])

df['prob_true'] = prob_true
df['prob_fake'] = prob_fake

In [8]:
print(f'The f1 score is {f1_score(df["label"], df["pred"])}')


print(f'The precision score is {precision_score(df["label"], df["pred"])}')


print(f'The recall score is {recall_score(df["label"], df["pred"])}')

The f1 score is 0.9541732509696883
The precision score is 0.9806584969732762
The recall score is 0.9290809903483005


In [9]:
df.columns

Index(['clean_text', 'label', 'pred', 'prob_true', 'prob_fake'], dtype='object')

In [10]:
df.columns = ['clean_text', 'label', 'pred', 'prob_fake', 'prob_true']

In [11]:
df.sort_values(by = 'prob_fake', ascending=False).tail(4)

Unnamed: 0,clean_text,label,pred,prob_fake,prob_true
2191,CNN Politicians journalist and conservative ac...,1,1,0.001247,0.998753
1559,The strength of anti establishment fervor in t...,1,1,0.000997,0.999003
2587,WASHINGTON Facing an increasingly narrow path ...,1,1,0.000993,0.999007
1794,Hillary Clinton and Donald Trump look to rebou...,1,1,0.00074,0.99926


In [12]:
df.sort_values(by = 'prob_fake', ascending=False).head(4)

Unnamed: 0,clean_text,label,pred,prob_fake,prob_true
23640,Subscribe John Pilger did an interview with As...,-1,-1,0.999566,0.000434
20304,Billionaire Globalist Soros Exposed a Hidden H...,-1,-1,0.99955,0.00045
12261,Billionaire Globalist Soros Exposed a Hidden H...,-1,-1,0.99955,0.00045
24018,We Are Change In the last few day we have seen...,-1,-1,0.999381,0.000618


1794, 2587, 1559, 2191

24018, 20304, 12261, 23640

In [13]:
df['clean_text'][20304]

'Billionaire Globalist Soros Exposed a Hidden Hand Behind Trump Protests Provoking US Color Revolution Billionaire globalist financier George Soros MoveOn org ha been revealed to be a driving force beh Print Email http humansarefree com 2016 11 billionaire globalist soros exposed a html Billionaire globalist financier George Soros MoveOn org ha been revealed to be a driving force behind the organizing of nationwide protest against the election of Donald Trump exposing the protest to largely be an organized top down operation and not an organic movement of concerned Americans taking to the street a reported by the mainstream medium Wednesday saw protest in the street of at least 10 major U S city Chicago New York Los Angeles Philadelphia Boston Washington D C Portland Ore St Paul Minn Seattle and several other city saw protest according to USA Today In light of the protest and rioting that have transpired since the election of Trump a closer analysis of the dynamic at play is warranted 

In [14]:
df['clean_text'][2191]

'CNN Politicians journalist and conservative activist will swoop into Des Moines this weekend for a major gathering of Republicans that s widely viewed a the first cattle call this cycle for the GOP presidential race Close to 10 potential candidate will speak at the daylong Iowa Freedom Summit on Saturday co hosted by the group Citizens United and Rep Steve King a high profile Republican from Iowa with serious clout among social conservative With Iowa the first state to vote in the presidential nominating season it s considered a must stop for White House hopeful on both side of the aisle and this weekend give 2016 player a chance to roll out their message to core caucus goer strategist say Among those expected to speak are New Jersey Gov Chris Christie Sen Ted Cruz of Texas Wisconsin Gov Scott Walker former Texas Gov Rick Perry former Arkansas Gov Mike Huckabee renowned neurosurgeon Ben Carson former Sen Rick Santorum of Pennsylvania and former Hewlett Packard CEO Carly Fiorina Notabl

In [23]:
df_close = df[(df['prob_fake'] < .55) & (df['prob_fake'] > .45 )]

In [24]:
df_close.index

Int64Index([   41,    78,    87,    88,   125,   129,   192,   203,   226,
              293,
            ...
            22387, 22448, 22850, 22979, 23085, 23105, 23143, 23151, 23268,
            23483],
           dtype='int64', length=257)

78, 226, 22850

In [51]:
df_close[['clean_text', 'label', 'pred']].iloc[203]

clean_text    BNI Store Nov 6 2016 UK Muslim family who conv...
label                                                        -1
pred                                                          1
Name: 10748, dtype: object

In [52]:
df_close['clean_text'].iloc[203]

'BNI Store Nov 6 2016 UK Muslim family who converted to Christianity forced to flee Manningham home a violence and threat by Muslims against them continue to escalate A British Christian convert who said he suffered seven year of persecution from Muslims ha been forced to flee his home under armed guard amid fear for his safety Nissar Hussain wa with his family when police arrived and moved him to a safe place The Telegraph and Argus Mr Hussain said the culmination of the extreme persecution had devastated his family and the dramatic arrival of armed police wa a complete surprise My family are distraught and extremely traumatised to be leaving said Mr Hussain But when your life is at stake there is no other choice Mr Hussain converted to Christianity 20 year ago but say in recent year he ha been subjected to harassment and violence by section of the Islamic community This extreme persecution by certain people in the Muslim community because we are convert ha broken u a a family he said

In [46]:
df_close[['clean_text', 'label', 'pred']].iloc[226]

clean_text    WARS AND RUMORS OF WARS Russia unveils Satan 2...
label                                                        -1
pred                                                          1
Name: 15394, dtype: object

In [39]:
df_close['clean_text'].iloc[226]

'WARS AND RUMORS OF WARS Russia unveils Satan 2 missile Nuclear weapon could wipe out France or Texas report say Published 7 hour ago CNN A Russian missile design company ha unveiled the first image of a new weapon in Russia s arsenal the Sarmat intercontinental ballistic missile nicknamed Satan 2 The RS 28 Sarmat rocket is capable of wiping out part of the earth the size of Texas or France Russian state news outlet Sputnik reported in May The image wa published by the Makeyev Rocket Design Bureau on it website Russian Deputy Defense Minister Yuri Borsiov said the Sarmat warhead wa capable of destroying target flying across both North and South Poles Russian state news agency TASS reported Tuesday'

In [45]:
df_close['clean_text'][22850]

'The Onion s Special Coverage Of Election Day 2016 Pinned Articles 5 51 PM Report Turnout Fairly Unattractive Throughout Suburban Philadelphia Our correspondent are reporting a steady turnout of homely and hideous resident throughout the suburban Philadelphia area that show no sign of slowing down In Montgomery County polling place are said to be bustling with score of schlubby overweight voter while Bucks and Delaware county were reportedly busy all morning with an influx of unsightly poorly groomed constituent with many of the local oinkers having to wait upwards of an hour in line to cast their ballot Overall turnout in the area appears to be way uglier than analyst initially predicted Compiled from report out of The Onion s Philadelphia bureau a part of our Precinct Watch initiative dedicated to covering all voting right obstacle encountered by the feckless and feeble minded American electorate The Onion 9 51 PM The Onion Calls Upon The Cowardly State Of Iowa To Stop Its Dithering 

In [56]:
df_close[(df_close['label'] == 1) & (df_close['pred'] == -1)].sample(10)

Unnamed: 0,clean_text,label,pred,prob_fake,prob_true
616,In the two year since the horrific marathon bo...,1,-1,0.500809,0.499191
2239,Top Dems want White House to call off Part B d...,1,-1,0.541699,0.458301
2897,Top Dems want White House to call off Part B d...,1,-1,0.541699,0.458301
2326,Atrocities such a the horrific shooting in Cha...,1,-1,0.513872,0.486128
1822,Clinton who resigned from her philanthropy s b...,1,-1,0.505925,0.494075
5344,Here is a look at the life of former Texas Gov...,1,-1,0.507597,0.492403
2935,Top Dems want White House to call off Part B d...,1,-1,0.541699,0.458301
813,Top Dems want White House to call off Part B d...,1,-1,0.541699,0.458301
5470,A Chinese investment of nearly 1bn in Britain ...,1,-1,0.510867,0.489133
2526,When President Obama called for two year of fr...,1,-1,0.518821,0.481179


In [57]:
df_close['clean_text'][5344]

'Here is a look at the life of former Texas Governor Rick Perry March 4 1950Paint Creek Texas James Richard Perry Joseph Ray Perry a farmerAmelia Holt Perry Anita Thigpen Perry November 6 1982 present Sydney GriffinTexas A M University B S 1972 US Air Force 1972 1977 MethodistIs an Eagle Scout Met his wife Anita in elementary school Is the longest serving governor in Texas history Serves in the US Air Force flying transport plane Returns to Prairie Creek Texas to live and work on his father s farm Forms JR Perry Farms with his father Member of the Texas House of Representatives a a Democrat from the 64th District Switches to the Republican Party Commissioner of the Texas Department of Agriculture Lieutenant Governor of Texas Perry is sworn in a governor after resigns to become president of the United States Perry is elected to a four year term Is re elected governor Perry s book On My Honor Why the American Values of the Boy Scouts Are Worth Fighting For is published Perry is elected f

In [35]:
df_close[['clean_text', 'label', 'pred']].iloc[293]

IndexError: single positional indexer is out-of-bounds