In [94]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack

import textstat
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import stanfordnlp
from textblob import TextBlob

from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB 
from sklearn.naive_bayes import MultinomialNB 


In [95]:
df = pd.read_csv('../datasets/buzzfeed-v02-originalLabels.txt', sep='\t', header=None )
df.head()

Unnamed: 0,0,1,2,3,4,5
0,1204,http://occupydemocrats.com/2016/09/23/donald-t...,mostly true,"A few days ago, DonaldTrump despicable spawn ...",occupydemocrats.com,buzzfeed
1,1157,http://occupydemocrats.com/2016/09/19/just-50-...,mostly true,A group of over fifty former intelligence off...,occupydemocrats.com,buzzfeed
2,1165,http://occupydemocrats.com/2016/09/20/proof-tr...,mixture of true and false,A new investigation has determined that Donal...,occupydemocrats.com,buzzfeed
3,1199,http://occupydemocrats.com/2016/09/23/breaking...,mostly true,A new video has emerged of the moments before...,occupydemocrats.com,buzzfeed
4,1210,http://occupydemocrats.com/2016/09/26/wwii-vet...,mostly true,A ninety-six-year-old World War II veteran an...,occupydemocrats.com,buzzfeed


In [96]:
df[2].value_counts(normalize = True)

mostly true                  0.789855
mixture of true and false    0.123188
mostly false                 0.046377
no factual content           0.040580
Name: 2, dtype: float64

In [97]:
df = df[df[2] != 'no factual content']

df = df[[1, 2, 3]]

df.columns = ['URL', 'label', 'text']

df = df[['label', 'URL', 'text']]

In [98]:
df.head()

Unnamed: 0,label,URL,text
0,mostly true,http://occupydemocrats.com/2016/09/23/donald-t...,"A few days ago, DonaldTrump despicable spawn ..."
1,mostly true,http://occupydemocrats.com/2016/09/19/just-50-...,A group of over fifty former intelligence off...
2,mixture of true and false,http://occupydemocrats.com/2016/09/20/proof-tr...,A new investigation has determined that Donal...
3,mostly true,http://occupydemocrats.com/2016/09/23/breaking...,A new video has emerged of the moments before...
4,mostly true,http://occupydemocrats.com/2016/09/26/wwii-vet...,A ninety-six-year-old World War II veteran an...


In [99]:
df_2 = pd.read_csv('../datasets/snopes_checked_v02.csv', encoding='iso-8859-1' )
df_2.sample(10)

Unnamed: 0,fact_rating_phase1,snopes_url_phase1,article_title_phase1,article_category_phase1,article_date_phase1,article_claim_phase1,article_origin_url_phase1,index_paragraph_phase1,page_is_first_citation_phase1,error_phase2,original_article_text_phase2,article_title_phase2,publish_date_phase2,author_phase2,Jerry-label,Jill-label,Fatemeh-label,notes,original order,Agreement
60,FALSE,https://www.snopes.com/fact-check/gates-of-hea...,Gates of Heaven Photograph,Fauxtography,22-Jan-16,Scientists are stumped by a photograph taken b...,http://www.eso.org/public/images/eso1119a/,3,False,No Error,VST image of the star-forming region Messier 1...,VST image of the star-forming region Messier 17,,Information Eso.Org,right,irrelevant,context,photo,61,x
63,FALSE,https://www.snopes.com/fact-check/fbi-wasserma...,Did The Federal Bureau of Investigation Seize ...,Politics,26-Jul-17,FBI agents raided Rep. Debbie Wasserman-Schult...,http://archive.is/aBCGK,1,False,No Error,"Ever since Trump got into office, liberals con...",FBI Raided Wasserman-Schultzs House After Shes...,7/26/2017,,right,right,right,,64,
44,mixture,https://www.snopes.com/fact-check/emordnilap/,Emordnilap,Language,13-Dec-14,"""Emordnilap"" is a real word referring to words...",http://www.merriam-webster.com/dictionary/word,3,False,No Error,The word word has a wide range of meanings and...,Definition of Word by Merriam-Webster,,Michael Mccann,irrelevant,irrelevant,ambiguous,context/irrelevant,45,
179,mostly false,https://www.snopes.com/fact-check/senate-vote-...,Senate Voted 51-48 to Repeal the Affordable Ca...,Uncategorized,27-Jan-17,The senate voted 51-48 to repeal the Affordabl...,https://www.congress.gov/bill/115th-congress/s...,3,True,No Error,Array ( => 2017-01-13 [displayText] => Passed...,S.Con.Res.3 - 115th Congress (2017-2018): A co...,,Enzi,context,context,n,,180,
252,mixture,https://www.snopes.com/fact-check/members-cong...,Do Members of Congress Enjoy Free Health Care?,Politics,15-Mar-17,"Congressional members receive free, high-quali...",http://www.factcheck.org/2013/05/congress-and-...,2,False,No Error,Q: Is it true that there are bills in Congress...,Congress and an Exemption from Obamacare?,3/13/2005,Lori Robertson,debunking,debunking,n,,253,
157,mostly true,https://www.snopes.com/fact-check/bills-introd...,Bills Introduced by Republicans While We Were ...,Politics,7-Mar-17,Legislation introduced by Republicans during P...,http://archive.is/zXeMT,1,True,No Error,The Republican-led Congress is wasting no time...,"While we were distracted by Trump, Republicans...",5/17/2003,,right,right,n,,158,
168,mostly false,https://www.snopes.com/fact-check/peanut-butte...,Three Reasons You Should Stop Eating Peanut Bu...,Science Medical,31-Mar-17,"Three additives in peanut butter cups, soy lec...",http://archive.is/W7VJ5,1,False,No Error,"If youre reading this, Ill assume youve had a ...",3 Reasons Why You Should Stop Eating Peanut Bu...,3/30/2017,,right,right,n,,169,
144,mostly false,https://www.snopes.com/fact-check/thomas-ediso...,Thomas Edisons Mom Lied about a Letter Expelli...,History,1-Dec-16,Thomas Edisons mother lied about the contents ...,https://www.nps.gov/edis/learn/historyculture/...,4,True,No Error,Young Thomas Edison. NPS Photo Thomas Alva Edi...,Thomas Edison National Historical Park (U.S. N...,,,context,context,n,,145,
297,TRUE,https://www.snopes.com/fact-check/woman-prosec...,Woman Prosecuted for Laughing During Jeff Sess...,Legal Affairs,3-May-17,Code Pink member Desiree Fairooz was prosecute...,https://www.nytimes.com/2017/05/03/us/code-pin...,2,False,No Error,A jury on Wednesday convicted three Code Pink ...,A Code Pink Protester Laughs Over a Trump Nomi...,3/17/2005,,right,right,n,,298,
108,FALSE,https://www.snopes.com/fact-check/greenpeace-a...,Did a Greenpeace Activist Get His Arm Bitten O...,Fake News,2-Feb-18,A Greenpeace activist had his arm bitten off w...,https://archive.is/Mg9Ac,1,True,No Error,An environmental activist was almost killed Ye...,Greenpeace activist gets arm bitten off after ...,2/18/2002,,right,right,n,,109,


In [100]:
df_2.shape

(312, 20)

In [101]:
df_2['original_article_text_phase2'][1]

'As of right now I dont know what kinds of animals they are using.\r\r Kenn\r\rAnimal testing for cosmetics is a hot topic in the beauty industry, with most consumers and many companies deciding to avoid products made with animal testing. After more than two decades following a no testing on animals policy, cosmetics giants Avon, Mary Kay, and Estee Lauder have resumed the practice without letting consumers know. In the late 1980s, PETAs Avon Killing campaign eventually led to the company banning animal testing for their products, with Mary Kay and Este Lauder following suit.\r\rAccording to a press release from Rush PR, the news was broken after animal rights organization PETA removed the companies from their cruelty-free list after discovering that they had returned to using testing on animals for their products. The three companies began the tests again in order to satisfy regulations from the Chinese government so they could sell their products in China. However, PETA contends that

In [102]:
df_2['fact_rating_phase1'].value_counts(normalize = True)

mixture         0.230769
mostly true     0.227564
TRUE            0.208333
mostly false    0.169872
FALSE           0.163462
Name: fact_rating_phase1, dtype: float64

In [103]:
df_2 = df_2[['fact_rating_phase1', 'snopes_url_phase1', 'original_article_text_phase2']]

df_2.columns = ['label', 'URL', 'text']

df_2.head()

Unnamed: 0,label,URL,text
0,mixture,https://www.snopes.com/fact-check/elizabeth-wa...,"Elizabeth Warren, the Harvard Law School profe..."
1,mixture,https://www.snopes.com/fact-check/the-company-...,As of right now I dont know what kinds of anim...
2,mixture,https://www.snopes.com/fact-check/hillary-clin...,\r\r\r\r\r\r Former Secretary of State Hillar...
3,mixture,https://www.snopes.com/fact-check/light-fingered/,Crushing Lamps\r\rCrushing and breaking fluore...
4,mixture,https://www.snopes.com/fact-check/va-hospital-...,"This was their response: ""Logistically, we cou..."


In [104]:
df = pd.concat([df, df_2])

df.sample(10)

Unnamed: 0,label,URL,text
49,mixture,https://www.snopes.com/fact-check/trump-anti-g...,Signup to receive a daily roundup of the top L...
2,mixture of true and false,http://occupydemocrats.com/2016/09/20/proof-tr...,A new investigation has determined that Donal...
1292,mostly true,http://cnn.it/2cRGTGk,Washington Hillary Clinton is leading Donald ...
37,mixture of true and false,http://cnn.it/2deCyLx,Former President George H.W. Bush said Monday...
569,mostly false,http://www.addictinginfo.org/2016/09/19/proof-...,I woke up this morning to find a variation of ...
491,mostly true,http://politi.co/2d4NwH7,Former New York City Mayor Rudy Giuliani rippe...
366,mostly true,http://cnn.it/2dnoYpa,Critical Counties is a CNN series exploring 11...
768,mostly true,http://politi.co/2d8M4PU,"On Media Blog Archives Select Date? December, ..."
519,mostly true,http://occupydemocrats.com/2016/09/22/just-bar...,"He?s basically a bully, but like all bullies,h..."
717,mixture of true and false,http://rightwingnews.com/hillary-clinton-2/nav...,"Navy Seal Does What Others REFUSE To, Fact Che..."


In [105]:
df.index = range(0, len(df.index))

In [106]:
df.to_csv('../combined_bfsnopes.csv')

### Feature engineering

In [107]:
dc_score = []
for i in df.index:
    score = textstat.dale_chall_readability_score(df['text'][i])
    dc_score.append(score)

df['dc_score'] = dc_score



In [108]:
subjectivity = []
for i in df.index:
    text = TextBlob(df['text'][i])
    sub = text.sentiment
    subjectivity.append(sub[1])

df['sub'] = subjectivity

In [109]:
analyzer = SentimentIntensityAnalyzer()
vs_list = []
for i in df.index:
    vs = analyzer.polarity_scores(df['text'][i])
    vs_list.append(vs['neu'])

df['vs'] = vs_list

In [110]:
df.sample(5)

Unnamed: 0,label,URL,text,dc_score,sub,vs
1316,mostly true,http://www.addictinginfo.org/2016/09/21/dixie-...,With this presidential election being as absol...,7.12,0.522531,0.801
732,mostly true,http://politi.co/2doAYGE,"On Media Blog Archives Select Date? December, ...",7.66,0.394545,0.826
1144,mostly true,http://www.addictinginfo.org/2016/09/19/chelse...,There may be a few women out there who enjoy a...,6.55,0.365212,0.748
1067,mostly true,http://politi.co/2cBjhCQ,The ad praises Donald Trump for ?doing what ot...,10.05,0.446014,0.814
1261,mostly true,http://cnn.it/2cPYxdT,Washington With two months to go before Elect...,7.2,0.410722,0.865


In [111]:
def tokenize(x):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(x)

df['tokens'] = df['text'].map(tokenize)

text_length = []
for i in df.index:
    length = len(df['tokens'][i])
    text_length.append(length)



    
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in x])

df['lemma'] = df['tokens'].map(lemmatize)

In [112]:
average_word_lengths = []

for i in df.index:
    token_lens = [len(token) for token in df['tokens'][i]]
    average_word_lengths.append(np.mean(token_lens))

    
df['avg_word_len'] = average_word_lengths

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [113]:
df['text_length'] = text_length

In [115]:
df['label'].value_counts(normalize=True)

mostly true                  0.710092
mixture of true and false    0.103976
mostly false                 0.070948
mixture                      0.044037
TRUE                         0.039755
FALSE                        0.031193
Name: label, dtype: float64

In [116]:
df['label'] = df['label'].map({'mostly true' : 2, 'mixture of true and false' : 1, 'mostly false' : 0, 'mixture' : 1, 'TRUE' : 3, 'FALSE' : 0})

In [117]:
df['label'].value_counts(normalize = True)

2    0.710092
1    0.148012
0    0.102141
3    0.039755
Name: label, dtype: float64

In [118]:
df.head()

Unnamed: 0,label,URL,text,dc_score,sub,vs,tokens,lemma,avg_word_len,text_length
0,2,http://occupydemocrats.com/2016/09/23/donald-t...,"A few days ago, DonaldTrump despicable spawn ...",8.72,0.273561,0.696,"[A, few, days, ago, DonaldTrump, despicable, s...",A few day ago DonaldTrump despicable spawn twe...,5.031359,287
1,2,http://occupydemocrats.com/2016/09/19/just-50-...,A group of over fifty former intelligence off...,9.16,0.381547,0.865,"[A, group, of, over, fifty, former, intelligen...",A group of over fifty former intelligence offi...,5.323741,556
2,1,http://occupydemocrats.com/2016/09/20/proof-tr...,A new investigation has determined that Donal...,8.02,0.48767,0.855,"[A, new, investigation, has, determined, that,...",A new investigation ha determined that DonaldT...,4.937962,677
3,2,http://occupydemocrats.com/2016/09/23/breaking...,A new video has emerged of the moments before...,7.23,0.490105,0.77,"[A, new, video, has, emerged, of, the, moments...",A new video ha emerged of the moment before an...,4.492105,380
4,2,http://occupydemocrats.com/2016/09/26/wwii-vet...,A ninety-six-year-old World War II veteran an...,8.21,0.426849,0.74,"[A, ninety, six, year, old, World, War, II, ve...",A ninety six year old World War II veteran and...,4.645331,953


In [119]:
df.dropna(inplace = True)

In [120]:
cvec = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_df = .8, min_df = 3)
X = cvec.fit_transform(df['lemma'])

X_full =  hstack((X,np.array(df['sub'])[:,None]))

X_full =  hstack((X,np.array(df['vs'])[:,None]))

X_full =  hstack((X,np.array(df['dc_score'])[:,None]))

X_full =  hstack((X,np.array(df['text_length'])[:,None]))

X_full =  hstack((X,np.array(df['avg_word_len'])[:,None]))

X_full.shape

y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X_full, y,
                                                    random_state=42, stratify = y)

nb = MultinomialNB()

nb = MultinomialNB()
nb.fit(X_train, y_train)
print(nb.score(X_train, y_train))
print(nb.score(X_test, y_test))

0.898042414355628
0.6650366748166259


In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42, stratify = y)

nb = MultinomialNB()

nb = MultinomialNB()
nb.fit(X_train, y_train)
print(nb.score(X_train, y_train))
print(nb.score(X_test, y_test))

0.901305057096248
0.6699266503667481


In [122]:
rf = RandomForestClassifier()

pgrid_rf = {
    'n_estimators' : [5, 25, 50, 100],
    'max_depth' : [4, 5, 6, 8],
    'max_features' : [None, 8, 16]
    
}

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y,
                                                    random_state=42, stratify = y)


gs_rf = GridSearchCV(rf, pgrid_rf, cv = 5, n_jobs = -1, verbose = 1)

gs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.9min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [127]:
gs_rf.score(X_train, y_train)


0.7251223491027733

In [128]:
gs_rf.score(X_test, y_test)


0.706601466992665