In [1]:
import numpy as np
import pandas as pd
import gensim, logging
from gensim.models.keyedvectors import KeyedVectors


# Load the dataset

In [571]:
# Read the article table from a CSV path relative to the working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# usually means the CSV was written with its index -- confirm whether it
# should be dropped or read with index_col=0.
df = pd.read_csv(filepath_or_buffer='Fake news meta/Fake_news_meta.csv')

In [572]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,doc_id,title,mainText,veracity
0,0,BREAKING STUDY PROVES TRUMP WAS RIGHT … EVERY ...,Results from a new study concluded that millio...,False
1,1,President Trump Signed An Executive Order That...,Illegal immigrants do NOT have the same rights...,False
2,2,New Document Reveals Lynch Instructed Comey to...,"Ah, the infamous tarmac meeting. You know…the...",False
3,3,Newly Leaked Report Shows Hillary DID NOT Win ...,President Trump won the election fair and squa...,False
4,4,KATIE COURIC SPEAKS OUT ON CANCELING CLINTON I...,Katie Couric has come out publicly and corrobo...,False


# Word2vec embedding

In [4]:
# Load pre-trained word2vec vectors from the gzipped binary file under model/.
wv = KeyedVectors.load_word2vec_format("model/GoogleNews-vectors-negative300.bin.gz", binary=True)
# NOTE(review): presumably this normalises the vectors in place so that
# `wv.syn0norm` (read by word_averaging) is populated -- confirm against the
# installed gensim version, since this API differs between major releases.
wv.init_sims(replace=True)

In [354]:
from sklearn.model_selection import train_test_split, cross_val_predict

# Model input is the 'title' column; the target is the 'veracity' column.
# Alternatives left by the author: df['mainText'] as input and
# df['hyperpartisan'] as target.
X, y = df['title'], df['veracity']

In [355]:
def word_averaging(wv, words):
    """Return the unit-length mean of the vectors found for ``words``.

    Parameters
    ----------
    wv : KeyedVectors-like object
        Must expose ``vocab`` (token -> entry carrying ``.index``),
        ``syn0norm`` (matrix indexed by that ``.index``) and ``vector_size``.
    words : iterable
        Tokens and/or ready-made ``np.ndarray`` vectors. Arrays are used
        as-is; tokens missing from ``wv.vocab`` are skipped.

    Returns
    -------
    np.ndarray
        1-D float32 vector. When no vector could be collected a warning is
        logged and an all-zero float32 vector of length ``wv.vector_size``
        is returned, so every code path yields the same dtype and stacking
        the results never upcasts the whole matrix to float64.
    """
    vectors = []

    for word in words:
        # Arrays are tested first: they are passed through unchanged and
        # never looked up in the vocabulary.
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size, dtype=np.float32)

    averaged = np.array(vectors).mean(axis=0)
    return gensim.matutils.unitvec(averaged).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack one averaged vector per document into a 2-D matrix.

    Parameters
    ----------
    wv : KeyedVectors-like object
        Passed through to ``word_averaging``; ``vector_size`` is read here
        only when ``text_list`` is empty.
    text_list : iterable
        Each element is the token sequence of one document.

    Returns
    -------
    np.ndarray
        One row per element of ``text_list``. An empty ``text_list`` yields
        a float32 array of shape ``(0, wv.vector_size)``.
    """
    rows = [word_averaging(wv, post) for post in text_list]
    if not rows:
        # np.vstack raises ValueError on an empty sequence, so return an
        # explicitly shaped empty matrix instead.
        return np.empty((0, wv.vector_size), dtype=np.float32)
    return np.vstack(rows)

In [356]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import PassiveAggressiveClassifier
import nltk

In [557]:
def w2v_tokenize_text(text):
    """Split ``text`` into word tokens, dropping tokens shorter than 2 characters.

    The text is first split into sentences and each sentence into words,
    both with nltk's English tokenizers. Tokens are returned in order of
    appearance as a flat list.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]


# Tokenise the 'title_mainText' column of every row, then average the word
# vectors of each token list into one embedding row.
# NOTE(review): 'title_mainText' is not among the columns shown in the
# df.head() preview -- presumably it is added in a cell that is not part of
# this notebook any more; confirm before a Restart & Run All.
tokenized = df['title_mainText'].apply(w2v_tokenize_text).values
word_average = word_averaging_list(wv, tokenized)


  


In [558]:
# Display the averaged embedding matrix (numpy abbreviates the repr).
word_average

array([[ 0.0360489 ,  0.03188297,  0.04928768, ..., -0.0405544 ,
         0.04483195, -0.06672225],
       [ 0.00804245,  0.02227169,  0.05752258, ..., -0.05678951,
         0.05489797, -0.06118035],
       [ 0.04702037,  0.04448877,  0.03820797, ..., -0.06246035,
         0.04717097, -0.05447618],
       ...,
       [ 0.04949335,  0.01599878,  0.03789001, ..., -0.03503651,
         0.07338407, -0.04990248],
       [ 0.05329169,  0.02957432,  0.05688357, ..., -0.03233018,
         0.04731039, -0.03830012],
       [ 0.05523902,  0.04775526,  0.02667516, ..., -0.04412571,
         0.04560996, -0.04402268]], dtype=float32)

# Combine features

In [474]:
from sklearn.model_selection import train_test_split, cross_val_predict

# Candidate model columns: every column of df1 that is not an identifier,
# raw/pre-processed text, or a label.
# NOTE(review): the column names come from `df1`, which no visible cell
# defines, while the selection below indexes `df` -- confirm both frames
# share these columns.
features = [
    c for c in df1.columns.values
    if c not in {
        'doc_id', 'mainText', 'veracity', 'title_mainText', 'hyperpartisan',
        'mainText_preprocessed', 'title_preprocessed',
        'title_mainText_preprocessed', 'title', 'mainText_nouns', 'mainText_verbs',
    }
]

# NOTE(review): unlike `features`, this list does not exclude 'veracity'
# (the target), and it is not used by any visible cell. The entry
# 'main_text' is spelled differently from the 'mainText' column.
numeric_features = [
    c for c in df1.columns.values
    if c not in {
        'doc_id', 'title', 'main_text', 'mainText_preprocessed',
        'title_mainText', 'title_preprocessed', 'mainText_nouns',
        'mainText_verbs', 'title_mainText_V', 'title_mainText_N',
        'hyperpartisan', 'mainText',
    }
]

X, y = df[features], df['veracity']



In [573]:
# Restrict the feature list to the 'hyperpartisan' column (the list is empty
# when df has no such column). This overwrites any earlier `features` value.
features = [c for c in df.columns.values if c == 'hyperpartisan']


In [476]:
# Rescale the selected feature columns to the [0, 1] range. (The original
# note on this cell says it converts the binary hyperpartisan label to 1/0.)
#
# MinMaxScaler is imported here because no visible cell imports it; without
# this the cell raises NameError on a fresh kernel.
from sklearn.preprocessing import MinMaxScaler

# Other scalers the author tried: StandardScaler(), RobustScaler().
scaler = MinMaxScaler()

# NOTE(review): the scaler is fitted on every row before cross-validation,
# so the held-out folds influence the scaling -- consider putting the scaler
# and classifier in a Pipeline so it is re-fitted per fold.
df1_minmax = scaler.fit_transform(df[features])


  return self.partial_fit(X, y)


In [477]:
# Display the scaled feature matrix (numpy abbreviates the repr).
df1_minmax

array([[0.52631579, 0.        , 0.04347826, ..., 0.        , 0.14285714,
        0.05493911],
       [0.        , 0.00771208, 0.04347826, ..., 0.        , 0.12987013,
        0.02895805],
       [0.        , 0.00257069, 0.04347826, ..., 0.        , 0.09090909,
        0.02056834],
       ...,
       [0.10526316, 0.00257069, 0.08695652, ..., 0.04761905, 0.14285714,
        0.05142084],
       [0.05263158, 0.00257069, 0.04347826, ..., 0.        , 0.19480519,
        0.03355886],
       [0.05263158, 0.01028278, 0.13043478, ..., 0.        , 0.07792208,
        0.03707713]])

In [574]:
# Join the scaled feature columns and the averaged word vectors column-wise,
# scaled features first.
train = np.hstack((df1_minmax, word_average))

In [560]:
# Sanity check: (rows, columns) of the combined feature matrix.
train.shape

(2268, 308)

# Cross Validation

In [566]:
from sklearn.linear_model import LogisticRegression

# Linear-kernel SVM used by the evaluation cells.
# NOTE(review): probability=True is set, but the visible cells only read
# scores -- confirm probability estimates are needed.
text_clf = SVC(
    kernel='linear',
    C=10,
    tol=0.0001,
    max_iter=1000,
    probability=True,
)
# Alternative tried by the author: LogisticRegression(n_jobs=1, C=10)

In [567]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

# 10-fold cross-validation of text_clf on the combined feature matrix,
# scored with the estimator's default scorer (no `scoring` argument given).
# Alternatives left by the author: scoring=['precision_macro', 'recall_macro'],
# and evaluating on `word_average` with `y` instead of `train`.
scores = cross_validate(
    text_clf,
    train,
    df['veracity'],
    cv=10,
    return_train_score=False,
)

# Per-fold test scores.
scores['test_score']



array([0.84210526, 0.8061674 , 0.84581498, 0.76211454, 0.82819383,
       0.74008811, 0.75330396, 0.76106195, 0.78761062, 0.7699115 ])

In [568]:
# Mean of the per-fold test scores.
print(np.mean(scores['test_score']))

0.7896372153337615


In [569]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation scored with macro-averaged F1. This rebinds
# `scores` from the cross_validate result dict to an array of fold scores.
# Alternative left by the author: evaluating on `word_average` with `y`.
scores = cross_val_score(
    text_clf,
    train,
    df['veracity'],
    scoring='f1_macro',
    cv=10,
)
scores



array([0.84199584, 0.80616364, 0.84522627, 0.76107126, 0.82819383,
       0.73844351, 0.75272331, 0.75552885, 0.7832654 , 0.76771031])

In [570]:
# Mean macro-F1 across the folds.
print(np.mean(scores))

0.7880322225971351


# Grid Search

In [545]:
from sklearn.model_selection import GridSearchCV

# Grid over the regularisation strength and the iteration cap of text_clf.
# Options the author left disabled: pipeline-style keys (clf__loss,
# tfidf__use_idf, clf__penalty, clf__kernel) and a wider C range
# [0.01, 0.1, 1, 10, 100, 1000, 1e5, 1e6].
parameters = {
    'C': [1, 10, 100, 1000],
    'max_iter': [100, 500, 1000],
}

# NOTE(review): the search is fitted on `word_average` only, whereas the
# cross-validation cells evaluate on `train` (scaled features + embeddings)
# -- confirm the tuned parameters are meant to transfer.
gs_clf_svm = GridSearchCV(text_clf, parameters, cv=10).fit(word_average, df['veracity'])










In [546]:
# Parameter combination selected by the grid search.
gs_clf_svm.best_params_

{'C': 10, 'max_iter': 1000}