In [167]:
%matplotlib inline
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib as mpl
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

import en_core_web_sm
nlp = en_core_web_sm.load()

import seaborn as sb

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument 
from gensim.models import Doc2Vec
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

plt.style.use('dark_background')

from datetime import datetime
import pytz
import time

import os
import pickle

__Reading the data__

In [67]:
if os.path.isfile("raw_data.pkl"): 
    with open("raw_data.pkl", "br") as f:
        raw_data = pickle.load(f)
    print("Raw data read from 'raw_data.pkl'.")
else:
    fake_news = pd.read_csv("fake.csv.zip", parse_dates=['published'])
    articles1 = pd.read_csv("articles1.csv.zip")
    articles2 = pd.read_csv("articles2.csv.zip")
    articles3 = pd.read_csv("articles3.csv.zip")
    
    true_news = pd.concat([articles1, articles2, articles3])
    true_news = true_news[['publication', 'title', 'author', 'content']]
    true_news = true_news.loc[true_news.publication.isin(['New York Times'])]
    true_news = true_news[['title', 'author', 'content']]
    true_news['label'] = 0
    
    tz = pytz.timezone('America/Chicago')
    fake_news = fake_news.loc[fake_news.language == 'english']
    fake_news = fake_news.loc[fake_news.published > tz.localize(datetime.strptime('2016-11-01', '%Y-%m-%d'))]
    fake_news = fake_news[['title', 'author', 'text']]
    fake_news.rename(columns={'text':'content'}, inplace=True)
    fake_news['label'] = 1
    
    raw_data = pd.concat([true_news, fake_news])
    raw_data = raw_data.loc[(raw_data.title.isna() == False) & (raw_data.content.isna() == False)]
    raw_data = raw_data.sample(frac=1).reset_index(drop=True)
    
    with open("raw_data.pkl", "bw") as f:
        pickle.dump(raw_data, f)
    print("Raw data written to 'raw_data.pkl'.")

Raw data read from 'raw_data.pkl'.


In [68]:
raw_data.head()

Unnamed: 0,title,author,content,label
0,John Kerry’s Trip to the South Pole: Nazi Root...,Author,Region: USA in the World So why did the US Sec...,1
1,John Kerry Is Said to Side With Diplomats’ Cri...,Mark Landler,WASHINGTON — For a cabinet member whose dep...,0
2,Tim Kaine Compares Donald Trump’s Comments on ...,Yamiche Alcindor,Senator Tim Kaine of Virginia on Sunday compar...,0
3,"Revel in the Bounty of Spring, With a Feast Fr...",Sam Sifton,The first thing Yotam Ottolenghi did before he...,0
4,"In Betsy DeVos for Education, Trump Taps Into ...","Vanessa Friedman, Maggie Haberman and Alan Rap...",Donald J. Trump has reached into Western Mic...,0


__Preprocessing__

In [66]:
stop_words = stopwords.words('english')

def remove_stopwords(corpus):
    corpus = [[word for word in simple_preprocess(doc, min_len=3, deacc=True) if word not in stop_words] for doc in corpus]
    return [' '.join(word) for word in corpus]

def lemmatize(corpus):
    corpus = [[token.lemma_ for token in doc] for doc in nlp.pipe(corpus, batch_size=3000, n_threads=-1)]
    return [' '.join(word) for word in corpus]

In [71]:
time_ = datetime.now()
if os.path.isfile("preprocessed_data.pkl"): 
    with open("preprocessed_data.pkl", "br") as f:
        preprocessed_data = pickle.load(f)
    print("Preprocessed data read from 'preprocessed_data.pkl'.")
    
else:
    preprocessed_data = pd.DataFrame(columns=['title', 'author', 'content'], index=raw_data.index)
    
    preprocessed_data['title'] = remove_stopwords(raw_data.title)
    preprocessed_data['content'] = remove_stopwords(raw_data.content)
    preprocessed_data['author'] = raw_data.author
    preprocessed_data['label'] = raw_data.label
    
    preprocessed_data['title'] = lemmatize(preprocessed_data.title)
    preprocessed_data['content'] = lemmatize(preprocessed_data.content)
    
    with open("preprocessed_data.pkl", "bw") as f:
        pickle.dump(preprocessed_data, f)
    print("Preprocessed data written to 'preprocessed_data.pkl'.")

time_ = datetime.now() - time_

Preprocessed data written to 'preprocessed_data.pkl'.


In [72]:
time_

datetime.timedelta(seconds=1815, microseconds=54989)

In [90]:
def constructTaggedDocument(data):
    sentences = []
    for i, text in data.iteritems():
        sentences.append(TaggedDocument(text.split(), ['Train_' + str(i)]))
    return sentences

In [120]:
X = constructTaggedDocument(preprocessed_data.title)
y = preprocessed_data.label.values

In [129]:
d2v_model = Doc2Vec(min_count=2, window=5, vector_size=50, sample=1e-4, workers=5, epochs=10)
d2v_model.build_vocab(X)

2020-01-16 16:25:44,605 : INFO : collecting all words and their counts
2020-01-16 16:25:44,608 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-01-16 16:25:44,675 : INFO : PROGRESS: at example #10000, processed 87222 words (1339744/s), 11432 word types, 10000 tags
2020-01-16 16:25:44,696 : INFO : collected 13306 word types and 13582 unique tags from a corpus of 13582 examples and 118284 words
2020-01-16 16:25:44,697 : INFO : Loading a fresh vocabulary
2020-01-16 16:25:44,717 : INFO : effective_min_count=2 retains 7051 unique words (52% of original 13306, drops 6255)
2020-01-16 16:25:44,718 : INFO : effective_min_count=2 leaves 112029 word corpus (94% of original 118284, drops 6255)
2020-01-16 16:25:44,758 : INFO : deleting the raw counts dictionary of 13306 items
2020-01-16 16:25:44,760 : INFO : sample=0.0001 downsamples 629 most-common words
2020-01-16 16:25:44,761 : INFO : downsampling leaves estimated 64700 word corpus (57.8% of prior 112029)
2020

In [130]:
d2v_model.train(X, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

2020-01-16 16:27:39,589 : INFO : training model with 5 workers on 7051 vocabulary and 50 features, using sg=0 hs=0 sample=0.0001 negative=5 window=5
2020-01-16 16:27:41,054 : INFO : EPOCH 1 - PROGRESS: at 50.96% examples, 27628 words/s, in_qsize 6, out_qsize 0
2020-01-16 16:27:41,112 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-01-16 16:27:41,115 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-01-16 16:27:41,130 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-01-16 16:27:41,306 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-16 16:27:41,315 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-16 16:27:41,315 : INFO : EPOCH - 1 : training on 118284 raw words (78255 effective words) took 1.7s, 46113 effective words/s
2020-01-16 16:27:42,345 : INFO : EPOCH 2 - PROGRESS: at 42.44% examples, 32590 words/s, in_qsize 7, out_qsize 0
2020-01-16 16:27:43,056 : INFO : worker

In [138]:
train_size = int(0.75 * len(X))
test_size = len(X) - train_size

In [144]:
train_text_arrays = np.zeros((train_size, 50))
test_text_arrays = np.zeros((test_size, 50))

train_labels = np.zeros(train_size)
test_labels = np.zeros(test_size)

for i in range(train_size):
    train_text_arrays[i] = d2v_model.docvecs['Train_' + str(i)]
    train_labels[i] = preprocessed_data.label[i]
    
for i in range(test_size):
    test_text_arrays[i] = d2v_model.docvecs['Train_' + str(train_size + i)]
    test_labels[i] = preprocessed_data.label[train_size + i]

In [152]:
nb_clf = BernoulliNB()
nb_clf.fit(train_text_arrays, train_labels)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [154]:
c = nb_clf.predict(test_text_arrays)

In [162]:
nb_clf.score(train_text_arrays, train_labels)

0.4828347906120571

In [159]:
len(np.where(c == 0)[0])

717

In [161]:
len(c)

2717

In [164]:
nb_clf = GaussianNB()
nb_clf.fit(train_text_arrays, train_labels)

GaussianNB(priors=None, var_smoothing=1e-09)

In [165]:
nb_clf.score(train_text_arrays, train_labels)

0.5491946617579383

In [166]:
nb_clf.score(test_text_arrays, test_labels)

0.5458225984541774

In [305]:
len(preprocessed_data)

train_size = int(0.75 * len(preprocessed_data))
cv_size = int(0.15 * len(preprocessed_data))
test_size = len(preprocessed_data) - train_size - cv_size

vectorizer = CountVectorizer(stop_words='english', min_df=10, binary=True)
dtm_matrix = vectorizer.fit_transform(preprocessed_data.title.values)

train_data = dtm_matrix[:train_size]
train_labels = preprocessed_data.label.values[:train_size]

cv_data = dtm_matrix[train_size:train_size + cv_size]
cv_labels = preprocessed_data.label.values[train_size:train_size + cv_size]

test_data = dtm_matrix[train_size + cv_size:]
test_labels = preprocessed_data.label.values[train_size + cv_size:]

In [306]:
train_data

<10186x1753 sparse matrix of type '<class 'numpy.int64'>'
	with 64801 stored elements in Compressed Sparse Row format>

In [307]:
print('Headline after vectorization: \n{}'.format(train_data[124]))

Headline after vectorization: 
  (0, 1040)	1
  (0, 1747)	1
  (0, 1584)	1
  (0, 270)	1
  (0, 918)	1
  (0, 235)	1
  (0, 1313)	1
  (0, 713)	1


In [308]:
nb_clf = BernoulliNB()
nb_clf.fit(train_data, train_labels)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [320]:
nb_clf.feature_log_prob_

array([[-6.88993069, -7.98854298, -5.68595789, ..., -7.07225225,
        -4.87502767, -6.04263283],
       [-6.41952949, -5.72638231, -6.75600172, ..., -6.75600172,
        -6.75600172, -6.41952949]])

In [326]:
np.exp(nb_clf.predict_proba(cv_data))[:10]

array([[1.        , 2.71828183],
       [2.71827987, 1.00000072],
       [2.71828177, 1.00000002],
       [1.        , 2.71828183],
       [1.        , 2.71828183],
       [2.71828067, 1.00000043],
       [2.71827388, 1.00000292],
       [2.71828143, 1.00000015],
       [2.71828128, 1.0000002 ],
       [1.        , 2.71828183]])

In [325]:
cv_pred[:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1])

In [294]:
nb_clf.score(train_data, train_labels)

0.9987237384645592

In [295]:
cv_pred = nb_clf.predict(cv_data.toarray())

In [296]:
np.sum(cv_pred == cv_labels)/len(cv_pred)

1.0

In [297]:
preprocessed_data.iloc[train_size+3]

title      donald trump win year month day old first full...
author                                          James Staten
content    archives michael television donald trump win y...
label                                                      1
Name: 10189, dtype: object

In [298]:
cv_pred[3]

1

In [299]:
test_pred = nb_clf.predict(test_data.toarray())

In [300]:
np.sum(test_pred == test_labels)/len(test_pred)

1.0

In [301]:
preprocessed_data.iloc[train_size+cv_size+3]

title                 circus hague kiev europe bow migration
author                  Jafe Arnoldski (noreply@blogger.com)
content    november eduard popov fort russ translated arn...
label                                                      1
Name: 12226, dtype: object

In [302]:
test_pred[3]

1

In [394]:
train_size = int(0.8 * len(preprocessed_data))
test_size = len(preprocessed_data) - train_size

vectorizer = CountVectorizer(stop_words='english', min_df=100, binary=True)
dtm_matrix = vectorizer.fit_transform(preprocessed_data.content.values)

train_data = dtm_matrix[:train_size]
train_labels = preprocessed_data.label.values[:train_size]

test_data = dtm_matrix[train_size:]
test_labels = preprocessed_data.label.values[train_size:]

In [395]:
train_data

<10865x5556 sparse matrix of type '<class 'numpy.int64'>'
	with 2529123 stored elements in Compressed Sparse Row format>

In [396]:
nb_clf = BernoulliNB()
nb_clf.fit(train_data, train_labels)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [397]:
nb_clf.score(train_data, train_labels)

0.8237459733087897

In [398]:
test_pred = nb_clf.predict(test_data)

In [399]:
np.sum(test_pred == test_labels)/len(test_pred)

0.8211262421788738

In [433]:
"""feature probabilities for fake news"""
highest_posterior_prob_indices = np.flip(np.argsort(np.exp(nb_clf.feature_log_prob_)[1]))[:20]
lowest_posterior_prob_indices = np.argsort(np.exp(nb_clf.feature_log_prob_)[1])[:20]

In [434]:
nb_clf.feature_count_.shape[1]

5556

In [435]:
arr = np.zeros(nb_clf.feature_count_.shape[1])

for i in highest_posterior_prob_indices:
    arr[i] = 1
vectorizer.inverse_transform(arr)

[array(['come', 'day', 'election', 'know', 'like', 'make', 'new',
        'november', 'people', 'president', 'right', 'say', 'state', 'time',
        'trump', 'use', 'way', 'work', 'world', 'year'], dtype='<U15')]

In [436]:
arr = np.zeros(nb_clf.feature_count_.shape[1])

for i in lowest_posterior_prob_indices:
    arr[i] = 1
vectorizer.inverse_transform(arr)

[array(['broadway', 'bronx', 'dancer', 'exhibition', 'janeiro', 'leather',
        'lineup', 'nytimes', 'onetime', 'onstage', 'opera', 'playoff',
        'rhythm', 'spicer', 'teammate', 'tex', 'tillerson', 'tournament',
        'weekday', 'weekdays'], dtype='<U15')]

In [442]:
np.flip(np.argsort(np.exp(nb_clf.feature_log_prob_)[1]))[:20]

array([4375, 3005, 5079, 3609, 5538, 2910, 3319,  947, 2798, 4741, 5303,
       5513, 1279, 5188, 3366, 5507, 3824, 4261, 5427, 1645])