In [13]:
from webpage_parser import *
import pandas as pd
import pickle
import json
import requests
import matplotlib.pyplot as plt
import re

In [16]:
df = pd.read_csv('fake_or_real_news.csv')
df = df.drop('Unnamed: 0', 1)
print(df.shape)
df.head()

(6335, 3)


Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [17]:
r = requests.get('https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt')
stopwords = set(r.text.split())

In [28]:
def clean_string(string):
    string.strip(',')
    string.strip('.')
    string.strip('?')
    string.strip('/')
    string.strip('!')
    string.strip('$')
    string.strip('#')
    string.strip('^')
    string.strip('&')
    string.strip(':')
    string.strip(';')
    string.strip('-')
    string.strip('=')
    string.strip('"')
    lst = string.split()
    return " ".join(list(filter(lambda x: filter_word(x), lst)))
    

def filter_word(word):
    if 'http' in word or '@' in word:
        return False
    elif '(' in word or ')' in word:
        return False
    elif word in stopwords:
        return False 
    elif len(word) <= 2:
        return False
    return True  

In [30]:
for i in range(len(df['title'])):
    df['title'][i] = clean_string(df['title'][i])
    df['text'][i] = clean_string(df['text'][i])

In [31]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, Shillman Journalism Fellow ...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry Paris gesture sympathy,U.S. Secretary State John Kerry said Monday wi...,REAL
3,Bernie supporters Twitter erupt anger DNC: 'We...,Kaydee King November 2016 The lesson tonight's...,FAKE
4,The Battle New York: Why This Primary Matters,It's primary day New York front-runners Hillar...,REAL


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix

In [51]:
count_vect_text = CountVectorizer()
count_vect_title = CountVectorizer()

text_counts = count_vect_text.fit_transform(df['text'])
title_counts = count_vect_title.fit_transform(df['title'])

In [52]:
tfidf_transformer = TfidfTransformer()

In [53]:
title_tfidf = tfidf_transformer.fit_transform(title_counts)
text_tfidf = tfidf_transformer.fit_transform(text_counts)

In [54]:
from scipy.sparse import hstack

In [56]:
tfidf = hstack([title_tfidf, text_tfidf])

In [57]:
tfidf.shape

(6335, 75649)

In [58]:
title_vocab = count_vect_title.vocabulary_
text_vocab = count_vect_text.vocabulary_

In [97]:
title_vocab_file = 'title_vocab.sav'
text_vocab_file = 'text_vocab.sav'
pickle.dump(title_vocab, open(title_vocab_file, 'wb'))
pickle.dump(text_vocab, open(text_vocab_file, 'wb'))

In [59]:
(X_tfidf_train, X_tfidf_test, Y_train, Y_test) = train_test_split(tfidf, 
                                                                  df['label'], 
                                                                  test_size=0.2, random_state=1)

In [60]:
gbc = GradientBoostingClassifier().fit(X_tfidf_train, Y_train)
Y_pred = gbc.predict(X_tfidf_test)
confusion_matrix(Y_test, Y_pred) # Only using two Tfidfs

array([[599,  52],
       [ 62, 554]])

In [87]:
(599+554)/(599+554+52+62)

0.9100236779794791

In [68]:
def w2v_sum(tfidf, row, w2v, vocab_lst):
    result = np.array([0.0]*50)
    cur = tfidf[row]
    for i in range(len(cur)):
        if cur[i] > 0 and vocab_lst[i] in w2v.keys():
            result += w2v[vocab_lst[i]]
    return result

In [69]:
dense_title_tfidf = title_tfidf.todense().tolist()
dense_text_tfidf = text_tfidf.todense().tolist()

In [71]:
import numpy as np
with open("glove/glove.6B.50d.txt", "r") as lines:
    w2v = {line.split()[0]: np.array(list(map(float, line.split()[1:])))
           for line in lines}

In [72]:
title_vocab_lst = list(title_vocab.keys())
text_vocab_lst = list(text_vocab.keys())

In [73]:
title_vector = np.array([[0]*50])
text_vector = np.array([[0]*50])
for i in range(tfidf.shape[0]):
    title_v = w2v_sum(dense_title_tfidf, i, w2v, title_vocab_lst)
    title_vector = np.append(title_vector, [title_v], axis = 0)
    text_v = w2v_sum(dense_text_tfidf, i, w2v, text_vocab_lst)
    text_vector = np.append(text_vector, [text_v], axis = 0)

In [78]:
title_vector[1:].shape

(6335, 50)

In [79]:
text_vector[1:].shape

(6335, 50)

In [76]:
tfidf.shape

(6335, 75649)

In [80]:
tfidf_w2v_mat = np.concatenate((tfidf.todense(), title_vector[1:]), axis = 1)
tfidf_w2v_mat = np.concatenate((tfidf_w2v_mat, text_vector[1:]), axis = 1)

In [81]:
tfidf_w2v_mat.shape

(6335, 75749)

In [83]:
from scipy import sparse
sparse_mat = sparse.csr_matrix(tfidf_w2v_mat)

In [84]:
(X_w2v_train, X_w2v_test, Y_w2v_train, Y_w2v_test) = train_test_split(sparse_mat, 
                                                      df['label'], 
                                                      test_size = 0.2, random_state = 1)

In [85]:
gbc_w2v = GradientBoostingClassifier().fit(X_w2v_train, Y_w2v_train)

In [86]:
Y_pred = gbc_w2v.predict(X_w2v_test)
confusion_matrix(Y_w2v_test, Y_pred)

array([[597,  54],
       [ 62, 554]])

In [88]:
(597+554)/(597+554+54+62)

0.9084451460142068

In [89]:
filename = 'title_text_classifier.sav'
pickle.dump(gbc, open(filename, 'wb'))

In [102]:
def read_text(filename):
    text = ""
    with open(filename, "r") as file:
        for line in file.readlines():
            text += clean_string(line)
    return text

def get_tfidf(title, text, title_vocab, text_vocab):
    count_vect_title = CountVectorizer(title_vocab)
    count_vect_text = CountVectorizer(text_vocab)
    title_counts = count_vect_title.fit_transform([title])
    text_counts = count_vect_text.fit_transform([text])
    print(title_counts.shape)
    print(text_counts.shape)

    tfidf_transformer = TfidfTransformer()
    title_tfidf = tfidf_transformer.fit_transform(title_counts)
    text_tfidf = tfidf_transformer.fit_transform(text_counts)
    tfidf = hstack([title_tfidf, text_tfidf])
    return tfidf

title = 'Bill Cosby found guilty of sexual assault in retrial'
text = read_text('text.txt')

In [103]:
title_text_classifier = pickle.load(open('title_text_classifier.sav', 'rb'))
title_vocab = pickle.load(open('title_vocab.sav', 'rb'))
text_vocab = pickle.load(open('text_vocab.sav', 'rb'))

In [104]:
tfidf = get_tfidf(title, text, title_vocab, text_vocab)

news_reliability = title_text_classifier.predict(tfidf)
print('The news is predicted to be: ')
print(news_reliability)


(1, 9)
(1, 417)


ValueError: Number of features of the model must match the input. Model n_features is 75649 and input n_features is 426 