In [20]:
# JVW Fake News Detector
# 
# The "JVW Fake News Detector is a system that tells our users if the article/site they're reading
#is giving them real or fake news. Users of the detector will be able to insert the article/site of 
#interest into a simple and easy to use GUI. Once they click saying that this is the site they want looked at, 
#with help from API's and our large dataset of news entries, the detector will clearly display to our user if 
#the news is fake or fact.


import numpy as np
import pandas as pd
import pickle
import itertools
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score




# reads into out dataset
#we are using a dataset created by Hassan Amin- kaggle.com
#Source Dataset: https://www.kaggle.com/hassanamin/textdb3
df = pd.read_csv('NewsDataset.csv',header=0)
#conversion dictionary- to make our dataset binary
#conversion_dict = {'FAKE': 1, 'REAL':0}
#df['label'] = df['label'].replace(conversion_dict)
#df.label = df.label.map(dict(REAL=0, FAKE=1))
df.label.value_counts()

# Show first 5 columns in our dataset
df.shape
df.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [45]:
#testing to see if transformation into binary has worked/accurate
#dataframe labels the text as FAKE(1) or REAL(0)
labels=df.label
labels.head(10)

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
5    FAKE
6    FAKE
7    REAL
8    REAL
9    REAL
Name: label, dtype: object

In [50]:
#looking to see how many pieces of data are included in our training dataset as a whole
df.shape

(6335, 4)

In [56]:
#checking dataset for empty cells within the rows/columns
df.isnull().any()

Unnamed: 0    False
title         False
text          False
label         False
dtype: bool

In [62]:
#include title

x = df['text']
y = df['label']

In [35]:
#training and testing our data/text
#training our system to recognize the relationship between the text in the dataset

#we have 6335 pieces of data, so the test size will take 35% of that and test and train it
#if results aren't how we like we can make shuffle=TRUE. this will shuffle the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.35, random_state=7)

In [63]:
#CountVectorizer is used to convert a collection of text documents to a vector of term/token counts
#Vectorizer helps to reduce unstructured data from our training dataset. This helps our system by giving our
#implemented algorithms a better chance of understanding what is actually included in the text
countVector = CountVectorizer(stop_words='english')

In [65]:
countTrain = countVector.fit_transform(x_train)

In [66]:
print(countVector)

CountVectorizer(stop_words='english')


In [48]:
#system prints out the matrix values for the variable countTrain
print(countTrain)

  (0, 54372)	1
  (0, 24185)	1
  (0, 38819)	1
  (0, 44311)	1
  (0, 27200)	1
  (0, 16294)	1
  (0, 43343)	1
  (0, 13600)	1
  (0, 12468)	1
  (0, 18947)	1
  (0, 8110)	1
  (0, 26297)	1
  (0, 14772)	1
  (0, 26768)	1
  (0, 10797)	2
  (0, 13430)	1
  (0, 3588)	1
  (0, 34070)	1
  (0, 14430)	1
  (0, 23614)	1
  (0, 10300)	2
  (0, 16825)	1
  (0, 43319)	1
  (0, 34047)	1
  (0, 47187)	1
  :	:
  (4116, 46622)	1
  (4116, 12920)	1
  (4116, 23297)	1
  (4116, 21844)	1
  (4116, 3446)	2
  (4116, 27045)	1
  (4116, 37143)	1
  (4116, 38785)	1
  (4116, 43885)	1
  (4116, 36696)	1
  (4116, 38326)	2
  (4116, 29619)	1
  (4116, 51610)	1
  (4116, 24457)	1
  (4116, 26006)	1
  (4116, 39045)	1
  (4116, 11181)	1
  (4116, 46332)	1
  (4116, 49476)	1
  (4116, 18131)	1
  (4116, 15971)	1
  (4116, 52716)	1
  (4116, 7125)	1
  (4116, 21098)	1
  (4116, 33327)	1


In [67]:
def get_countVector_stats():
    
    print(countTrain.shape)
    print(countVector.vocabulary_)
    
get_countVector_stats()
#our training doc matrix size is (4117, 56148)

steal': 47646, 'wingnut': 54612, 'astroturfing': 4620, 'heileman': 23286, 'allahpundit': 3022, 'interlocutors': 26061, 'trainor': 50598, 'delineated': 13788, 'colloquial': 10682, 'ina': 25098, 'backlogged': 5187, 'anker': 3556, 'proprietorial': 39329, '410': 1016, 'nlsh0ocvdv': 34345, 'rfeb7bkgfm': 42286, 'bard': 5541, 'elie': 16749, 'wiesel': 54477, 'whitegenocidetm': 54374, 'squelch': 47045, 'cameos': 8486, 'picketed': 37509, 'risqué': 42508, 'stuever': 47893, 'vainglorious': 52802, 'accoutered': 2043, 'defray': 13674, 'lodging': 29802, 'paging': 36147, 'kappel': 27539, 'cabo': 8329, 'cayman': 9098, 'jules': 27325, 'witcover': 54708, 'hoffman': 23817, 'bugging': 7996, 'speedboats': 46737, '0843': 79, 'fadavi': 18547, 'riverine': 42542, 'schreck': 43970, 'klapper': 28100, 'lardner': 28730, 'gambrell': 20800, 'karimi': 27566, 'libbed': 29328, 'restrooms': 42012, 'valedictorian': 52809, 'demurrals': 13934, 'blended': 6785, 'dyspeptic': 16243, 'walloons': 53787, 'hahaha': 22524, 'dullard

In [68]:
countTest = countVector.transform(x_test)

In [69]:
#tfidfVectorizer- helps our system to recognize stop-words from any text
#function will identify unnessecary words and once it reaches 0.70 it will delete them so the system can focus on the important parts of the text

tfidfVector=TfidfVectorizer(stop_words='english',max_df=0.7)
tfidfVector_train=tfidfVector.fit_transform(x_train.values.astype('U')) 
tfidfVector_test=tfidfVector.transform(x_test.values.astype('U'))

In [70]:
def get_tfidfVector_stats():
    tfidfVector_train.shape
    print(tfidfVector_train.A[:10])
get_tfidfVector_stats()

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [75]:
tfidfVector_test = tfidfVector.transform(x_test)

In [77]:
#shows us feature names from the training dataset we implemented into the system
print(countVector.get_feature_names()[:10])

['00', '000', '0000', '00000031', '00006', '0001', '0001pt', '0002', '000billion', '000ft']


In [72]:
#feature names of the tfidfVectorizer we implemented into the system
print(tfidfVector.get_feature_names()[:-10])

esleyan', 'wespa', 'wesson', 'west', 'westampton', 'westboro', 'westbound', 'westbrook', 'westbury', 'westchester', 'western', 'westerners', 'westernization', 'westernized', 'westernmost', 'westerville', 'westfall', 'westfield', 'westgate', 'westinghouse', 'westland', 'westminster', 'westmipolitics', 'westpac', 'westward', 'westwards', 'westwick', 'westwingreport', 'westwood', 'wet', 'wetback', 'wetiko', 'wetland', 'wetlands', 'wets', 'wetters', 'wetting', 'wevote', 'wexler', 'weâ', 'wfaa', 'wfla', 'wfp', 'wfpl', 'wgn', 'wgv', 'wgv8dps', 'wh', 'wha', 'whack', 'whacked', 'whacky', 'whaddya', 'whale', 'whalen', 'whales', 'whaling', 'whammy', 'wharton', 'whatnot', 'whatreallyhappened', 'whats', 'whatsapp', 'whatsoever', 'whatsofuckingever', 'whatta', 'whatâ', 'wheat', 'wheaton', 'whedon', 'wheel', 'wheelchair', 'wheeled', 'wheeler', 'wheelhouse', 'wheeling', 'wheels', 'wheezing', 'whelan', 'whereabouts', 'whereever', 'wherewithal', 'whet', 'whetsel', 'whetted', 'whew', 'whichever', 'whiff

In [73]:
from sklearn.linear_model import LogisticRegression
#Logistic Regression Classification- a technique for machine learning problems. Teachings system to classify new data into 
# two class values: i.e. for our project=FAKE or REAL

#df = df.dropna()

logregClassifier = LogisticRegression()
logregClassifier.fit(tfidfVector_train, y_train)


LogisticRegression()

In [61]:
logregClassifier.score(tfidfVector_test, y_test)

0.9165915238954012

In [74]:
#Pipelines are set up with the fit/transform/predict functionality, so that we can fit the whole pipeline to the training
#data and transform to the test data without having to do it individually for everything you do

#NB=Nominal Bore:
pip = Pipeline([('tfidfVector',TfidfVectorizer(stop_words='english')),
             ('nb', MultinomialNB())])

In [76]:
pip.fit(x_train, y_train)

Pipeline(steps=[('tfidfVector', TfidfVectorizer(stop_words='english')),
                ('nb', MultinomialNB())])

In [46]:
score = pip.score(x_test,y_test)
print('Accuracy Score of Pipeline Implementation:', score)

Accuracy Score of Pipeline Implementation: 0.8214607754733995


In [49]:
#Confusion Matrix- table used to show performance of a dataset within a system

y_pred=logregClassifier.predict(tfidfVector_test)

confusion_matrix(y_test,y_pred,labels=['FAKE','REAL'])

#with the dataset we are using, the confusion matrix is helping us
#to identify that the dataset contains 1056 true positives,
#977 true negatives, 119 false positives, and 66 false negatives

array([[1056,   66],
       [ 119,  977]], dtype=int64)

In [53]:
applyClassifier = Pipeline([
    ('vector', tfidfVector),
    ('Logistic Regression', LogisticRegression(max_iter=50))])
applyClassifier.fit(x_train, y_train)

Pipeline(steps=[('vector', TfidfVectorizer(max_df=0.7, stop_words='english')),
                ('Logistic Regression', LogisticRegression(max_iter=50))])

In [57]:
with open('detectionModel.pkl','wb') as handle:
    pickle.dump(pip, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [60]:
var = input("Please enter text here: ")
def fakeNewsDetection(var):
    load_model = pickle.load(open('detectionModel.pkl','rb'))
    prediction = load_model.predict([var])
    
    return(print("The given statement you entered is ", prediction[0]))
    # return "hi"
if __name__=='__main__':
    fakeNewsDetection(var)


The given statement you entered is  REAL
