## Imports
 **Import the usual suspects. :) **


In [1]:
import nltk
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## The Data

In [2]:
truenews = pd.read_csv("True.csv")
fakenews = pd.read_csv("Fake.csv")

In [3]:
truenews.head()

Unnamed: 0,title,text,subject,date,True/Fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [4]:
fakenews.head()

Unnamed: 0,title,text,subject,date,True/Fake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake


In [5]:
fakenews.describe()

Unnamed: 0,title,text,subject,date,True/Fake
count,23481,23481.0,23481,23481,23481
unique,17903,17455.0,6,1681,1
top,MEDIA IGNORES Time That Bill Clinton FIRED His...,,News,"May 10, 2017",Fake
freq,6,626.0,9050,46,23481


In [6]:
truenews.describe()

Unnamed: 0,title,text,subject,date,True/Fake
count,21417,21417,21417,21417,21417
unique,20826,21192,2,716,1
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017",True
freq,14,8,11272,182,21417


## Combine the 2 DataFrames into a single data frame

In [7]:
news = pd.concat([truenews, fakenews])

In [8]:
news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
title        44898 non-null object
text         44898 non-null object
subject      44898 non-null object
date         44898 non-null object
True/Fake    44898 non-null object
dtypes: object(5)
memory usage: 2.1+ MB


In [9]:
news["Article"] = news["title"] + news["text"]
news.sample(frac = 1)

Unnamed: 0,title,text,subject,date,True/Fake,Article
6509,U.S. obtained evidence after election that Rus...,WASHINGTON (Reuters) - U.S. intelligence agen...,politicsNews,"January 5, 2017",True,U.S. obtained evidence after election that Rus...
1466,"U.S. will admit up to 45,000 refugees next yea...",WASHINGTON (Reuters) - The United States will ...,politicsNews,"September 29, 2017",True,"U.S. will admit up to 45,000 refugees next yea..."
8601,Oops: Ammosexual Playing With Gun During Beng...,Thinking about going to the movies? You might ...,News,"January 22, 2016",Fake,Oops: Ammosexual Playing With Gun During Beng...
15836,Russian Defense Minister meets U.S. envoy to M...,MOSCOW (Reuters) - Russian Defense Minister Se...,worldnews,"November 2, 2017",True,Russian Defense Minister meets U.S. envoy to M...
421,Trump taps Fifth Third lawyer McWilliams to le...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"December 1, 2017",True,Trump taps Fifth Third lawyer McWilliams to le...
...,...,...,...,...,...,...
14720,Serial Plagiarist Does Victory Dance Over Whit...,White people the only lives that DON T matter....,politics,"Jan 2, 2016",Fake,Serial Plagiarist Does Victory Dance Over Whit...
2518,Factbox: Trump on Twitter (July 26) - U.S. Mil...,The following statements were posted to the ve...,politicsNews,"July 26, 2017",True,Factbox: Trump on Twitter (July 26) - U.S. Mil...
22542,Boiler Room #61 – Hello From the Gutter,Tune in to the Alternate Current Radio Network...,US_News,"June 16, 2016",Fake,Boiler Room #61 – Hello From the GutterTune in...
1982,Trump’s ‘Deportation Force’ Storms Unarmed Le...,A Chicago man is in critical condition after t...,News,"March 27, 2017",Fake,Trump’s ‘Deportation Force’ Storms Unarmed Le...


## Data Cleaning

In [10]:
news.isnull().sum()

title        0
text         0
subject      0
date         0
True/Fake    0
Article      0
dtype: int64

In [11]:
from nltk.corpus import stopwords
import string

In [12]:
def Text_processing(s):

    # Check string to see if they are a punctuation
    nopunc = [char for char in s if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # stopwords
    clean_string = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean_string

## Tokenize the Article

In [13]:
news[["Article"]].info

<bound method DataFrame.info of                                                  Article
0      As U.S. budget fight looms, Republicans flip t...
1      U.S. military to accept transgender recruits o...
2      Senior U.S. Republican senator: 'Let Mr. Muell...
3      FBI Russia probe helped by Australian diplomat...
4      Trump wants Postal Service to charge 'much mor...
...                                                  ...
23476  McPain: John McCain Furious That Iran Treated ...
23477  JUSTICE? Yahoo Settles E-mail Privacy Class-ac...
23478  Sunnistan: US and Allied ‘Safe Zone’ Plan to T...
23479  How to Blow $700 Million: Al Jazeera America F...
23480  10 U.S. Navy Sailors Held by Iranian Military ...

[44898 rows x 1 columns]>

In [14]:
%%time
news['Clean Text'] = news['Article'].apply(Text_processing)

Wall time: 2h 25min 33s


In [15]:
news.sample(10)

Unnamed: 0,title,text,subject,date,True/Fake,Article,Clean Text
9575,"JUST IN: Obama Campaign Secretly Paid $972,000...","Well, well, well As a famous reverend once s...",politics,"Oct 29, 2017",Fake,"JUST IN: Obama Campaign Secretly Paid $972,000...","[Obama, Campaign, Secretly, Paid, 972000, Fund..."
15238,Lithuania says east-west schism within EU bene...,WARSAW (Reuters) - Lithuania said a growing ri...,worldnews,"November 9, 2017",True,Lithuania says east-west schism within EU bene...,"[Lithuania, says, eastwest, schism, within, EU..."
5275,"In sweeping move, Trump puts regulation monito...",WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,"February 24, 2017",True,"In sweeping move, Trump puts regulation monito...","[sweeping, move, Trump, puts, regulation, moni..."
10814,CNN REPORTER EMBARRASSES Himself With Idiotic ...,Bob Schieffer is an old-school newsman. There ...,politics,"May 22, 2017",Fake,CNN REPORTER EMBARRASSES Himself With Idiotic ...,"[CNN, REPORTER, EMBARRASSES, Idiotic, Response..."
9845,"Regional crises, not human rights, to dominate...",RIYADH (Reuters) - U.S. President Barack Obama...,politicsNews,"April 20, 2016",True,"Regional crises, not human rights, to dominate...","[Regional, crises, human, rights, dominate, Ob..."
8368,‘Christian’ Dad Beat Daughter With Frozen Bac...,It s always amazing how many folks who claim t...,News,"February 2, 2016",Fake,‘Christian’ Dad Beat Daughter With Frozen Bac...,"[‘Christian’, Dad, Beat, Daughter, Frozen, Bac..."
14923,HERE’S THE LIST Of 25 Governors Who Have Told ...,There is a also a map (below) showing which st...,politics,"Nov 17, 2015",Fake,HERE’S THE LIST Of 25 Governors Who Have Told ...,"[HERE’S, LIST, 25, Governors, Told, Obama, Mus..."
961,Senate Intelligence chairman: Indictments do n...,WASHINGTON (Reuters) - U.S. Senate Intelligenc...,politicsNews,"October 30, 2017",True,Senate Intelligence chairman: Indictments do n...,"[Senate, Intelligence, chairman, Indictments, ..."
9198,WATCH: PRESIDENT TRUMP CHANNELS Successful Dev...,WASHINGTON President Trump said on Thursday ...,politics,"Dec 14, 2017",Fake,WATCH: PRESIDENT TRUMP CHANNELS Successful Dev...,"[WATCH, PRESIDENT, TRUMP, CHANNELS, Successful..."
16332,BREAKING: SOURCE TELLS HOW Anthony Weiner Acci...,A source close to Anthony Weiner s legal team...,Government News,"Oct 31, 2016",Fake,BREAKING: SOURCE TELLS HOW Anthony Weiner Acci...,"[BREAKING, SOURCE, TELLS, Anthony, Weiner, Acc..."


## CountVectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
bow_transformer = CountVectorizer(analyzer=Text_processing).fit(news['Clean Text'])
 
#Total vocab words

print(len(bow_transformer.vocabulary_))

39099


## Bag Of Words

In [18]:
news_bow = bow_transformer.transform(news['Clean Text'])

## CountVectorizer,bow_transformer example

<table border = “1“>
<tr>
<th></th> <th>Message 1</th> <th>Message 2</th> <th>...</th> <th>Message N</th> 
</tr>
<tr>
<td><b>Word 1 Count</b></td><td>0</td><td>1</td><td>...</td><td>0</td>
</tr>
<tr>
<td><b>Word 2 Count</b></td><td>0</td><td>0</td><td>...</td><td>0</td>
</tr>
<tr>
<td><b>...</b></td> <td>1</td><td>2</td><td>...</td><td>0</td>
</tr>
<tr>
<td><b>Word N Count</b></td> <td>0</td><td>1</td><td>...</td><td>1</td>
</tr>
</table>

## Shape

In [19]:
print('Shape of Sparse Matrix: ', news_bow.shape)
print('Amount of Non-Zero occurences: ', news_bow.nnz)

Shape of Sparse Matrix:  (44898, 39099)
Amount of Non-Zero occurences:  44898


## Sparsity

In [20]:
sparsity = (100.0 * news_bow.nnz / (news_bow.shape[0] * news_bow.shape[1]))
print('sparsity: {}'.format(round(sparsity)))

sparsity: 0


## TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(news_bow)
news_tfidf = tfidf_transformer.transform(news_bow)
print(news_tfidf.shape)

(44898, 39099)


## Model Selection

In [22]:
from sklearn.naive_bayes import MultinomialNB
fakenews_detect_model = MultinomialNB().fit(news_tfidf, news['True/Fake'])

## Splitting The Data

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X=news["Article"]
y=news["True/Fake"]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Creating a Pipeline & Training a model

In [26]:
%%time
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=Text_processing)),  
    ('tfidf', TfidfTransformer()),  
    ('classifier', MultinomialNB()),  
])
pipeline.fit(X_train,y_train)

Wall time: 1h 12min 17s


Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function Text_processing at 0x000001ED76FAE708>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

## Predictions

In [27]:
%%time
prediction = pipeline.predict(X_test)

Wall time: 28min 30s


## Results

In [28]:
from sklearn.metrics import classification_report,confusion_matrix

In [32]:
print(classification_report(prediction,y_test))
print()
print(confusion_matrix(prediction,y_test))

              precision    recall  f1-score   support

        True       0.98      0.95      0.97      6640
        Fake       0.96      0.98      0.97      6830

    accuracy                           0.97     13470
   macro avg       0.97      0.97      0.97     13470
weighted avg       0.97      0.97      0.97     13470


[[6330  310]
 [ 144 6686]]


## Testing

In [57]:
Test_news1=news.iloc[[5]]

In [58]:
Test_news1

Unnamed: 0,title,text,subject,date,True/Fake,Article,Clean Text
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017",True,"White House, Congress prepare for talks on spe...","[White, House, Congress, prepare, talks, spend..."


In [59]:
Test_news1=Test_news1.drop("True/Fake",axis=1)

In [60]:
Test_news1

Unnamed: 0,title,text,subject,date,Article,Clean Text
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017","White House, Congress prepare for talks on spe...","[White, House, Congress, prepare, talks, spend..."


In [74]:
pipeline.predict(Test_news1["Article"])

array([' True'], dtype='<U5')

In [70]:
Test_news2=news.iloc[[22000]]

In [71]:
Test_news2

Unnamed: 0,title,text,subject,date,True/Fake,Article,Clean Text
583,"Trump Goes Mental On Threats To North Korea, ...",Donald Trump has now taken his threats to Nort...,News,"August 11, 2017",Fake,"Trump Goes Mental On Threats To North Korea, ...","[Trump, Goes, Mental, Threats, North, Korea, D..."


In [72]:
Test_news2=Test_news2.drop("True/Fake",axis=1)

In [73]:
Test_news2

Unnamed: 0,title,text,subject,date,Article,Clean Text
583,"Trump Goes Mental On Threats To North Korea, ...",Donald Trump has now taken his threats to Nort...,News,"August 11, 2017","Trump Goes Mental On Threats To North Korea, ...","[Trump, Goes, Mental, Threats, North, Korea, D..."


In [75]:
pipeline.predict(Test_news2["Article"])

array(['Fake'], dtype='<U5')