Project 5: Fake News Detection API

Problem Statement:
Detect fake news using NLP models and deploy it as an API.

In [8]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [9]:
# Move into the sibling ``data`` directory, but only when the notebook is
# running from a directory literally named ``notebooks``. A blanket
# ``str.replace('notebooks', 'data')`` over the whole path would also rewrite
# any parent folder whose name merely contains "notebooks" and point chdir at
# a location that does not exist.
cwd = os.getcwd()
if os.path.basename(cwd) == 'notebooks':
    os.chdir(os.path.join(os.path.dirname(cwd), 'data'))

# Read relative to the working directory chosen above. On a re-run the kernel
# is already inside ``data``, so the chdir guard is simply skipped.
df = pd.read_csv('data.csv')


In [10]:
# Preview 10 randomly selected rows of the loaded data. No random_state is
# passed, so a different set of rows is shown on every run.
df.sample(10)


Unnamed: 0,Text,label
9362,U.S. judge questions states seeking to restore...,Real
4034,GOP Senator Has Had ENOUGH: Firing Mueller Co...,Fake
1489,No rehearing for ex-Illinois Governor Blagojev...,Real
6401,Pastors stand firm as Trump's U.S. evangelical...,Real
8232,Oxford Fellow GLORIOUSLY Buries Trump For Try...,Fake
2038,Trump to name Republican donor Kelly Craft as ...,Real
9200,"Illinois House enacts FY 2018 budget, ending r...",Real
2646,"Alt-Right Trump Lover Loses His Sh*t, Kills H...",Fake
7050,White House says some members of infrastructur...,Real
1799,Republicans Running Scared As Trump Threatens...,Fake


In [11]:
# Rename the columns by position: first -> 'news_text', second -> 'is_fake'.
# Assigning a list to .columns requires exactly one name per existing column.
df.columns = ['news_text', 'is_fake']


In [12]:
# Preview 10 random rows to confirm the new column names (unseeded sample, so
# the rows differ between runs).
df.sample(10)


Unnamed: 0,news_text,is_fake
7258,The Newest Clinton Email Scandal Actually PRO...,Fake
1759,House speaker urges Trump not to scrap 'Dreame...,Real
2860,"Fearing Trump's next move, liberals urge Supre...",Real
694,Exclusive: Trump to approve Keystone XL at mee...,Real
3957,Teacher Tells Black Students He’ll Have Trump...,Fake
3563,House Committee Uncovers DAMNING BOMBSHELL – ...,Fake
5660,Trump’s ICE Thugs Stop Ambulance Taking 10-Ye...,Fake
7073,Illinois lawmakers delay bill to expand aborti...,Real
1560,Treasury's Mnuchin concerned about alternate s...,Real
6390,Democrats to study every option to halt Obamac...,Real


In [13]:
# Encode the string labels as integers: 'Real' -> 0, 'Fake' -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them a second execution would turn the already-encoded labels into NaN,
# because Series.map yields NaN for any value missing from the mapping.
label_codes = {'Real': 0, 'Fake': 1, 0: 0, 1: 1}
df['is_fake'] = df['is_fake'].map(label_codes)


In [14]:
# Preview 10 random rows to check the label encoding (0 = Real, 1 = Fake);
# the sample is unseeded, so the rows differ between runs.
df.sample(10)


Unnamed: 0,news_text,is_fake
7625,Trump’s LOST HIS DAMN MIND Making RIDICULOUS ...,1
6635,Petition DEMANDING The Electoral College Take...,1
1957,BUSTED: Trump’s Secretary Of State Pick Is A ...,1
1783,White House says fully committed to draft Sena...,0
7313,U.S. lawmakers target Myanmar military with ne...,0
7060,Senate passes measure that permanently repeals...,0
9199,For women at the U.S. Congress: the right to b...,0
9735,Maxine Waters Just Threw An EPIC Twitter Bomb...,1
8928,U.S. lawmakers seek looser energy development ...,0
2368,"President Obama BLASTS ‘Insecure’ Trump, Make...",1


In [15]:
# Count missing values per column.
df.isna().sum()


news_text    0
is_fake      0
dtype: int64

In [16]:
# Drop every row that contains a missing value, modifying df in place.
df.dropna(inplace=True)


In [17]:
# (rows, columns) after removing rows with missing values.
df.shape


(9900, 2)

In [18]:
# Number of rows that repeat an earlier row exactly (all columns compared).
df.duplicated().sum()


35

In [19]:
# Remove the repeated rows in place; the first occurrence of each is kept
# (pandas default keep='first').
df.drop_duplicates(inplace=True)


In [20]:
# (rows, columns) after de-duplication.
df.shape


(9865, 2)

In [21]:
# Class balance: number of rows per label value.
df['is_fake'].value_counts()


is_fake
1    5000
0    4865
Name: count, dtype: int64

In [22]:
# Features are the raw article text; the target is the 0/1 label column.
X = df['news_text']
y = df['is_fake']


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [24]:

import re
from nltk.stem import PorterStemmer

# One shared stemmer instance, read as a module-level global by the helpers below.
stemmer = PorterStemmer()


def _stem_document(text):
    """Lower-case ``text``, pull out its word tokens and join their stems with single spaces."""
    return ' '.join(
        stemmer.stem(word) for word in re.findall(r'\b\w+\b', text.lower())
    )


def convert_text(text_series):
    """Stem every document in a collection of raw texts.

    Each document is lower-cased and split into runs of word characters
    (punctuation and whitespace are discarded); every token is then reduced
    to its Porter stem and the stems are re-joined with single spaces.

    Parameters
    ----------
    text_series : pandas.Series, iterable of str, or str
        The documents. A Series is processed as-is. Any other iterable (for
        example the plain list a serving endpoint would hand to
        ``model.predict``) is wrapped in a Series first, because lists and
        arrays have no ``.apply`` method. A bare string is treated as a
        single document.

    Returns
    -------
    pandas.Series
        One normalised string per input document. For Series input the
        original index is preserved.
    """
    if isinstance(text_series, str):
        # A lone string is one document, not a sequence of characters.
        text_series = [text_series]
    if not isinstance(text_series, pd.Series):
        text_series = pd.Series(list(text_series), dtype=object)
    return text_series.apply(_stem_document)


In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


In [28]:
# Baseline classifier: stem the text, build TF-IDF features over unigrams and
# bigrams (vocabulary capped at 8000 terms, English stop words removed), then
# fit a multinomial Naive Bayes model.
# NOTE(review): stop-word removal runs on text that has already been stemmed,
# so stemmed forms of stop words (presumably e.g. "this" -> "thi") would not
# match the English stop list - confirm whether that is intended.
baseline_steps = [
    ('preprocess', FunctionTransformer(func=convert_text, validate=False)),
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=8000, stop_words='english')),
    ('clf', MultinomialNB()),
]
model = Pipeline(steps=baseline_steps)


In [29]:
# Fit every pipeline step (stemming, TF-IDF, classifier) on the training split only.
model.fit(X_train, y_train)


In [30]:
# Mean accuracy of the fitted pipeline on the held-out test split.
model.score(X_test, y_test)


0.9751647237709072

In [35]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier


In [36]:
# Candidate classifiers, each evaluated behind the same stemming + TF-IDF
# front end. Estimators with internal randomness (tree ensembles, single
# trees, boosting) get the same fixed seed as the train/test split so the
# reported accuracies can be reproduced run to run.
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(random_state=42),
    'Support Vector Machine' : SVC(),
    'Decision Tree' : DecisionTreeClassifier(random_state=42),
    'ExtraTreeClassifier' : ExtraTreeClassifier(random_state=42),
    # NOTE(review): identical estimator to 'Support Vector Machine' above, so
    # it is trained twice; kept so the printed report keeps the same entries -
    # consider dropping one of the two.
    'SVC' : SVC(),
    'Multinomial Naive Bayes' : MultinomialNB(),
    'Bernoulli Naive Bayes' : BernoulliNB(),
    'AdaBoost' : AdaBoostClassifier(random_state=42),
    'Gradient Boosting' : GradientBoostingClassifier(random_state=42),
}


for name, current_model in models.items():
    print(f"Checking accuracy for {name}")
    # This rebinds the notebook-level `model`: once the loop finishes it
    # refers to the last pipeline fitted here, not the earlier baseline.
    model = Pipeline([
        ('preprocess', FunctionTransformer(convert_text, validate=False)),
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=8000, ngram_range=(1, 2))),
        ('clf', current_model)
    ])

    # Fit on the training split, then report mean accuracy on the test split.
    model.fit(X_train, y_train)
    print(f"Accuracy for {name} is {model.score(X_test, y_test)}")
    print("*"*50)
    print("\n")



Checking accuracy for Logistic Regression
Accuracy for Logistic Regression is 0.9923973644196655
**************************************************


Checking accuracy for Random Forest
Accuracy for Random Forest is 0.9984794728839331
**************************************************


Checking accuracy for Support Vector Machine
Accuracy for Support Vector Machine is 0.994931576279777
**************************************************


Checking accuracy for Decision Tree
Accuracy for Decision Tree is 0.9994931576279777
**************************************************


Checking accuracy for ExtraTreeClassifier
Accuracy for ExtraTreeClassifier is 0.9031931069437404
**************************************************


Checking accuracy for SVC
Accuracy for SVC is 0.994931576279777
**************************************************


Checking accuracy for Multinomial Naive Bayes
Accuracy for Multinomial Naive Bayes is 0.9751647237709072
***********************************************



Accuracy for AdaBoost is 0.9994931576279777
**************************************************


Checking accuracy for Gradient Boosting
Accuracy for Gradient Boosting is 0.9994931576279777
**************************************************


