## Fake News Detection

This project aims to classify fake and genuine news

### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

### Get Data 

In [2]:
# !wget "https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset"

In [3]:
path = "/home/liben/Liben/Desktop/5th year 1st semester/big data modelling/final_project/archive/"
true_df = pd.read_csv(path + 'True.csv')
fake_df = pd.read_csv(path + 'Fake.csv')

### Data Exploration

In [4]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
true_df['label'] = 0

In [6]:
fake_df['label'] = 1

In [7]:
true_df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [8]:
fake_df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [9]:
true_df = true_df[['text','label']]
fake_df = fake_df[['text','label']]

In [10]:
dataset = pd.concat([true_df , fake_df]) # concatnates the true and false dataframes data

In [11]:
dataset.shape

(44898, 2)

### Data Preprocessing

In [12]:
dataset.isnull().sum() # no null values

text     0
label    0
dtype: int64

Balanced or Unbalanced dataset

In [13]:
dataset['label'].value_counts()

1    23481
0    21417
Name: label, dtype: int64

In [14]:
true_df.shape # true news

(21417, 2)

In [15]:
fake_df.shape # fake news

(23481, 2)

Shuffle or Resample

In [16]:
dataset = dataset.sample(frac = 1) # shuffle datatest

In [17]:
dataset.head()

Unnamed: 0,text,label
3345,The National Fraternal Order of Police knows e...,1
9466,WASHINGTON (Reuters) - Presumptive Republican ...,0
23240,"Brasscheck TVNearly 30 years ago, two criminal...",1
6004,WASHINGTON (Reuters) - The U.S. State Departme...,0
22003,21st Century Wire says After 12 months of perp...,1


In [18]:
dataset.reset_index(inplace=True)
dataset.drop(["index"],axis = 1,inplace = True)

In [19]:
dataset.head()

Unnamed: 0,text,label
0,The National Fraternal Order of Police knows e...,1
1,WASHINGTON (Reuters) - Presumptive Republican ...,0
2,"Brasscheck TVNearly 30 years ago, two criminal...",1
3,WASHINGTON (Reuters) - The U.S. State Departme...,0
4,21st Century Wire says After 12 months of perp...,1


In [20]:
lemmatizer = WordNetLemmatizer() # lemmatizier changes worlds to their root word studing,studied,study -> study
stopwords = stopwords.words('english') # removes stopwords a, an, the etc

def clean_data(text):
    text = text.lower() 
    row = re.sub('[^a-zA-Z]' , ' ' , text)
    token = row.split() 
    news = [lemmatizer.lemmatize(word) for word in token if not word in stopwords]  
    clean_news = ' '.join(news) 
    
    return clean_news 

In [21]:
dataset["text"] = dataset["text"].apply(clean_data)

### Model Training and Evaluation

Splitting Training and Testing

In [22]:
x_train, x_test, y_train, y_test = train_test_split(dataset["text"], dataset["label"], test_size=0.25)

In [23]:
vectorization = TfidfVectorizer() # transforms the text to vector(meaningful representation)
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

#### Logistic Regression

In [24]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

#### Logistic Evaluation

In [25]:
pred_lr=LR.predict(xv_test)

In [26]:
LR.score(xv_test, y_test)

0.9880623608017818

In [27]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5373
           1       0.99      0.99      0.99      5852

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



#### Decision Tree Classification

In [28]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

#### Decision Tree Classification Evaluation

In [29]:
pred_dt = DT.predict(xv_test)

In [30]:
DT.score(xv_test, y_test)

0.995456570155902

In [31]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5373
           1       1.00      1.00      1.00      5852

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



#### Gradient Boosting Classifier

In [32]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

#### Gradient Boosting Classifier Evaluation

In [33]:
pred_gbc = GBC.predict(xv_test)

In [34]:
GBC.score(xv_test, y_test)

0.9953674832962138

In [35]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5373
           1       1.00      0.99      1.00      5852

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



#### Random Forest Classifier

In [36]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

#### Random Forest Classifier Evaluation

In [37]:
pred_rfc = RFC.predict(xv_test)

In [38]:
RFC.score(xv_test, y_test)

0.9924276169265034

In [39]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5373
           1       1.00      0.99      0.99      5852

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [40]:
import pickle
pickle.dump( GBC, open( "model.pkl", "wb" ) )
pickle.dump( GBC, open( "model.bin", "wb" ) )

pickle.dump( vectorization, open("vectorizer.pkl","wb"))