In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [2]:
data_fake = pd.read_csv('dataset/Fake.csv')
data_true = pd.read_csv('dataset/True.csv')

In [6]:
data_fake['class'] = 0
data_true['class'] = 1

In [7]:
data_fake.shape, data_true.shape

((23481, 5), (21417, 5))

In [8]:
data_fake_manual_test = data_fake.tail(10)
for i in range(23480, 23470, -1):
    data_fake.drop([i], axis = 0, inplace = True)

data_true_manual_test = data_true.tail(10)
for i in range(21416, 21406, -1):
    data_true.drop([i], axis = 0, inplace= True)

In [9]:
data_fake.shape, data_true.shape

((23471, 5), (21407, 5))

In [10]:
data_fake_manual_test['class'] = 0
data_true_manual_test['class'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fake_manual_test['class'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_true_manual_test['class'] = 1


In [13]:
data_merge = pd.concat([data_fake, data_true], axis = 0)

In [15]:
data_merge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [16]:
data = data_merge.drop(['title', 'subject', 'date'], axis = 1)

In [17]:
data.isnull().sum()

text     0
class    0
dtype: int64

In [18]:
data = data.sample(frac = 1)

In [19]:
#data.head()

In [20]:
data.reset_index(inplace = True)
data.drop(['index'], axis = 1, inplace = True)

In [21]:
data.columns

Index(['text', 'class'], dtype='object')

In [22]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [23]:
data['text'] = data['text'].apply(wordopt)

In [24]:
x = data['text']
y = data['class']

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [27]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(xv_train, y_train)

In [28]:
pred_lr = LR.predict(xv_test)

In [29]:
LR.score(xv_test, y_test)

0.9879679144385026

In [30]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5804
           1       0.99      0.99      0.99      5416

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [31]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

In [32]:
pred_dt = DT.predict(xv_test)

In [35]:
DT.score(xv_test, y_test)

0.9952762923351158

In [37]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5804
           1       1.00      0.99      1.00      5416

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [39]:
from sklearn.ensemble import GradientBoostingClassifier

GB = GradientBoostingClassifier(random_state = 0)
GB.fit(xv_train, y_train)

In [40]:
pred_gb = GB.predict(xv_test)

In [43]:
GB.score(xv_test, y_test)

0.9959001782531194

In [45]:
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5804
           1       0.99      1.00      1.00      5416

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [47]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(random_state = 0)
RF.fit(xv_train, y_train)

In [48]:
pred_rf = RF.predict(xv_test)

In [49]:
RF.score(xv_test, y_test)

0.9903743315508021

In [53]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5804
           1       0.99      0.99      0.99      5416

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [73]:
def output_lable(n):
    if n==0:
        return "Fake news"

    elif n==1:
        return "Not a Fake news"


def manual_testing(news):
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt)
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GB = GB.predict(new_xv_test)
    pred_RF = RF.predict(new_xv_test)

    return print("\n\nLR prediction: {} \nDT prediction: {} \nGBC prediction: {} \nRF prediction: {}".format(output_lable(pred_LR[0]) ,output_lable(pred_DT[0]), output_lable(pred_GB[0]), output_lable(pred_RF[0])))
    

In [65]:
news = str(input())
manual_testing(news)

 hello




LR prediction: Fake news 
DT prediction: Fake news 
GBC prediction: Fake news 
RF prediction: Fake news
