In [1]:
import os
import re
import string
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Directory that holds Fake.csv and True.csv. Set the FAKE_NEWS_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# location is used, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get("FAKE_NEWS_DATA_DIR", "C:/Users/Harsakshi Bhanushali/Downloads"))

data_fake = pd.read_csv(DATA_DIR / "Fake.csv")
data_true = pd.read_csv(DATA_DIR / "True.csv")

In [3]:
# Binary target column: rows loaded from Fake.csv get 0, rows from True.csv get 1.
FAKE_LABEL, TRUE_LABEL = 0, 1
data_fake["class"] = FAKE_LABEL
data_true["class"] = TRUE_LABEL

In [4]:
# Set the last 10 fake rows aside for manual spot checks and keep the rest.
# Both slices are taken from the same frame before either name is rebound.
data_fake_manual_testing, data_fake = data_fake.iloc[-10:], data_fake.iloc[:-10]

In [5]:
# Same hold-out for the true articles: the final 10 rows are reserved, the
# remainder stays in data_true.
data_true_manual_testing, data_true = data_true.iloc[-10:], data_true.iloc[:-10]

In [6]:
# Stack the fake rows on top of the true rows in a single frame.
data_merge = pd.concat(objs=[data_fake, data_true], axis="index")

In [7]:
# Fail fast if the label column is absent from the merged frame.
label_present = 'class' in data_merge.columns
if not label_present:
    raise KeyError("'class' column is missing in the merged dataset.")

In [8]:
# Drop the columns that are not used by any later cell.
data = data_merge.drop(columns=['title', 'subject', 'date'])

# Shuffle the rows. The fixed seed makes the row order (and therefore the
# train/test split and every report below) reproducible on a fresh
# Restart & Run All; without it each run evaluated on a different split.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)



In [9]:
def wordopt(text):
    r"""Normalize raw article text before it is vectorized.

    Steps, in order: lowercase; remove square-bracketed spans; remove URLs;
    remove HTML-like tags; replace every remaining non-word character with a
    space; remove underscores; remove any token that contains a digit.

    URLs and tags are stripped *before* the ``\W`` pass. Running them
    afterwards could never match, because ``://``, ``.``, ``<`` and ``>`` had
    already been turned into spaces, which left fragments such as ``https``
    and ``com`` in the cleaned text.

    Args:
        text: The article text as a ``str``.

    Returns:
        The cleaned, lowercased text. Whitespace is not collapsed, so runs of
        spaces may remain where characters were removed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # These two need the original punctuation intact, so they run first.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Turns punctuation, whitespace and newlines into spaces, so no separate
    # newline pass is needed afterwards.
    text = re.sub(r'\W', ' ', text)
    # After the \W pass the only punctuation that can remain is '_', because
    # \w counts it as a word character.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [10]:
# Run every article body through wordopt and store the cleaned text in place.
data['text'] = data['text'].map(wordopt)

In [11]:
# Features are the cleaned article text; the target is the 0/1 class label.
x, y = data['text'], data['class']

In [12]:
# Hold out 25% of the rows for evaluation; the seed fixes the split for a
# given row order.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.25,
)

In [13]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(raw_documents=x_train)
xv_test = vectorization.transform(raw_documents=x_test)

In [14]:
# fit() returns the fitted estimator, so construction and training are chained.
LR = LogisticRegression().fit(xv_train, y_train)
pred_lr = LR.predict(xv_test)

In [15]:
# Heading and report are printed on separate lines via sep.
print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, pred_lr),
    sep="\n",
)

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5885
           1       0.98      0.99      0.98      5335

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [16]:
# Seed the tree so repeated runs build the same model. scikit-learn documents
# that candidate features are randomly permuted at each split, so an unseeded
# tree (and its report) can differ from one run to the next.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(xv_train, y_train)
pred_dt = DT.predict(xv_test)

In [17]:
# Heading and report are printed on separate lines via sep.
print(
    "Decision Tree Classification Report:",
    classification_report(y_test, pred_dt),
    sep="\n",
)

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5885
           1       0.99      0.99      0.99      5335

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [18]:
# Majority-vote ensemble over the logistic regression and the decision tree.
# NOTE(review): with only two hard voters every disagreement is a tie, and
# scikit-learn documents that ties resolve to the lowest class label. Confirm
# that favouring class 0 on disagreement is intended.
voters = [('lr', LR), ('dt', DT)]
ensemble_model = VotingClassifier(estimators=voters, voting='hard').fit(xv_train, y_train)
pred_ensemble = ensemble_model.predict(xv_test)


In [19]:
# Heading and report are printed on separate lines via sep.
print(
    "Ensemble Model Classification Report:",
    classification_report(y_test, pred_ensemble),
    sep="\n",
)

Ensemble Model Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5885
           1       1.00      0.98      0.99      5335

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [23]:
def output_label(n):
    """Translate a numeric class prediction into a display string.

    Args:
        n: Predicted class value; compared against 0 and 1 with ``==``.

    Returns:
        "Fake news" for 0, "Not a fake news" for 1, and None for any other value.
    """
    for code, label in ((0, "Fake news"), (1, "Not a fake news")):
        if n == code:
            return label
    return None

def manual_testing(news):
    """Classify one piece of article text with all three fitted models.

    The text is cleaned with ``wordopt``, transformed with the already-fitted
    ``vectorization`` object, and passed to ``LR``, ``DT`` and
    ``ensemble_model`` in that order.

    Args:
        news: Raw article text.

    Returns:
        A dict with the keys "LR prediction", "DT prediction" and
        "Ensemble prediction", each mapped to the string that ``output_label``
        returns for that model's prediction.
    """
    cleaned = pd.DataFrame({"text": [news]})["text"].apply(wordopt)
    features = vectorization.transform(cleaned)
    named_models = (
        ("LR prediction", LR),
        ("DT prediction", DT),
        ("Ensemble prediction", ensemble_model),
    )
    return {
        title: output_label(model.predict(features)[0])
        for title, model in named_models
    }

# Read one article from the user and print each model's verdict on it.
# input() already returns a str, so no conversion is needed.
news = input("Enter your news article text: ")
print(manual_testing(news))

Enter your news article text:  One example has been shared more than 40,000 times. It appears to come from a Facebook user in Durham, who wrote that he heard "first hand that a doctor who had Corona virus recovered in double quick time" after inhaling steam. A similar message has taken different forms - coming from someone's "sister in London", "Queens NY" and "a sister from Pakistan". There's no evidence that steam inhalation works as a treatment for coronavirus.


{'LR prediction': 'Fake news', 'DT prediction': 'Fake news', 'Ensemble prediction': 'Fake news'}
