In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Load the two source CSVs (paths are Colab-style absolute paths under /content)
data_Fake = pd.read_csv('/content/Fake.csv')
data_true = pd.read_csv('/content/True.csv')

# Label the rows: 0 = fake, 1 = true
data_Fake['class'] = 0
data_true['class'] = 1

# Set aside the last 10 rows of each file for manual testing ...
data_fake_manual_testing = data_Fake.tail(10)
data_true_manual_testing = data_true.tail(10)

# ... and remove those same rows from the frames used for training/evaluation
data_Fake = data_Fake.iloc[:-10]
data_true = data_true.iloc[:-10]

# Stack fake and true rows into one frame
data_merge = pd.concat([data_Fake, data_true], axis=0)

# Keep only the article body and the label
data = data_merge.drop(['title', 'subject', 'date'], axis=1)

# Shuffle with a fixed seed: the later train/test split is seeded too, but it
# only yields the same partition on every run if the row order it receives is
# itself reproducible.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Text cleaning function
def wordopt(text):
    """Normalize raw article text before vectorization.

    Steps, in order: lowercase, drop square-bracketed spans, drop URLs, drop
    HTML-style tags, turn every remaining non-word character into a space,
    strip leftover punctuation (only underscores survive the previous step),
    then collapse whitespace.

    URL and tag removal must run before the non-word replacement: once
    ':', '/', '.', '<' and '>' have become spaces those patterns can no
    longer match and the URL/tag fragments would remain in the text.

    Parameters
    ----------
    text : str
        Raw text. None and float NaN (missing CSV cells) are treated as empty.

    Returns
    -------
    str
        The cleaned, single-spaced, lowercase text.
    """
    # Missing values would otherwise fail on .lower()
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so they become spaces here
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalize every article body with the cleaning helper
data['text'] = data['text'].apply(wordopt)

# Features are the cleaned text; labels are the 0/1 class column
x, y = data['text'], data['class']

# Hold out 25% for evaluation; stratify=y keeps the class ratio the same in both parts
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42
)

# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary capped
# at 5000 terms. The vocabulary is fitted on the training text only and then
# reused, unchanged, for the test text.
vectorization = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# Oversample the training matrix with SMOTE (synthetic minority-class samples);
# the test matrix is left untouched.
smote = SMOTE(random_state=42)
xv_train_resampled, y_train_resampled = smote.fit_resample(xv_train, y_train)


def _fit_and_report(model, title):
    """Fit `model` on the resampled training data, print its test report, return test predictions."""
    model.fit(xv_train_resampled, y_train_resampled)
    predictions = model.predict(xv_test)
    print(title)
    print(classification_report(y_test, predictions))
    return predictions


# Logistic regression with C=0.1 (stronger regularization than the default)
LR = LogisticRegression(C=0.1, random_state=42)
pred_lr = _fit_and_report(LR, "Logistic Regression")

# Single decision tree, depth capped at 10
# NOTE(review): the recorded report for this model is ~1.00 on every metric;
# worth checking the text for label-revealing markers — TODO confirm.
DT = DecisionTreeClassifier(max_depth=10, random_state=42)
pred_dt = _fit_and_report(DT, "Decision Tree Classifier")

# Random forest of 200 trees, each depth-capped at 10
RF = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
pred_rf = _fit_and_report(RF, "Random Forest Classifier")

# Confusion matrices on the held-out test set, in the same model order
for heading, predictions in (
    ("Confusion Matrix for Logistic Regression:\n", pred_lr),
    ("Confusion Matrix for Decision Tree:\n", pred_dt),
    ("Confusion Matrix for Random Forest:\n", pred_rf),
):
    print(heading, confusion_matrix(y_test, predictions))

# Manual testing function
def output_label(n):
    """Turn a predicted class into display text: 0 is "Fake news", any other value is "Not a fake news"."""
    if n == 0:
        return "Fake news"
    return "Not a fake news"

def manual_testing(news, threshold=0.6):
    """Classify one news text with the three fitted models and print the results.

    The text is cleaned with ``wordopt`` and vectorized with the already-fitted
    ``vectorization`` before being passed to ``LR``, ``DT`` and ``RF``.

    Parameters
    ----------
    news : str
        Raw article text to classify.
    threshold : float, default 0.6
        Minimum class-1 probability from the logistic regression for the
        "adjusted" prediction to be class 1. Only the "Adjusted LR Prediction"
        line uses it; the "Final Predictions" use each model's own predict().

    Returns
    -------
    None
        All results are printed.
    """
    new_xv_test = vectorization.transform([wordopt(news)])

    # An all-zero row means no term of the input is in the fitted vocabulary
    # (empty input, or only stop words / unseen words). The models would still
    # emit a label for it, but that label would not depend on the input at all.
    if new_xv_test.nnz == 0:
        print("\nNo known terms left after cleaning; nothing to classify.")
        return None

    # Predictions from models
    pred_LR = LR.predict(new_xv_test)
    proba_LR = LR.predict_proba(new_xv_test)[:, 1]  # column 1 = probability of class 1
    pred_DT = DT.predict(new_xv_test)
    pred_RF = RF.predict(new_xv_test)

    # Logistic regression re-thresholded: class 1 only if its probability reaches `threshold`
    pred_adjusted_LR = (proba_LR >= threshold).astype(int)

    print("\nRaw Predictions:")
    print(f"LR: {pred_LR[0]} (Probability: {proba_LR[0]:.2f})")
    print(f"DT: {pred_DT[0]}")
    print(f"RF: {pred_RF[0]}")

    print("\nAdjusted LR Prediction: ", output_label(pred_adjusted_LR[0]))
    print(
        f"\nFinal Predictions:\n"
        f"LR: {output_label(pred_LR[0])}\n"
        f"DT: {output_label(pred_DT[0])}\n"
        f"RF: {output_label(pred_RF[0])}"
    )
    return None


Logistic Regression
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      5868
           1       0.97      0.98      0.98      5352

    accuracy                           0.98     11220
   macro avg       0.98      0.98      0.98     11220
weighted avg       0.98      0.98      0.98     11220

Decision Tree Classifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5868
           1       0.99      1.00      1.00      5352

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220

Random Forest Classifier
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      5868
           1       0.98      1.00      0.99      5352

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weig

In [None]:
# Prompt for an article and classify it (input() already returns a str)
news = input("Enter a news text: ")
manual_testing(news)


Enter a news text: house widens ethics probe to include farenthold campaign work

Raw Predictions:
LR: 0 (Probability: 0.45)
DT: 0
RF: 0

Adjusted LR Prediction:  Fake news

Final Predictions:
LR: Fake news
DT: Fake news
RF: Fake news


In [None]:
# Prompt for another article and classify it (input() already returns a str)
news = input("Enter a news text: ")
manual_testing(news)



Enter a news text: The U.S. House of Representatives gave final approval on Wednesday to a sweeping, debt-financed tax bill in a midday vote. It now will go to President Donald Trump to sign into law, although the timing of that was unclear. The Senate approved the bill early on Wednesday. Here are the key parts of the bill, representing the biggest overhaul of the U.S. tax code in more than 30 years. CORPORATE TAX RATE: Cuts corporate income tax rate permanently to 21 percent from 35 percent, as of Jan. 1, 2018. PASS-THROUGHS: Creates a 20 percent deduction for the first $315,000 of qualified business income for joint filers of pass-through businesses such as partnerships and sole proprietorships. For income above that threshold, the legislation phases in limits, producing an effective marginal tax rate of no more than 29.6 percent.   CORPORATE ALTERNATIVE MINIMUM TAX: Repeals the 20 percent corporate alternative minimum tax, set up to ensure profitable corporations pay at least some 