In [1]:
import pandas as pd

In [3]:
import pandas as pd

fake_df = pd.read_csv("/content/drive/MyDrive/FakeNewsProject/Fake.csv",
    engine="python",
    on_bad_lines="skip"
)

true_df = pd.read_csv("/content/drive/MyDrive/FakeNewsProject/True.csv",
    engine="python",
    on_bad_lines="skip"
)

fake_df.head()


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year‚Äô...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama‚Äôs Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
fake_df["label"] = 0   # Fake news
true_df["label"] = 1   # Real news

In [5]:
news_df = pd.concat([fake_df, true_df], axis=0)

In [6]:
news_df = news_df.dropna()
news_df = news_df.reset_index(drop=True)
news_df = news_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
X = news_df["text"]
y = news_df["label"]

In [8]:
import nltk
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

In [11]:
X = X.apply(preprocess_text)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X_tfidf = vectorizer.fit_transform(X)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

In [14]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

nb_pred = nb_model.predict(X_test)

In [15]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)

In [16]:
import pickle

with open("lr_model.pkl", "wb") as f:
    pickle.dump(lr_model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [17]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(name, y_test, y_pred):
    print(name)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("-"*30)

evaluate_model("Naive Bayes", y_test, nb_pred)
evaluate_model("Logistic Regression", y_test, lr_pred)

Naive Bayes
Accuracy: 0.9381730647409097
Precision: 0.9180276381909548
Recall: 0.927936507936508
F1 Score: 0.9229554783706978
------------------------------
Logistic Regression
Accuracy: 0.9887241859875839
Precision: 0.9835703001579779
Recall: 0.9882539682539683
F1 Score: 0.9859065716547902
------------------------------


In [18]:
from sklearn.model_selection import cross_val_score
import numpy as np

nb_scores = cross_val_score(nb_model, X_tfidf, y, cv=10)
lr_scores = cross_val_score(lr_model, X_tfidf, y, cv=10)

print("NB Mean:", nb_scores.mean())
print("LR Mean:", lr_scores.mean())

NB Mean: 0.9437461018916251
LR Mean: 0.9898135020393759


In [19]:
from scipy.stats import ttest_rel

t_stat, p_value = ttest_rel(nb_scores, lr_scores)
print("T-statistic:", t_stat)
print("P-value:", p_value)

T-statistic: -51.37770287820809
P-value: 2.0130348385811767e-12


In [20]:
import json

results = {
    "Naive_Bayes": {
        "accuracy": accuracy_score(y_test, nb_pred),
        "f1_score": f1_score(y_test, nb_pred)
    },
    "Logistic_Regression": {
        "accuracy": accuracy_score(y_test, lr_pred),
        "f1_score": f1_score(y_test, lr_pred)
    }
}

with open("results.json", "w") as f:
    json.dump(results, f, indent=4)

In [21]:
from concurrent.futures import ThreadPoolExecutor

def predict_news(text):
    text = preprocess_text(text)
    vec = vectorizer.transform([text])
    return lr_model.predict(vec)[0]

samples = X.sample(5).tolist()

with ThreadPoolExecutor() as executor:
    predictions = list(executor.map(predict_news, samples))

predictions

[np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0)]

In [22]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.52.2-py3-none-any.whl.metadata (9.8 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.2-py3-none-any.whl (9.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.0/9.0 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.52.2


In [23]:
%%writefile app.py


import streamlit as st
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")

# Load trained model
with open("lr_model.pkl", "rb") as f:
    model = pickle.load(f)

# Load vectorizer
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

st.title("Fake News Detection Web App")
st.write("Enter a news article to classify it as Fake or Real")

user_input = st.text_input(
    "Paste News Text Here:",
    "",
    key="news_input_text"
)


if st.button("Predict"):
    if user_input.strip() == "":
        st.warning("Please enter some text")
    else:
        clean_text = preprocess_text(user_input)
        vec = vectorizer.transform([clean_text])
        prediction = model.predict(vec)[0]

        if prediction == 1:
            st.success("üü¢ This news is REAL")
        else:
            st.error("üî¥ This news is FAKE")


Writing app.py


In [24]:
#!pip install streamlit

In [25]:
#!streamlit run app.py --server.enableCORS false --server.enableXsrfProtection false & npx localtunnel --port 8501


In [26]:
!pip install gradio



In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
!python /content/drive/MyDrive/FakeNewsProject/app_gradio.py

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://eda70d4e95fa4615d0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
Created dataset file at: .gradio/flagged/dataset1.csv
Keyboard interruption in main thread... closing server.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 3043, in block_thread
    time.sleep(0.1)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/content/drive/MyDrive/FakeNewsProject/app_gradio.py", l

In [33]:
!mkdir /content/drive/MyDrive/FakeNewsProject


mkdir: cannot create directory ‚Äò/content/drive/MyDrive/FakeNewsProject‚Äô: File exists


In [34]:
!cp /content/sample_data/* /content/drive/MyDrive/FakeNewsProject/
