In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [2]:
!kaggle datasets download -d jainpooja/fake-news-detection

Dataset URL: https://www.kaggle.com/datasets/jainpooja/fake-news-detection
License(s): unknown
Downloading fake-news-detection.zip to /content
 71% 29.0M/41.0M [00:00<00:00, 157MB/s] 
100% 41.0M/41.0M [00:00<00:00, 143MB/s]


In [3]:

import zipfile
zip_ref = zipfile.ZipFile('fake-news-detection.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# %reload_ext cudf.pandas
# import cudf

## Data Description:

- title: title of the article.
- news_url: URL of the article.
- source domain: web domain where article was posted.
- tweet_num: number of retweets for this article.
- real: label column, where 1 is real and 0 is fake.

In [5]:
fake_news = pd.read_csv('/content/Fake.csv')
true_news = pd.read_csv('/content/True.csv')

In [6]:
print("Shape of fake news dataset:", fake_news.shape)
print("Columns in fake news dataset:", fake_news.columns)
print("\nShape of true news dataset:", true_news.shape)
print("Columns in true news dataset:", true_news.columns)


Shape of fake news dataset: (23481, 4)
Columns in fake news dataset: Index(['title', 'text', 'subject', 'date'], dtype='object')

Shape of true news dataset: (21417, 4)
Columns in true news dataset: Index(['title', 'text', 'subject', 'date'], dtype='object')


In [7]:
true_news.sample(10)

Unnamed: 0,title,text,subject,date
20433,Pro-independence from China posters appearing ...,HONG KONG (Reuters) - Thirteen Hong Kong unive...,worldnews,"September 11, 2017"
17627,EU to launch internal Brexit transition work: ...,BRUSSELS (Reuters) - European Union leaders wi...,worldnews,"October 12, 2017"
13604,Cambodian PM leaves for China to seek more aid,PHNOM PENH (Reuters) - Cambodian Prime Ministe...,worldnews,"November 29, 2017"
2321,Senate votes to confirm slate of three CFTC co...,WASHINGTON (Reuters) - The U.S. Senate voted o...,politicsNews,"August 3, 2017"
18031,Woman charged after trying to scale Buckingham...,LONDON (Reuters) - A woman who tried to scale ...,worldnews,"October 8, 2017"
4286,Kansas Republican wins congressional seat in s...,"KANSAS CITY, Kan (Reuters) - Kansas Republican...",politicsNews,"April 12, 2017"
15127,Ireland sees 'a way to go' before agreement on...,DUBLIN/BRUSSELS (Reuters) - Ireland s foreign ...,worldnews,"November 10, 2017"
8378,Hispanic coalition asks Trump to stop 'attacks',NEW YORK (Reuters) - The National Hispanic Lea...,politicsNews,"August 25, 2016"
20646,Singapore decried for 'harassment' of anti-dea...,SINGAPORE (Reuters) - Singapore should end har...,worldnews,"September 8, 2017"
16733,Russian radio station says intruder stabs pres...,MOSCOW (Reuters) - An intruder forced his way ...,worldnews,"October 23, 2017"


In [8]:
fake_news.sample(10)

Unnamed: 0,title,text,subject,date
386,BREAKING: Someone Else Connected To Trump Is ...,"Today, more bad news for Trump broke as yet an...",News,"September 13, 2017"
14113,TRAYVON MARTIN’S MOM Goes On BLAME WHITEY Tour...,Trayvon s dad sounds more logical and Preside...,politics,"Apr 14, 2016"
5170,Former CIA Director Reveals How Putin Recruit...,Former CIA Director Michael J. Morell has writ...,News,"August 5, 2016"
10829,SOFT COUP ALERT: Shocking Percentage Of Negati...,The Harvard study below will blow your mind! I...,politics,"May 20, 2017"
16210,JUDGE NAPOLITANO: Samsung Allowed British Inte...,.@Judgenap: Samsung allowed British intelligen...,Government News,"Mar 9, 2017"
2781,#THERESISTANCE Is Working: Homeland Security ...,"Over the weekend, the Trump Administration imp...",News,"January 29, 2017"
3314,Trump Vineyard Asks Labor Department For More...,Donald Trump may have campaigned on a promise ...,News,"December 21, 2016"
1197,The Internet Can’t Stop Laughing At Sarah Huc...,White House deputy press secretary Sarah Hucka...,News,"June 10, 2017"
7627,Scooby Doo’s ‘Mystery Machine’ Involved In Cr...,File this one under: WTF?!A woman driving a va...,News,"March 8, 2016"
21707,LIB PROFESSOR AND HARVARD GRAD SAYS PEDOPHILIA...,Perhaps if one of her children or a close rela...,left-news,"May 22, 2015"


In [9]:
true_news["class"] = 1
fake_news["class"] = 0

In [10]:
true_news.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
 4   class    21417 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 836.7+ KB


In [11]:
[true_news["title"][i] for i in range(10)]

['As U.S. budget fight looms, Republicans flip their fiscal script',
 'U.S. military to accept transgender recruits on Monday: Pentagon',
 "Senior U.S. Republican senator: 'Let Mr. Mueller do his job'",
 'FBI Russia probe helped by Australian diplomat tip-off: NYT',
 "Trump wants Postal Service to charge 'much more' for Amazon shipments",
 'White House, Congress prepare for talks on spending, immigration',
 'Trump says Russia probe will be fair, but timeline unclear: NYT',
 'Factbox: Trump on Twitter (Dec 29) - Approval rating, Amazon',
 'Trump on Twitter (Dec 28) - Global Warming',
 'Alabama official to certify Senator-elect Jones today despite challenge: CNN']

In [12]:
true_news["text"][6]

'WEST PALM BEACH, Fla (Reuters) - President Donald Trump said on Thursday he believes he will be fairly treated in a special counsel investigation into Russian meddling in the U.S. presidential election, but said he did not know how long the probe would last. The federal investigation has hung over Trump’s White House since he took office almost a year ago, and some Trump allies have in recent weeks accused the team of Justice Department Special Counsel Robert Mueller of being biased against the Republican president. But in an interview with the New York Times, Trump appeared to shrug off concerns about the investigation, which was prompted by U.S. intelligence agencies’ conclusion that Russia tried to help Trump defeat Democrat Hillary Clinton by hacking and releasing embarrassing emails and disseminating propaganda. “There’s been no collusion. But I think he’s going to be fair,” Trump said in what the Times described as a 30-minute impromptu interview at his golf club in West Palm Be

In [13]:
print("Null values in true news data are \n", true_news.isnull().sum())
print("-"*50)
print("Null values in fake news data are \n", fake_news.isnull().sum())

Null values in true news data are 
 title      0
text       0
subject    0
date       0
class      0
dtype: int64
--------------------------------------------------
Null values in fake news data are 
 title      0
text       0
subject    0
date       0
class      0
dtype: int64


In [14]:
# Concatenate fake and true news dataframes
all_news = pd.concat([fake_news, true_news], axis=0, ignore_index=True)
all_news.shape

(44898, 5)

In [15]:
# Shuffle the combined dataframe
all_news = all_news.sample(frac=1).reset_index(drop=True)
all_news.head()

Unnamed: 0,title,text,subject,date,class
0,"London metro station incident caused by bomb, ...",LONDON (Reuters) - British police said a bomb ...,worldnews,"September 15, 2017",1
1,Something AMAZING Happens When You Redact Mos...,When most presidents in the past have sent a m...,News,"December 31, 2016",0
2,GOP Hilariously Tries Seeing The Future By De...,Nobody can accuse the GOP of having it all tog...,News,"October 4, 2016",0
3,One Of Trump’s Groping Victims Just Came Forw...,After a tape was released in which Donald Trum...,News,"October 8, 2016",0
4,Exclusive: Former top Brazil prosecutor says s...,BRASILIA (Reuters) - Three senior Brazilian la...,worldnews,"December 1, 2017",1


In [16]:
all_news[["title","text","class"]].sample(20)

Unnamed: 0,title,text,class
13661,Virginia Republican Goodlatte will not seek re...,WASHINGTON (Reuters) - U.S. Representative Bob...,1
42915,Top Senate Democrat Schumer: 50-50 chance Repu...,WASHINGTON (Reuters) - U.S. Senate Democratic ...,1
22602,Illinois judge to decide jurisdiction over Cru...,CHICAGO (Reuters) - An Illinois judge on Frida...,1
42959,DOCTOR MENTIONED In Hillary Email Released By ...,"54-year-old Dr. Dean Lorich, Associate Directo...",0
23594,KELLYANNE CONWAY Tells “Haters” Reason She Loo...,Kellyanne Conway responded to a New York Times...,0
15408,AWESOME! Conservative Artist Crashes Anti-Trum...,Our favorite conservative street artist Sabo c...,0
19862,Trump gains first endorsement from member of C...,WASHINGTON (Reuters) - Republican U.S. Represe...,1
39890,Trump Just Sunk Himself In Pennsylvania With ...,Trump made a visit to the Keystone State in an...,0
127,GOP Rep’s Defense Of Jeff Sessions Is Straigh...,"Donald Trump, if nothing else, is the white ma...",0
14708,Qatar emir says open to Trump-hosted talks ove...,DOHA (Reuters) - Qatar s ruler said he is read...,1


In [17]:
all_news["subject"].value_counts().head(20)

subject
politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: count, dtype: int64

In [18]:
all_news["class"].value_counts()

class
0    23481
1    21417
Name: count, dtype: int64

In [19]:
all_news = all_news.drop(['title','subject','date'], axis=1)
all_news = all_news.drop_duplicates()
all_news.sample(5)

Unnamed: 0,text,class
32685,WASHINGTON (Reuters) - U.S. President Donald T...,1
29007,BRUSSELS (Reuters) - European parliamentarians...,1
1581,WARSAW (Reuters) - Poland s new prime minister...,1
41472,WASHINGTON (Reuters) - The top Republican and ...,1
22453,WASHINGTON (Reuters) - U.S. lawmakers released...,1


In [20]:
# Calculate vocabulary size for 'text'
title_vocab = set()
all_news['text'].str.lower().str.split().apply(title_vocab.update)
title_vocab_size = len(title_vocab)
print("Title vocabulary size:", title_vocab_size)

Title vocabulary size: 362593


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')


stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [22]:
def preprocess_text(text):
  # Lowercase the text
  text = text.lower()
  # Remove special characters and digits
  text = re.sub(r'[^a-zA-Z]', ' ', text)
  # Tokenize the text
  words = word_tokenize(text)
  # Remove stop words
  words = [word for word in words if word not in stop_words]
  # Stem the words
  # words = [stemmer.stem(word) for word in words]
  # Lemmatize the words
  words = [lemmatizer.lemmatize(word) for word in words]
  # Join the words back into a string
  text = ' '.join(words)
  return text

all_news['text'] = all_news['text'].apply(preprocess_text)


In [23]:
# Calculate vocabulary size for 'text'
title_vocab = set()
all_news['text'].str.lower().str.split().apply(title_vocab.update)
title_vocab_size = len(title_vocab)
print("Title vocabulary size:", title_vocab_size)

Title vocabulary size: 107038


In [24]:
[all_news['text'][i] for i in range(1)]

['london reuters british police said bomb used explosion london metro station injured people officer described terrorist incident ass detonation improvised explosive device britain top counter terrorism officer mark rowley said friday london police supported britain mi intelligence service said']

Applying Count Vectorization

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 3))

# Process data in chunks
chunk_size = 1000  # Adjust as needed
for i in range(0, len(all_news), chunk_size):
    chunk = all_news['text'][i: i + chunk_size]
    vectorizer.fit(chunk)

# Transform the entire dataset (if needed)
X = vectorizer.transform(all_news['text'])


In [26]:
type(X)

In [27]:
# Convert sparse matrix to dataframe
X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())

# Extract target variable
y = all_news['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)

In [28]:
print("Shape of CountVectorized data frame:", X_df.shape)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of CountVectorized data frame: (38647, 10000)
Shape of X_train: (27052, 10000)
Shape of X_test: (11595, 10000)
Shape of y_train: (27052,)
Shape of y_test: (11595,)


In [29]:

def train_and_evaluate(model, X_train, X_test, y_train, y_test):
  """
  Trains a given model and evaluates its performance.

  Args:
    model: The machine learning model to train.
    X_train: Training data features.
    X_test: Testing data features.
    y_train: Training data labels.
    y_test: Testing data labels.

  Returns:
    A tuple containing the model name, accuracy, and confusion matrix.
  """

  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)
  return type(model).__name__, accuracy, conf_matrix

# List of models to evaluate
models = [
    LogisticRegression(class_weight="balanced"),
    PassiveAggressiveClassifier(),
    MultinomialNB(),
    XGBClassifier(),
    RandomForestClassifier(n_estimators=100, random_state=42)
]

# Train and evaluate each model
results = []
for model in models:
  results.append(train_and_evaluate(model, X_train, X_test, y_train, y_test))

# Sort results by accuracy in ascending order
results.sort(key=lambda x: x[1])

# Print results
for name, accuracy, conf_matrix in results:
  print(f"Model: {name}")
  print(f"Accuracy: {accuracy}")
  print(f"Confusion Matrix:\n {conf_matrix}\n")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: MultinomialNB
Accuracy: 0.9656748598533851
Confusion Matrix:
 [[5016  219]
 [ 179 6181]]

Model: PassiveAggressiveClassifier
Accuracy: 0.9885295385942217
Confusion Matrix:
 [[5172   63]
 [  70 6290]]

Model: LogisticRegression
Accuracy: 0.9956877964639931
Confusion Matrix:
 [[5201   34]
 [  16 6344]]

Model: RandomForestClassifier
Accuracy: 0.9962915049590341
Confusion Matrix:
 [[5200   35]
 [   8 6352]]

Model: XGBClassifier
Accuracy: 0.9971539456662355
Confusion Matrix:
 [[5212   23]
 [  10 6350]]



In [30]:
def preprocess_new_text(text):
  # Lowercase the text
  text = text.lower()
  # Remove special characters and digits
  text = re.sub(r'[^a-zA-Z]', ' ', text)
  # Tokenize the text
  words = word_tokenize(text)
  # Remove stop words
  words = [word for word in words if word not in stop_words]
  # Lemmatize the words
  words = [lemmatizer.lemmatize(word) for word in words]
  # Join the words back into a string
  text = ' '.join(words)
  return text

def tokenize_and_predict(text, model):
  # Preprocess the new text
  preprocessed_text = preprocess_new_text(text)
  # Transform the preprocessed text using the fitted vectorizer
  text_vectorized = vectorizer.transform([preprocessed_text])
  # Make prediction using the trained model
  prediction = model.predict(text_vectorized)
  return prediction

In [31]:
# Infrencing on new artical
new_text = input("Enter a news article: ")
trained_model = LogisticRegression() # Replace with your best performing model
trained_model.fit(X_train, y_train)
prediction = tokenize_and_predict(new_text, trained_model)

if prediction[0] == 1:
  print("The news article is predicted to be REAL.")
else:
  print("The news article is predicted to be FAKE.")


Enter a news article: The government should publish advice for its departments on engaging with young people, including on TikTok, a group of MPs has said. The culture, media and sport committee has been looking into countering disinformation online. Its call comes despite TikTok currently being banned on government devices due to data security concerns. Accurate information needs to be communicated in a "relatable" way, the MPs say. The committee says that countering misinformation is particularly important for young people, who are increasingly turning away from traditional media and towards social media for their information. It advises meeting young people "where they are" - with 15 to 24 year olds spending around an hour per day on TikTok, according to media regulator Ofcom. The report says: "The Government must have a clear strategy for communicating with young people and adapting to the development of new apps and platforms which appeal to this audience." Some MPs do still use T

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:

# Train and evaluate each model, storing them in a dictionary
trained_models = {}
for model in models:
  name, accuracy, conf_matrix = train_and_evaluate(model, X_train, X_test, y_train, y_test)
  trained_models[name] = model

# Pickle each model with its name
for name, model in trained_models.items():
  filename = f"{name}.pkl"
  with open(filename, 'wb') as file:
    pickle.dump(model, file)
  print(f"Model {name} saved to {filename}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model LogisticRegression saved to LogisticRegression.pkl
Model PassiveAggressiveClassifier saved to PassiveAggressiveClassifier.pkl
Model MultinomialNB saved to MultinomialNB.pkl
Model XGBClassifier saved to XGBClassifier.pkl
Model RandomForestClassifier saved to RandomForestClassifier.pkl


In [4]:
import pickle


In [8]:
# Load the saved model and vectorizer
with open('/content/LogisticRegression.pkl', 'rb') as f:
  loaded_model = pickle.load(f)

with open('/content/count_vectorizer.pkl', 'rb') as f:
  loaded_vectorizer = pickle.load(f)

# Infrencing on new article
new_text = input("Enter a news article: ")

def preprocess_new_text(text):
  # Lowercase the text
  text = text.lower()
  # Remove special characters and digits
  text = re.sub(r'[^a-zA-Z]', ' ', text)
  # Tokenize the text
  words = word_tokenize(text)
  # Remove stop words
  words = [word for word in words if word not in stop_words]
  # Lemmatize the words
  words = [lemmatizer.lemmatize(word) for word in words]
  # Join the words back into a string
  text = ' '.join(words)
  return text

def tokenize_and_predict(text, model, vectorizer):
  # Preprocess the new text
  preprocessed_text = preprocess_new_text(text)
  # Transform the preprocessed text using the loaded vectorizer
  text_vectorized = vectorizer.transform([preprocessed_text])
  # Make prediction using the loaded model
  prediction = model.predict(text_vectorized)
  return prediction

prediction = tokenize_and_predict(new_text, loaded_model, loaded_vectorizer)

if prediction[0] == 1:
  print("The news article is predicted to be REAL.")
else:
  print("The news article is predicted to be FAKE.")


Enter a news article: NATO allies on Tuesday welcomed President Donald Trump s decision to commit more forces to Afghanistan, as part of a new U.S. strategy he said would require more troops and funding from America s partners. Having run for the White House last year on a pledge to withdraw swiftly from Afghanistan, Trump reversed course on Monday and promised a stepped-up military campaign against  Taliban insurgents, saying:  Our troops will fight to win .  U.S. officials said he had signed off on plans to send about 4,000 more U.S. troops to add to the roughly 8,400 now deployed in Afghanistan. But his speech did not define benchmarks for successfully ending the war that began with the U.S.-led invasion of Afghanistan in 2001, and which he acknowledged had required an   extraordinary sacrifice of blood and treasure .  We will ask our NATO allies and global partners to support our new strategy, with additional troops and funding increases in line with our own. We are confident they 

In [None]:
# prompt: just give me updated code for streamlit part where i can select diferent model through selection bar, celalring input feilds after each prediction

import streamlit as st
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Load the saved models and vectorizer
model_paths = {
    "Logistic Regression": "/content/LogisticRegression.pkl",
    "Passive Aggressive Classifier": "/content/PassiveAggressiveClassifier.pkl",
    "Multinomial Naive Bayes": "/content/MultinomialNB.pkl",
    "XGBoost Classifier": "/content/XGBClassifier.pkl",
    "Random Forest Classifier": "/content/RandomForestClassifier.pkl"
}

loaded_models = {}
for name, path in model_paths.items():
    with open(path, 'rb') as f:
        loaded_models[name] = pickle.load(f)

with open('/content/count_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

# Download NLTK resources if necessary
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_new_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(words)
    return text

def tokenize_and_predict(text, model, vectorizer):
    preprocessed_text = preprocess_new_text(text)
    text_vectorized = vectorizer.transform([preprocessed_text])
    prediction = model.predict(text_vectorized)
    return prediction

# Streamlit app
st.title("Fake News Detection")

# Model selection
selected_model_name = st.selectbox("Select Model", list(loaded_models.keys()))
selected_model = loaded_models[selected_model_name]

# Text input
text_input = st.text_area("Enter a news article:")

# Prediction button
if st.button("Predict"):
    if text_input:
        prediction = tokenize_and_predict(text_input, selected_model, loaded_vectorizer)
        if prediction[0] == 1:
            st.success("The news article is predicted to be REAL.")
        else:
            st.error("The news article is predicted to be FAKE.")
        # Clear the text input field
        st.session_state['text_input'] = ''
    else:
        st.warning("Please enter some text.")

# Initialize session state for text input if not already
if 'text_input' not in st.session_state:
    st.session_state['text_input'] = ''

# Set the value of the text area based on session state
text_input = st.text_area("Enter a news article:", value=st.session_state['text_input'])


In [1]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-

In [5]:
%%writefile app.py

!pip install streamlit
import streamlit as st
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Load the saved model and vectorizer
with open('/content/XGBClassifier.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

with open('/content/count_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_new_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(words)
    return text

# Prediction function
def tokenize_and_predict(text, model, vectorizer):
    preprocessed_text = preprocess_new_text(text)
    text_vectorized = vectorizer.transform([preprocessed_text])
    prediction = model.predict(text_vectorized)
    return prediction

# Streamlit app
st.title("Fake News Detection")
st.write("Enter a news article below to check if it's real or fake.")

user_input = st.text_area("Enter news article here:")
if st.button("Predict"):
    if user_input:
        prediction = tokenize_and_predict(user_input, loaded_model, loaded_vectorizer)
        if prediction[0] == 1:
            st.success("The news article is predicted to be REAL.")
        else:
            st.error("The news article is predicted to be FAKE.")
    else:
        st.warning("Please enter a news article.")


Writing app.py


In [None]:
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.199.158.163:8501[0m
[0m
[K[?25hnpx: installed 22 in 3.57s
your url is: https://proud-chairs-travel.loca.lt


Applying TF-IDF Vectorization

In [25]:
# Vectorize 'text' columns
vectorizer_text = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

# Fit the vectorizer on the entire dataset first
vectorizer_text.fit(all_news['text'])

# Process data in chunks
chunk_size = 10000  # Adjust chunk size as needed
all_news_processed = pd.DataFrame()

In [26]:
for i in range(0, all_news.shape[0], chunk_size):
    chunk = all_news.iloc[i:i + chunk_size]

    # Transform 'text' column in the chunk (no fitting here)
    title_vectors_chunk = vectorizer_text.transform(chunk['text'])
    title_df_chunk = pd.DataFrame(title_vectors_chunk.toarray(),
                                  columns=vectorizer_text.get_feature_names_out())

    # Merge vectorized data back to the chunk
    chunk_processed = pd.concat([chunk.reset_index(drop=True), title_df_chunk], axis=1)

    # Append processed chunk to the final dataframe
    all_news_processed = pd.concat([all_news_processed, chunk_processed], axis=0, ignore_index=True)



In [27]:
all_news_processed.shape

(38647, 10002)

In [None]:
# prompt: train a logistic regression

# Extract features and target variable
X = all_news_processed.drop(['class'], axis=1)
y = all_news_processed['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Logistic Regression Model:")
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)


In [None]:
# prompt: loops through all models and train them, print out acurracies and conf matrix similarly as above

# Extract features and target variable
X_tfidf = all_news_processed.drop(['class'], axis=1)
y_tfidf = all_news_processed['class']

# Split the data into training and testing sets
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf, y_tfidf, test_size=0.3, random_state=42
)

# List of models to evaluate
models = [
    LogisticRegression(class_weight="balanced"),
    PassiveAggressiveClassifier(),
    MultinomialNB(),
    SVC(),
    XGBClassifier(),
    RandomForestClassifier(n_estimators=100, random_state=42)
]

# Train and evaluate each model
results_tfidf = []
for model in models:
    results_tfidf.append(train_and_evaluate(model, X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf))

# Sort results by accuracy in ascending order
results_tfidf.sort(key=lambda x: x[1])

# Print results
for name, accuracy, conf_matrix in results_tfidf:
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n {conf_matrix}\n")
