In [12]:
import os
print("Working Directory: ",os.getcwd())#checks where we are
print("Files that exist: ",os.listdir())



Working Directory:  c:\Users\jeevi\fake-news-classifier\notebooks
Files that exist:  ['preprocessing.ipynb']


In [26]:
import pandas as pd

true=pd.read_csv("../data/True.csv")
fake=pd.read_csv("../data/Fake.csv")

true['label']=0
fake['label']=1

df=pd.concat([true,fake],axis=0).sample(frac=1,random_state=42).reset_index(drop=True)
#concat stacks the dataframe vertically,df has all the news
#sample-shuffles the data 
# frac=1 means take 100% of the data, but shuffled.
# random_state=42 ensures reproducible shuffling.
#reset_index,resets the row indices after shuffling
print("df.shape=",df.shape) #returns the number of rows and columns
df.head()#shows the first 5 rows
df[0:20]



df.shape= (44898, 5)


Unnamed: 0,title,text,subject,date,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",1
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",1
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",1
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",0
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",1
5,"As private lawyer, Trump high court pick was f...",WASHINGTON (Reuters) - As a lawyer in private ...,politicsNews,"February 1, 2017",0
6,Yemeni Salafist imam killed in Aden: sources,ADEN (Reuters) - A Salafist imam was shot dead...,worldnews,"October 28, 2017",0
7,FBI says witnesses in U.S. probe into Malaysia...,KUALA LUMPUR (Reuters) - Potential witnesses t...,worldnews,"September 6, 2017",0
8,An Easy To Read Chart Shows How Bernie Sanders...,The goal of socialism is communism. -Vladimi...,politics,"Feb 24, 2016",1
9,MMA FIGHTER JAKE SHIELDS Embarrasses Cowards I...,Opposing views and beliefs has much of this co...,politics,"Feb 4, 2017",1


In [27]:
#spacy-python lib for NLP
import spacy
# loading a language model
#en_core_web_sm-small english model for text processing
#disabling-parser[syntactic parsing] and ner(named entity recognition(detecting name,location,dates etc))
nlp = spacy.load("en_core_web_sm", disable=["parser","ner"])


In [29]:
import re  #regular expressions,useful for pattern matching(like removing numbers)
import string  #provides a list of punctuation characters
from nltk.corpus import stopwords #common words like "the" "is" "and"

stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(words)

df['content'] = df['text'].apply(clean_text)
df.head()


Unnamed: 0,title,text,subject,date,label,content
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",1,donald trump white house chaos trying cover ru...
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",1,donald trump presumptive gop nominee time reme...
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",1,mike pence huge homophobe supports exgay conve...
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",0,san francisco reuters california attorney gene...
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",1,twisted reasoning comes pelosi days especially...


In [32]:
df[['content', 'label']].to_csv("../data/cleaned_news.csv", index=False)
cleaned_news=pd.read_csv("../data/cleaned_news.csv")
cleaned_news.head()


Unnamed: 0,content,label
0,donald trump white house chaos trying cover ru...,1
1,donald trump presumptive gop nominee time reme...,1
2,mike pence huge homophobe supports exgay conve...,1
3,san francisco reuters california attorney gene...,0
4,twisted reasoning comes pelosi days especially...,1


In [None]:
#Step 1: Separate Features and Labels
cleaned_news['content']=cleaned_news['content'].fillna("")
x=cleaned_news['content']
y=cleaned_news['label']

In [45]:
# Step 2:Split into train and test sets
# Step 3:Convert Text to Numbers(TF-IDF)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
vectorizer=TfidfVectorizer(max_features=10000,stop_words='english',ngram_range=(1,2))
x_train_tfidf=vectorizer.fit_transform(x_train) #numerical features for training
x_test_tfidf=vectorizer.fit_transform(x_test) #numerical features for testing

In [None]:
# #step 4: Train the model
# #logistic regression

# from sklearn.linear_model import LogisticRegression

# model=LogisticRegression(max_iter=1000)
# model.fit(x_train_tfidf,y_train)

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(x_train_tfidf, y_train)
y_pred_nb = nb_model.predict(x_test_tfidf)

from sklearn.metrics import accuracy_score, classification_report
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nReport:\n", classification_report(y_test, y_pred_nb))



Naive Bayes Accuracy: 0.7070155902004455

Report:
               precision    recall  f1-score   support

           0       0.68      0.73      0.70      4311
           1       0.73      0.69      0.71      4669

    accuracy                           0.71      8980
   macro avg       0.71      0.71      0.71      8980
weighted avg       0.71      0.71      0.71      8980



In [47]:
# Step 5:Make predictions and evaluate
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
# Predict on test data
y_pred=nb_model.predict(x_test_tfidf)

#accuracy
print("Accuracy:",accuracy_score(y_test,y_pred))

#Detailed report

print("\nClassification Report:\n",classification_report(y_test,y_pred))

#confusion matrix

print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))

Accuracy: 0.7070155902004455

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.73      0.70      4311
           1       0.73      0.69      0.71      4669

    accuracy                           0.71      8980
   macro avg       0.71      0.71      0.71      8980
weighted avg       0.71      0.71      0.71      8980


Confusion Matrix:
 [[3132 1179]
 [1452 3217]]


In [53]:
import joblib

# Save model and vectorizer
joblib.dump(nb_model, "fake_news_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")




['tfidf_vectorizer.pkl']

In [54]:
# Load back
model = joblib.load("fake_news_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")


In [55]:
# Step 7: Create a Prediction Function
import joblib

# Load saved model and vectorizer
model = joblib.load("fake_news_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

def predict_news(text):
    # Transform input text with the same vectorizer
    text_tfidf = vectorizer.transform([text])
    # Predict with the trained model
    prediction = model.predict(text_tfidf)[0]
    # Map label to meaning
    return "Fake News ❌" if prediction == 1 else "Real News ✅"


In [56]:
print(predict_news("Breaking: Scientists discover cure for cancer in 24 hours"))
print(predict_news("The government announced new tax reforms today"))


Fake News ❌
Fake News ❌


In [None]:
# step 8: Deployment
