# 📰 Fake News Detection using Naive Bayes SVM (NBSVM)
This notebook trains an NBSVM classifier using TF-IDF transformed features from the Kaggle Fake News dataset.

In [9]:
!pip install -q scikit-learn pandas numpy nltk


In [10]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [12]:
df = pd.read_csv('/content/gdrive/MyDrive/balanced_dataset.csv')
df = df[['title', 'text', 'label']]  # Combine title + text to predict label (0=real, 1=fake)
df['text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
df = df[['text', 'label']].dropna()
df.head()


Unnamed: 0,text,label
0,Retail Payrolls Sustain a New Blow as Shopping...,1
1,5 ways to get rid of credit card debt without ...,0
2,SEA CONTAINERS <SCR> EXPECTS BETTER FIRST QTR ...,0
3,Russian Lawyer At Trump Jr.a??s Meeting Has W...,1
4,"After speaker defeat, Marjorie Taylor Greene i...",0


In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"\@\w+|\#", '', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    text = re.sub(r"\s+", ' ', text).strip()
    return text

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

df['text_clean'] = df['text'].apply(clean_text).apply(remove_stopwords)
df.head()


Unnamed: 0,text,label,text_clean
0,Retail Payrolls Sustain a New Blow as Shopping...,1,retail payrolls sustain new blow shopping habi...
1,5 ways to get rid of credit card debt without ...,0,ways get rid credit card debt without new loan...
2,SEA CONTAINERS <SCR> EXPECTS BETTER FIRST QTR ...,0,sea containers scr expects better first qtr se...
3,Russian Lawyer At Trump Jr.a??s Meeting Has W...,1,russian lawyer trump jras meeting worked russi...
4,"After speaker defeat, Marjorie Taylor Greene i...",0,speaker defeat marjorie taylor greene back sti...


In [14]:
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['text_clean'])
y = df['label'].values


In [15]:
def compute_log_count_ratio(X, y, alpha=1.0):
    pos = X[y == 1]
    neg = X[y == 0]
    p = alpha + pos.sum(axis=0)
    q = alpha + neg.sum(axis=0)
    r = np.log((p / p.sum()) / (q / q.sum()))
    return np.asarray(r).flatten()

r = compute_log_count_ratio(X, y)
X_nbsvm = X.multiply(r)


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_nbsvm, y, test_size=0.2, random_state=42)
clf = LinearSVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [17]:
print(classification_report(y_test, y_pred, target_names=['Real', 'Fake']))


              precision    recall  f1-score   support

        Real       0.96      0.91      0.93      6045
        Fake       0.90      0.95      0.93      5377

    accuracy                           0.93     11422
   macro avg       0.93      0.93      0.93     11422
weighted avg       0.93      0.93      0.93     11422

