# ✅ Fake News Detection with Pretrained Word2Vec + SVM (Robust Tokenizer)
This notebook uses a pre-trained Word2Vec model (Google News) for feature extraction and applies SVM for classification. Includes clean tokenization and proper train/validation split.

In [2]:
!pip install -q gensim scikit-learn  nltk


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [7]:
!pip uninstall -y numpy
!pip install numpy==1.23.5
!pip install gensim==4.3.0


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chex 0.1.89 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
albucore 0.0.24 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.2 requires numpy>=1.24, but you have num

Collecting gensim==4.3.0
  Downloading gensim-4.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting FuzzyTM>=0.4.0 (from gensim==4.3.0)
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim==4.3.0)
  Downloading pyFUME-0.3.4-py3-none-any.whl.metadata (9.7 kB)
[31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import files


df = pd.read_csv('train.csv')
df['text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
df = df[['text', 'label']].dropna()
df.head()


Unnamed: 0,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
1,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0
4,SATAN 2: Russia unvelis an image of its terrif...,1


In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"\@\w+|\#", '', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    text = re.sub(r"\s+", ' ', text).strip()
    return text

def tokenize(text):
    return [w for w in clean_text(text).split() if w not in stop_words]

df['tokens'] = df['text'].apply(tokenize)
df.head()


Unnamed: 0,text,label,tokens
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1,"[law, enforcement, high, alert, following, thr..."
1,Did they post their votes for Hillary already?,1,"[post, votes, hillary, already]"
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1,"[unbelievable, obamas, attorney, general, says..."
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0,"[bobby, jindal, raised, hindu, uses, story, ch..."
4,SATAN 2: Russia unvelis an image of its terrif...,1,"[satan, russia, unvelis, image, terrifying, ne..."


In [4]:
# This downloads ~1.5GB and may take time
import gensim.downloader as api
w2v_model = api.load("word2vec-google-news-300")




In [5]:
def document_vector(tokens):
    tokens = [token for token in tokens if token in w2v_model]
    if len(tokens) == 0:
        return np.zeros(w2v_model.vector_size)
    return np.mean([w2v_model[token] for token in tokens], axis=0)

X = np.array([document_vector(tokens) for tokens in df['tokens']])
y = df['label'].values


In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [7]:
clf = LinearSVC(class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)


In [8]:
print(classification_report(y_val, y_pred, target_names=['Real', 'Fake']))


              precision    recall  f1-score   support

        Real       0.87      0.88      0.87      7006
        Fake       0.88      0.88      0.88      7421

    accuracy                           0.88     14427
   macro avg       0.88      0.88      0.88     14427
weighted avg       0.88      0.88      0.88     14427

