In [1]:
from google.colab import drive

# Mount Google Drive; the dataset paths used below live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os

# Locations of the two dataset CSVs on the mounted Drive (edit if your layout differs).
# NOTE(review): the folder name ends with a space ("...Dataset /") — confirm that this
# matches the real Drive folder; a mismatch would make both checks below print False.
FAKE_PATH = '/content/drive/MyDrive/Minor Project 7th Sem Dataset /Fake.csv'   # <-- change if your path differs
TRUE_PATH = '/content/drive/MyDrive/Minor Project 7th Sem Dataset /True.csv'   # <-- change if your path differs

# Quick sanity check that both files are reachable before any heavy cell runs.
for tag, csv_path in (("Fake", FAKE_PATH), ("True", TRUE_PATH)):
    print(f"{tag} exists:", os.path.exists(csv_path))

Fake exists: False
True exists: False


In [3]:
# Cell 2 — Install PyG (PyTorch Geometric) and other libs (run once)
# These installs are a bit heavy but required for GNNs. If you prefer not to install, skip GNN cells.
!pip install -q torch==2.2.2+cu121 torchvision==0.15.2+cu121 torchaudio==2.2.2 --extra-index-url https://download.pytorch.org/whl/cu121
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install -q torch-geometric
!pip install -q scikit-learn nltk umap-learn
# PyG might already be available sometimes; installs can take a couple minutes.


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m757.2/757.2 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Ignored the following yanked versions: 0.1.6, 0.1.7, 0.1.8, 0.1.9, 0.2.0, 0.2.1, 0.2.2, 0.2.2.post2, 0.2.2.post3[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement torchvision==0.15.2+cu121 (from versions: 0.1.6, 0.2.0, 0.17.0, 0.17.0+cu121, 0.17.1, 0.17.1+cu121, 0.17.2, 0.17.2+cu121, 0.18.0, 0.18.0+cu121, 0.18.1, 0.18.1+cu121, 0.19.0, 0.19.0+cu121, 0.19.1, 0.19.1+cu121, 0.20.0, 0.20.0+cu121, 0.20.1, 0.20.1+cu121, 0.21.0, 0.22.0, 0.22.1, 0.23.0)[0m[31m
[0m[31mERROR: No matching distribution found for torchvision==0.15.2+cu121[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# Cell 3 — Imports & lightweight NLP setup
import os, re, time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.kernel_approximation import RBFSampler   # Random Fourier Features (RFF)
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, global_mean_pool

import nltk
# Resources needed by the preprocessing cell: tokenizer models, stopword list, WordNet.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt', so both
# are requested; nltk.download reports (rather than raises on) a resource it cannot fetch.
for _nltk_pkg in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_pkg)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Built once at module level so the per-document preprocessing does not rebuild them.
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Set device: use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Device: cpu


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
# Cell 4 — Load both CSVs, label them, merge and shuffle
# Label convention used by the rest of the notebook: 1 = fake, 0 = real.
df_fake = pd.read_csv(FAKE_PATH).assign(label=1)
df_true = pd.read_csv(TRUE_PATH).assign(label=0)

df = pd.concat([df_fake, df_true], ignore_index=True)
print("Total articles:", len(df))

# Deterministic shuffle so the two classes are interleaved.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Single text field per article: title followed by body, with missing parts treated as empty.
title_part = df['title'].fillna('')
body_part = df['text'].fillna('')
df['content'] = (title_part + ' ' + body_part).astype(str)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Minor Project 7th Sem Dataset /Fake.csv'

In [None]:
# Cell 5 — Lightweight preprocess (fast)
import re
def preprocess_text_simple(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case; replace URLs and e-mail-like tokens with a
    space; replace every character outside a-z/whitespace with a space;
    tokenise; drop stopwords and tokens of length <= 2; lemmatise the rest.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    lowered = str(text).lower()
    without_links = re.sub(r'http\S+|www\S+|https\S+|\S+@\S+', ' ', lowered)
    letters_only = re.sub(r'[^a-z\s]', ' ', without_links)

    kept = []
    for token in word_tokenize(letters_only):
        if len(token) <= 2 or token in STOPWORDS:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Cleans every article by default; for a very large dataset, run this on a subset to save time.
df['clean'] = df['content'].apply(preprocess_text_simple)


In [None]:
# Cell 6 — TF-IDF features, then SVD compression into small dense node feature vectors
tfidf_params = dict(
    ngram_range=(1, 2),    # unigrams and bigrams
    max_features=30000,
    min_df=3,
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_params)
X_tfidf = tfidf.fit_transform(df['clean'])
print("TF-IDF shape:", X_tfidf.shape)

# Low-dimensional dense embedding keeps GNN memory use down (64-128 dims are usually OK).
svd = TruncatedSVD(n_components=128, random_state=42)
X_embed = svd.fit_transform(X_tfidf)
print("Compressed features shape:", X_embed.shape)


In [None]:
# Cell 7 — Build k-NN graph among documents (sparse edges) using cosine distance on the embeddings.
k = 8   # neighbors per node — keep small for memory
nbrs = NearestNeighbors(n_neighbors=k+1, metric='cosine', n_jobs=2).fit(X_embed)
distances, indices = nbrs.kneighbors(X_embed)

# Build edge_index for PyG (undirected graph stored as two directed edges per pair).
N = X_embed.shape[0]
node_ids = np.arange(N)

# Remove each node's own index by value, not by position: with duplicate or tied
# documents the query point is not guaranteed to be the first neighbor returned.
not_self = indices != node_ids[:, None]
# Rows in which self was not returned hold k+1 real neighbors; drop the last
# (farthest) one so that every node contributes exactly k neighbors.
not_self[not_self.all(axis=1), -1] = False

src = np.repeat(node_ids, k)
dst = indices[not_self]          # row-major flatten, aligned with `src`

# Add both directions, then collapse duplicates: a mutual-neighbor pair would
# otherwise appear twice and be double-counted during message passing.
edges = np.concatenate([np.stack([src, dst]), np.stack([dst, src])], axis=1)
edges = np.unique(edges, axis=1)

edge_index = torch.tensor(edges, dtype=torch.long).contiguous()
print("Number of edges:", edge_index.size(1))


In [None]:
# Cell 8 — Wrap features, labels and edges in a PyG Data object for node classification
x = torch.tensor(X_embed, dtype=torch.float)                 # node features (one row per article)
y = torch.tensor(df['label'].to_numpy(), dtype=torch.long)   # node labels
data = Data(x=x, edge_index=edge_index, y=y)
data = data.to(device)


In [None]:
# Cell 9 — Stratified train / val / test node masks for node classification
labels_np = df['label'].values

# 80/20 train-test split, then 15% of the training part is held out for validation.
train_idx, test_idx = train_test_split(
    np.arange(N), test_size=0.2, stratify=labels_np, random_state=42
)
train_idx, val_idx = train_test_split(
    train_idx, test_size=0.15, stratify=labels_np[train_idx], random_state=42
)


def _indices_to_mask(idx, size):
    """Return a boolean tensor of length `size` that is True exactly at `idx`."""
    mask = torch.zeros(size, dtype=torch.bool)
    mask[idx] = True
    return mask


train_mask = _indices_to_mask(train_idx, N)
val_mask = _indices_to_mask(val_idx, N)
test_mask = _indices_to_mask(test_idx, N)

# Attach the masks to the graph on the same device as its tensors.
data.train_mask = train_mask.to(device)
data.val_mask = val_mask.to(device)
data.test_mask = test_mask.to(device)
print(*(m.sum().item() for m in (train_mask, val_mask, test_mask)))


In [None]:
# Cell 10 — Define a lightweight GCN model (baseline)
class SimpleGCN(nn.Module):
    """Two GCNConv layers followed by a linear classification head.

    Args:
        in_dim: Size of each input node feature vector.
        hidden_dim: Width of both graph convolution layers.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after the first convolution.
    """

    def __init__(self, in_dim, hidden_dim=128, num_classes=2, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.lin = nn.Linear(hidden_dim, num_classes)
        self.dropout = dropout

    def forward(self, x, edge_index, batch=None):
        """Return ``(logits, hidden)``.

        With ``batch=None`` both outputs are per node (node classification).
        When ``batch`` is given, the hidden states are mean-pooled per graph
        before the linear head, so both outputs are per graph.
        """
        hidden = self.conv1(x, edge_index).relu()
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        hidden = self.conv2(hidden, edge_index).relu()
        if batch is not None:
            # graph-level readout (not used by the node-classification cells)
            hidden = global_mean_pool(hidden, batch)
        return self.lin(hidden), hidden

# utility train / eval
def train_epoch(model, data, optimizer, criterion):
    """Run one full-batch optimisation step on the training nodes.

    The model is put in training mode, evaluated on the whole graph, and the
    loss is computed only on the rows selected by ``data.train_mask``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    logits, _hidden = model(data.x, data.edge_index)
    selected = data.train_mask
    step_loss = criterion(logits[selected], data.y[selected].long())
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def eval_model(model, data, mask):
    """Predict classes for the nodes selected by ``mask`` without tracking gradients.

    The model is switched to eval mode and run on the whole graph; only the
    masked rows are kept.

    Returns:
        ``(preds, labels)`` as NumPy arrays: the argmax class per selected
        node and the matching entries of ``data.y``.
    """
    model.eval()
    with torch.no_grad():
        logits, _hidden = model(data.x, data.edge_index)
    predicted = logits[mask].argmax(dim=1)
    return predicted.cpu().numpy(), data.y[mask].cpu().numpy()


In [None]:
# Cell 11 — Train baseline GCN quickly (early stopping on validation F1)
in_dim = data.num_node_features
gcn = SimpleGCN(in_dim, hidden_dim=128, num_classes=2, dropout=0.5).to(device)
optimizer = optim.Adam(gcn.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

best_val_f1 = 0.0
patience = 10      # epochs without validation improvement before stopping
cur_pat = 0
best_state = None

for epoch in range(1, 101):   # small number of epochs
    loss = train_epoch(gcn, data, optimizer, criterion)
    val_preds, val_labels = eval_model(gcn, data, data.val_mask)
    val_f1 = f1_score(val_labels, val_preds)
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        # state_dict() returns references to the live parameter tensors, which the
        # optimizer keeps updating in place. Clone them so the snapshot really holds
        # the best-epoch weights instead of silently tracking the latest ones.
        best_state = {name: tensor.detach().clone() for name, tensor in gcn.state_dict().items()}
        cur_pat = 0
    else:
        cur_pat += 1
    if epoch % 10 == 0 or epoch==1:
        print(f"Epoch {epoch} loss {loss:.4f} val_f1 {val_f1:.4f}")
    if cur_pat >= patience:
        print("Early stopping.")
        break

# Restore the best checkpoint (stays None if validation F1 never rose above 0).
if best_state is not None:
    gcn.load_state_dict(best_state)


In [None]:
# Cell 12 — Evaluate GCN baseline on the held-out test nodes
gcn.eval()
test_preds, test_labels = eval_model(gcn, data, data.test_mask)

# Headline metrics in a fixed order: accuracy, precision, recall, F1.
gcn_scores = [
    metric(test_labels, test_preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]
print("GCN Test — acc {:.4f} precision {:.4f} recall {:.4f} f1 {:.4f}".format(*gcn_scores))

# Class 0 is reported as 'Real' and class 1 as 'Fake'.
print(classification_report(test_labels, test_preds, target_names=['Real','Fake']))
cm = confusion_matrix(test_labels, test_preds)
print("Confusion matrix:\n", cm)


In [None]:
# Section 13 — Stable-GNN Training

from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import f1_score
import numpy as np

# === Helper: compute Random Fourier Features ===
def compute_rff_features(X_np, gamma=1.0, n_components=128, random_state=42):
    """Map ``X_np`` through a freshly fitted ``RBFSampler`` (random Fourier features).

    Args:
        X_np: Input array with one row per sample.
        gamma: Passed to ``RBFSampler`` unchanged.
        n_components: Number of output features per sample.
        random_state: Seed passed to ``RBFSampler``.

    Returns:
        The transformed array, of shape N x n_components.
    """
    sampler = RBFSampler(gamma=gamma, n_components=n_components, random_state=random_state)
    return sampler.fit_transform(X_np)

# === Helper: compute weighted off-diagonal covariance norm ===
def weighted_offdiag_cov_norm(Z, W):
    """Frobenius norm of the off-diagonal part of the weighted covariance of ``Z``.

    Args:
        Z: Array with one row per sample and one column per feature.
        W: One weight per sample (reshaped internally to a column).

    Returns:
        A scalar; 0 means the weighted features are pairwise uncorrelated.
        The covariance is normalised by ``W.sum() - 1``.
    """
    weights = W.reshape(-1, 1)
    total = weights.sum()
    weighted_mean = (weights * Z).sum(axis=0) / total
    # Scaling rows by sqrt(w) makes scaled.T @ scaled the weighted scatter matrix.
    scaled = (Z - weighted_mean) * np.sqrt(weights)
    cov = (scaled.T @ scaled) / (total - 1.0)
    np.fill_diagonal(cov, 0.0)   # keep only cross-feature terms
    return np.linalg.norm(cov, ord='fro')

# === Simplified sample weight optimization ===
def optimize_sample_weights(Z, init_w=None, lr=0.5, n_iters=10, random_state=None):
    """Adjust per-sample weights to shrink the weighted off-diagonal covariance of ``Z``.

    Each iteration samples up to 200 coordinates, estimates the partial
    derivative of ``weighted_offdiag_cov_norm`` for each by a forward finite
    difference, takes one gradient step on those coordinates, and projects
    the weights back to be positive with sum N.

    Args:
        Z: Feature array with one row per sample.
        init_w: Optional starting weights (one per sample); ones if omitted.
            The input is copied, never modified.
        lr: Step size of the gradient update.
        n_iters: Number of sample / step / project iterations.
        random_state: Optional seed for the coordinate sampling. ``None``
            keeps using NumPy's global random state.

    Returns:
        The optimised weight vector (float array of length N).
    """
    N = Z.shape[0]
    eps = 1e-3   # finite-difference step
    w = np.ones(N) if init_w is None else np.array(init_w, dtype=float)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def project(w):
        # Clip to a small positive floor, then rescale so the weights sum to N.
        w = np.maximum(w, 1e-6)
        return w * (N / w.sum())

    w = project(w)
    for it in range(n_iters):
        base = weighted_offdiag_cov_norm(Z, w)
        idxs = rng.choice(N, size=min(200, N), replace=False)

        # Probe every coordinate against the SAME weight vector that `base` was
        # computed from. Updating w inside this sweep would fold the effect of
        # earlier updates into later differences, and dividing by eps would
        # blow that error up.
        grads = np.empty(len(idxs))
        for pos, i in enumerate(idxs):
            saved = w[i]
            w[i] = saved + eps
            grads[pos] = (weighted_offdiag_cov_norm(Z, w) - base) / eps
            w[i] = saved   # restore in place; avoids copying w for every probe

        w[idxs] -= lr * grads
        w = project(w)
        if it % 5 == 0:
            print(f"Iter {it} offdiag={base:.5f}")
    return w


In [None]:
# Section 14 — Train Stable-GNN

def train_stable_gnn(data, base_model, rounds=2, rff_dim=128,
                     epochs_per_round=5, lr=0.005, weight_decay=5e-4):
    """Train ``base_model`` with sample reweighting driven by feature decorrelation.

    Each round: (1) take the model's hidden node embeddings, (2) expand them
    with random Fourier features, (3) optimise per-node weights with
    ``optimize_sample_weights``, (4) run a few epochs of cross-entropy training
    in which each training node's loss is scaled by its weight.

    Args:
        data: Graph object exposing ``x``, ``edge_index``, ``y``,
            ``train_mask`` and ``num_nodes``.
        base_model: Model whose forward returns ``(logits, hidden)``; it is
            moved to the module-level ``device`` and trained in place.
        rounds: Number of reweight-then-train rounds.
        rff_dim: Number of random Fourier features.
        epochs_per_round: Weighted training epochs after each reweighting.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        The trained model (same object as ``base_model`` after ``.to(device)``).
    """
    model = base_model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # reduction='none' keeps one loss per node so it can be reweighted.
    criterion = torch.nn.CrossEntropyLoss(reduction='none')
    W = np.ones(data.num_nodes)

    # The training mask never changes, so resolve it once up front.
    train_idx_np = data.train_mask.cpu().numpy().nonzero()[0]
    train_labels = data.y[data.train_mask]

    for r in range(rounds):
        # 1) Extract node embeddings
        model.eval()
        with torch.no_grad():
            _, latent = model(data.x, data.edge_index)
        latent_np = latent.detach().cpu().numpy()

        # 2) Compute RFF features
        Z = compute_rff_features(latent_np, gamma=1.0, n_components=rff_dim)

        # 3) Optimize sample weights (warm-started from the previous round)
        print(f"Stable-GNN Round {r+1}")
        W = optimize_sample_weights(Z, init_w=W, n_iters=10)

        # Weights are fixed for the rest of the round: build the tensor once.
        # They are renormalised over the training nodes so their mean is 1.
        w_sel = W[train_idx_np]
        w_train = torch.tensor(w_sel / w_sel.sum() * len(train_idx_np),
                               dtype=torch.float, device=device)

        # 4) Train with weighted loss
        model.train()
        for epoch in range(epochs_per_round):
            optimizer.zero_grad()
            out, _ = model(data.x, data.edge_index)
            losses = criterion(out[data.train_mask], train_labels)
            weighted_loss = (losses * w_train).mean()
            weighted_loss.backward()
            optimizer.step()

    return model

# Run Stable-GNN: a freshly initialised SimpleGCN goes through two reweight-and-train rounds.
stable_gcn = train_stable_gnn(
    data,
    SimpleGCN(data.num_node_features, hidden_dim=128, num_classes=2, dropout=0.5),
    rounds=2,
    rff_dim=128,
)


In [None]:
# Section 15 — Evaluate Stable-GNN on the held-out test nodes

stable_gcn.eval()
stable_preds, stable_labels = eval_model(stable_gcn, data, data.test_mask)

# Headline metrics in a fixed order: accuracy, precision, recall, F1.
stable_scores = [
    metric(stable_labels, stable_preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]
print("Stable-GNN Test — acc {:.4f} precision {:.4f} recall {:.4f} f1 {:.4f}".format(*stable_scores))

# Class 0 is reported as 'Real' and class 1 as 'Fake'.
print(classification_report(stable_labels, stable_preds, target_names=['Real','Fake']))

print("Confusion matrix:\n", confusion_matrix(stable_labels, stable_preds))


In [None]:
# Section 16 — Interactive Prediction

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Fit a quick TF-IDF + Logistic Regression baseline that backs the interactive prompt.
# NOTE: this cell rebinds `tfidf`, `X_tfidf` and `y`, which earlier cells also define.
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=20000, sublinear_tf=True)
X_tfidf = tfidf.fit_transform(df['clean'])
y = df['label'].to_numpy()

# Stratified 80/20 split; only the training part is used to fit the classifier here.
lr_splits = train_test_split(X_tfidf, y, test_size=0.2, stratify=y, random_state=42)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = lr_splits

lr = LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced').fit(
    X_train_tfidf, y_train_tfidf
)

print("Interactive LR baseline ready.")

# === Function for user input ===
def predict_news(text):
    """Classify one piece of text with the TF-IDF + logistic-regression baseline.

    The text goes through ``preprocess_text_simple`` and the fitted ``tfidf``
    vectoriser before being scored by ``lr``.

    Returns:
        A string naming the predicted class ("Fake" for class 1, otherwise
        "Real") and the predicted probability of class 1.
    """
    features = tfidf.transform([preprocess_text_simple(text)])
    is_fake = lr.predict(features)[0] == 1
    prob = lr.predict_proba(features)[0][1]   # column 1 = probability of class 1 (fake)
    if is_fake:
        label = "Fake"
    else:
        label = "Real"
    return f"Prediction: {label} (Fake prob={prob:.2f})"

# === Ask user for input ===
# Keeps prompting until the user types 'exit' (any case, surrounding whitespace ignored),
# input ends, or the cell is interrupted. This cell blocks, so keep it last in the notebook.
while True:
    try:
        user_input = input("Enter news text or headline (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input / manual interrupt: leave the loop quietly instead of a traceback.
        break
    if user_input.lower() == 'exit':
        break
    if not user_input:
        continue   # nothing to classify; ask again
    print(predict_news(user_input))
