In [1]:
import os
from pathlib import Path

os.chdir("../")  
print("Current working directory:", Path().resolve())

Current working directory: /home/manmath/Desktop/MyProjects/CNN-Text-Classifiers


In [2]:
import torch 
import torch.nn as nn 
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from datasets import load_dataset

from transformers import AutoTokenizer, DataCollatorWithPadding

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device: ", DEVICE)

Device:  cpu


  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# DIAGNOSTIC (run this to see exactly what columns your CSV has and a few rows)
from pathlib import Path
import pandas as pd

p = Path("/home/manmath/Desktop/MyProjects/CNN-Text-Classifiers/Data/FakeNewsNet.csv")
print("Exists?", p.exists(), "  Absolute path:", p.resolve())
df = pd.read_csv(p)
print("\nCOLUMNS:\n", df.columns.tolist())
print("\nHEAD (first 5 rows):")
display(df.head(5))
print("\nDtypes:")
print(df.dtypes)
print("\nAny nulls in columns?")
print(df.isna().sum())

Exists? True   Absolute path: /home/manmath/Desktop/MyProjects/CNN-Text-Classifiers/Data/FakeNewsNet.csv

COLUMNS:
 ['title', 'news_url', 'source_domain', 'tweet_num', 'real']

HEAD (first 5 rows):


Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1



Dtypes:
title            object
news_url         object
source_domain    object
tweet_num         int64
real              int64
dtype: object

Any nulls in columns?
title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

CSV_PATH = Path("/home/manmath/Desktop/MyProjects/CNN-Text-Classifiers/Data/FakeNewsNet.csv")
if not CSV_PATH.exists():
    raise FileNotFoundError(f"CSV not found: {CSV_PATH.resolve()}")

df = pd.read_csv(CSV_PATH)
TEXT_COL = "title"       
LABEL_COL = "real"       

df = df[[TEXT_COL, LABEL_COL]].dropna()

df = df.rename(columns={TEXT_COL: "text", LABEL_COL: "label"})
df["text"] = df["text"].astype(str)
df["label"] = df["label"].astype(int)

print("Value counts:", df["label"].value_counts().to_dict())

SAMPLE_PER_CLASS = 3000

dfs = []
for cls in sorted(df["label"].unique()):
    subset = df[df["label"] == cls]
    if len(subset) > SAMPLE_PER_CLASS:
        subset = subset.sample(SAMPLE_PER_CLASS, random_state=42)
    dfs.append(subset)

balanced_df = pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)

train_df, test_df = train_test_split(
    balanced_df,
    test_size=0.2,
    stratify=balanced_df["label"],
    random_state=42
)

print("Final counts → Train:", len(train_df), " Test:", len(test_df))
print("Sample row:", train_df.iloc[0].to_dict())


Value counts: {1: 17441, 0: 5755}
Final counts → Train: 4800  Test: 1200
Sample row: {'text': 'Emmy Snubs: Oprah Winfrey Snub & 5 Others Who Deserved a Nomination – Variety', 'label': 0}


In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader

TOKENIZER_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

MAX_LEN = 200

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=MAX_LEN):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        txt = str(self.texts[index])

        encoded = self.tokenizer(
            txt,
            truncation=True,
            max_length= self.max_length,
            padding=False,
            return_attention_mask=True,
            return_tensors=None
        )

        label = int(self.labels[index])
        encoded["labels"] = label

        return encoded
    
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", return_tensors="pt")

BATCH_SIZE = 32

train_ds = NewsDataset(train_df["text"].tolist(), train_df["label"].tolist(), tokenizer, max_length=MAX_LEN)
test_ds = NewsDataset(test_df["text"].tolist(), test_df["label"].tolist(), tokenizer, max_length=MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator)

batch = next(iter(train_loader))
print("Batch Keys: ", list(batch.keys()))
print("input_ids shape: ", batch["input_ids"].shape)
print("attention_mask shape: ", batch["attention_mask"].shape)
print("labels shape: ", batch["labels"].shape)
print("Sample Labels (first 8): ", batch["labels"][:8].tolist())


Batch Keys:  ['input_ids', 'attention_mask', 'labels']
input_ids shape:  torch.Size([32, 29])
attention_mask shape:  torch.Size([32, 29])
labels shape:  torch.Size([32])
Sample Labels (first 8):  [1, 0, 0, 0, 0, 0, 1, 0]


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(
            self,
            vocab_size:int,
            embed_dim:int = 200,
            num_filters:int = 100,
            kernel_sizes:tuple = (3, 4, 5),
            dropout:float = 0.5,
            pad_idx:int = 0,
            num_classes:int = 1
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(k, embed_dim))
            for k in kernel_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes)*num_filters, num_classes)

        nn.init.xavier_uniform_(self.embedding.weight)
        for conv in self.convs:
            nn.init.kaiming_uniform_(conv.weight, nonlinearity='relu')
        nn.init.xavier_uniform_(self.fc.weight)
        if self.embedding.padding_idx is not None:
            with torch.no_grad():
                self.embedding.weight[self.embedding.padding_idx].zero_()

    def forward(self, input_ids, attention_mask=None):
        emb = self.embedding(input_ids)

        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1).to(emb.dtype)
            emb = emb*mask
        
        emb = emb.unsqueeze(1)

        pooled_outputs = []
        for conv in self.convs:
            x = conv(emb)
            x = F.relu(x)
            x = x.squeeze(3)

            x = F.max_pool1d(x, kernel_size=x.size(2)).squeeze(2)
            pooled_outputs.append(x)
        
        cat = torch.cat(pooled_outputs, dim=1)

        dropped = self.dropout(cat)
        logits = self.fc(dropped)

        if logits.size(1) == 1:
            return logits.squeeze(1)
        return logits


In [7]:
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

try:
    vocab_size = len(tokenizer.get_vocab())
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    print("Using tokenizer vocab:", vocab_size, "pad_id:", pad_id)
except Exception:
    vocab_size = 5000   
    pad_id = 0
    print("Tokenizer not found in scope — using fallback vocab_size=5000, pad_id=0")

model = TextCNN(vocab_size=vocab_size, embed_dim=200, num_filters=100, kernel_sizes=(3,4,5), pad_idx=pad_id).to(DEVICE)

B = 8
L = 120
batch_input = torch.randint(low=0, high=vocab_size, size=(B, L), dtype=torch.long, device=DEVICE)
batch_mask  = (batch_input != pad_id).long().to(DEVICE)

out = model(batch_input, batch_mask)  
print("output shape:", out.shape)
print("sample logits (first 5):", out[:5].detach().cpu().numpy())

Using device: cpu
Using tokenizer vocab: 30522 pad_id: 0
output shape: torch.Size([8])
sample logits (first 5): [0.06497481 0.07232034 0.13559815 0.14432389 0.03916262]


In [8]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Runtime CUDA:", torch.version.cuda)
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

Torch version: 2.5.1+cu121
CUDA available: False
Runtime CUDA: 12.1
Device name: None


In [9]:
import time
import math
import torch
import torch.nn as nn

EPOCHS = 8
LR = 2e-4
BATCH_SIZE = 32
PATIENCE = 2
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PRINT_EVERY = 50

def train_epoch(model, loader, optimizer, loss_fn, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for step, batch in enumerate(loader, 1):
        input_ids = batch["input_ids"].to(device)
        attn = batch.get("attention_mask", None)
        if attn is not None:
            attn = attn.to(device)
        labels = batch["labels"].float().to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attn)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * labels.size(0)
        preds = (torch.sigmoid(logits) >= 0.5).long()
        correct += (preds == labels.long()).sum().item()
        total += labels.size(0)

        if step % PRINT_EVERY == 0:
            avg_loss = running_loss / total if total else 0.0
            acc = correct / total if total else 0.0
            print(f"  step {step:04d}  avg_loss={avg_loss:.4f}   acc={acc:.4f}")

    epoch_loss = running_loss / total if total else 0.0
    epoch_acc = correct /total if total else 0.0
    return epoch_loss, epoch_acc

def eval_epoch(model, loader, loss_fn, device):
    model.eval()
    running_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attn = batch.get("attention_mask", None)
            if attn is not None:
                attn = attn.to(device)
            labels = batch["labels"].float().to(device)

            logits = model(input_ids, attention_mask=attn)
            loss = loss_fn(logits, labels)

            running_loss += loss.item() * labels.size(0)
            preds = (torch.sigmoid(logits) >= 0.5).long()
            correct += (preds == labels.long()).sum().item()
            total += labels.size(0)
    
    val_loss = running_loss / total if total else 0.0
    val_acc = correct / total if total else 0.0
    return val_loss, val_acc

model = TextCNN(vocab_size=vocab_size, embed_dim=200, num_filters=100, kernel_sizes=(3, 4, 5), pad_idx=pad_id, dropout=0.5)
model = model.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1, verbose=True)

best_val = float("inf")
no_improve = 0
best_path = "Models/textcnn_best.pt"
import os
os.makedirs("Models", exist_ok=True)

print(f"Train on device: {DEVICE} --- Train Batches {len(train_loader)}, Val Batches {len(test_loader)}")
t0 = time.time()

for epoch in range(1, EPOCHS + 1):
    e0 = time.time()
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, loss_fn, DEVICE)
    val_loss, val_acc = eval_epoch(model, test_loader, loss_fn, DEVICE)
    scheduler.step(val_loss)

    print(f"Epoch {epoch:02d}  Train Loss = {train_loss:.4f}  Train acc = {train_acc:.4f}  Val Loss = {val_loss:.4f}  Val Acc = {val_acc:.4f}  Time = {(time.time()-e0):.1f}s")

    if val_loss + 1e-9 < best_val:
        best_val = val_loss
        no_improve = 0
        torch.save(model.state_dict(), best_path)
        print(f"  -> Saved Best Model: {best_path}")
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print("Early Stopping No Improvement.")

print("Total Training Tine: ", time.time()-t0)

if os.path.exists(best_path):
    model.load_state_dict(torch.load(best_path, map_location=DEVICE))
    print("Loaded best model for final eval.")
val_loss, val_acc = eval_epoch(model, test_loader, loss_fn, DEVICE)
print(f"Final eval: val_loss = {val_loss:.4f} val_acc = {val_acc:.4f}")




Train on device: cpu --- Train Batches 150, Val Batches 38
  step 0050  avg_loss=0.6907   acc=0.5337
  step 0100  avg_loss=0.6871   acc=0.5837
  step 0150  avg_loss=0.6812   acc=0.6185
Epoch 01  Train Loss = 0.6812  Train acc = 0.6185  Val Loss = 0.6600  Val Acc = 0.7242  Time = 8.7s
  -> Saved Best Model: Models/textcnn_best.pt
  step 0050  avg_loss=0.6313   acc=0.7900
  step 0100  avg_loss=0.6116   acc=0.7809
  step 0150  avg_loss=0.5902   acc=0.7825
Epoch 02  Train Loss = 0.5902  Train acc = 0.7825  Val Loss = 0.5489  Val Acc = 0.7483  Time = 8.3s
  -> Saved Best Model: Models/textcnn_best.pt
  step 0050  avg_loss=0.4655   acc=0.8219
  step 0100  avg_loss=0.4562   acc=0.8206
  step 0150  avg_loss=0.4487   acc=0.8196
Epoch 03  Train Loss = 0.4487  Train acc = 0.8196  Val Loss = 0.5004  Val Acc = 0.7675  Time = 8.4s
  -> Saved Best Model: Models/textcnn_best.pt
  step 0050  avg_loss=0.3270   acc=0.8956
  step 0100  avg_loss=0.3155   acc=0.8962
  step 0150  avg_loss=0.3190   acc=0.8892

  model.load_state_dict(torch.load(best_path, map_location=DEVICE))


Final eval: val_loss = 0.5003 val_acc = 0.7675


In [10]:
# run in the same env / notebook where you ran training
import pandas as pd
p = "/home/manmath/Desktop/MyProjects/CNN-Text-Classifiers/Data/FakeNewsNet.csv"   # update if your path differs
df = pd.read_csv(p)
print("Columns:", df.columns.tolist())
print(df[['title','real']].head(10))
print("Value counts for 'real':\n", df['real'].value_counts())
# If you mapped columns to 'text' and 'label' earlier, check sample mapping:
print("\nSample rows used for train/test (first 6):")
print(df[['title','real']].sample(6))

Columns: ['title', 'news_url', 'source_domain', 'tweet_num', 'real']
                                               title  real
0  Kandi Burruss Explodes Over Rape Accusation on...     1
1  People's Choice Awards 2018: The best red carp...     1
2  Sophia Bush Sends Sweet Birthday Message to 'O...     1
3  Colombian singer Maluma sparks rumours of inap...     1
4  Gossip Girl 10 Years Later: How Upper East Sid...     1
5  Gwen Stefani Got Dumped by Blake Shelton Over ...     0
6  Broward County Sheriff Fired For Lying About P...     0
7  Amber Rose Shuts Down French Montana Dating Ru...     0
8  Mindy Kaling makes first post-baby appearance ...     1
9  Katharine McPhee Butchers Tony Nominations: “I...     1
Value counts for 'real':
 real
1    17441
0     5755
Name: count, dtype: int64

Sample rows used for train/test (first 6):
                                                   title  real
4291   Will Stabler return to 'Law and Order: SVU' fo...     1
14922  Travis Barker Survives ‘Re

In [11]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from typing import List, Union
from pathlib import Path

MODEL_PATH = Path("Models/textcnn_best.pt")   
TOKENIZER_NAME = "distilbert-base-uncased"   
MAX_LEN = 200
EMBED_DIM = 200       
NUM_FILTERS = 100      
KERNEL_SIZES = (3,4,5)  
PAD_IDX_FALLBACK = 0

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)

class TextCNNConv2D(nn.Module):
    def __init__(self, vocab_size: int, embed_dim: int, num_filters: int, kernel_sizes=(3,4,5), pad_idx=0, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(k, embed_dim))
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), 1)

        nn.init.xavier_uniform_(self.embedding.weight)
        for c in self.convs:
            nn.init.kaiming_uniform_(c.weight, nonlinearity='relu')
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, input_ids, attention_mask=None):
        emb = self.embedding(input_ids)          
        emb_2d = emb.unsqueeze(1)               

        conv_outs = []
        for conv in self.convs:
            c = conv(emb_2d)                
            c = torch.relu(c)
            c = c.squeeze(3)                   
            c = torch.max(c, dim=2).values    
            conv_outs.append(c)

        cat = torch.cat(conv_outs, dim=1)     
        x = self.dropout(cat)
        logits = self.fc(x).squeeze(-1)
        return logits
    
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
pad_id = tokenizer.pad_token_id or 0

model = TextCNNConv2D(vocab_size=vocab_size, embed_dim=EMBED_DIM,
                      num_filters=NUM_FILTERS, kernel_sizes=KERNEL_SIZES, pad_idx=pad_id).to(DEVICE)
state = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state)   
print("Loaded checkpoint OK")

def predict(text: str):
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(
            text,
            truncation=True,
            max_length=MAX_LEN,
            padding="max_length",
            return_tensors="pt"
        )

        input_ids = encoded["input_ids"].to(DEVICE)
        attention_mask = encoded["attention_mask"].to(DEVICE)

        logits = model(input_ids, attention_mask)
        prob = torch.sigmoid(logits)

        label = 1 if prob.item() >= 0.5 else 0

        return {
            "text": text,
            "probability": round(prob.item(), 4),
            "prediction": "FAKE" if label == 0 else "REAL",
            "label": label
        }

print(predict("Breaking news!!! Elon Musk buys the moon!!"))


Using device: cpu
Loaded checkpoint OK
{'text': 'Breaking news!!! Elon Musk buys the moon!!', 'probability': 0.2508, 'prediction': 'FAKE', 'label': 0}


  state = torch.load(MODEL_PATH, map_location=DEVICE)


In [12]:
examples = [
 "Breaking news!!! Elon Musk buys the moon!!",
 "The president announced new policies today in a press conference.",
 "Scientists discover cure for rare disease after long research.",
 "You won't believe these celebrity secrets!! Click to read!"
]
for ex in examples:
    print(ex, "->", predict(ex))

Breaking news!!! Elon Musk buys the moon!! -> {'text': 'Breaking news!!! Elon Musk buys the moon!!', 'probability': 0.2508, 'prediction': 'FAKE', 'label': 0}
The president announced new policies today in a press conference. -> {'text': 'The president announced new policies today in a press conference.', 'probability': 0.6969, 'prediction': 'REAL', 'label': 1}
Scientists discover cure for rare disease after long research. -> {'text': 'Scientists discover cure for rare disease after long research.', 'probability': 0.6018, 'prediction': 'REAL', 'label': 1}
You won't believe these celebrity secrets!! Click to read! -> {'text': "You won't believe these celebrity secrets!! Click to read!", 'probability': 0.5369, 'prediction': 'REAL', 'label': 1}
