In [1]:
# Cell 1: Imports
import os
import glob
import pandas as pd
import numpy as np
import re
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Cell 2: Load the Datasets
# Load all CSV files from Data/Train and Data/Test directories and combine them into single DataFrames.
train_files = glob.glob(os.path.join("Data", "Train", "*.csv"))
test_files = glob.glob(os.path.join("Data", "Test", "*.csv"))

if len(train_files) == 0:
    raise FileNotFoundError("No training CSV files found in Data/Train")
if len(test_files) == 0:
    raise FileNotFoundError("No testing CSV files found in Data/Test")

train_df = pd.concat([pd.read_csv(file) for file in train_files], ignore_index=True)
test_df = pd.concat([pd.read_csv(file) for file in test_files], ignore_index=True)

print("Training dataset shape:", train_df.shape)
print("Testing dataset shape:", test_df.shape)

In [None]:
# Cell 3: Handle Missing Values
# For both training and testing datasets, drop rows missing the 'Statement' column.
train_df = train_df.dropna(subset=['Statement'])
test_df = test_df.dropna(subset=['Statement'])

print("Processed Training dataset shape:", train_df.shape)
print("Processed Testing dataset shape:", test_df.shape)

Kaggle dataset shape: (20800, 5)
ISOT dataset shape: (44898, 5)
Combined dataset shape: (65698, 7)


In [None]:
# Cell 4: Extract Features and Labels
# Use 'Statement' as the input text and 'label' as the target.
X_train = train_df['Statement']
y_train = train_df['label']

X_test = test_df['Statement']
# If test CSVs include labels, use them; otherwise, evaluation will use predictions only.
y_test = test_df['label'] if 'label' in test_df.columns else None

In [5]:
# Cell 5: Preprocess Text for the LSTM
def tokenize(text):
    # Convert text to lowercase and extract word tokens.
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)
    return tokens

# Build vocabulary from the training statements.
all_tokens = []
for text in X_train:
    all_tokens.extend(tokenize(text))
    
freq = Counter(all_tokens)
# Reserve index 0 for padding and 1 for unknown tokens.
vocab = {word: i+2 for i, (word, count) in enumerate(freq.items())}
vocab_size = len(vocab) + 2

def text_to_sequence(text, vocab):
    tokens = tokenize(text)
    return [vocab.get(token, 1) for token in tokens]

# Convert statements into sequences of integers.
X_train_seq = [text_to_sequence(text, vocab) for text in X_train]
X_test_seq = [text_to_sequence(text, vocab) for text in X_test]

# Define the maximum sequence length.
max_len = 500

def pad_sequence(seq, max_len):
    if len(seq) < max_len:
        return seq + [0]*(max_len - len(seq))
    else:
        return seq[:max_len]

# Apply padding or truncation.
X_train_pad = [pad_sequence(seq, max_len) for seq in X_train_seq]
X_test_pad = [pad_sequence(seq, max_len) for seq in X_test_seq]

In [None]:
# Cell 6: Convert Sequences to Torch Tensors
X_train_tensor = torch.tensor(X_train_pad, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_pad, dtype=torch.long)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
if y_test is not None:
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [None]:
# Cell 7: Create a PyTorch Dataset and DataLoader
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

batch_size = 64
train_dataset = NewsDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

if y_test is not None:
    test_dataset = NewsDataset(X_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [8]:
# Cell 8: Define the LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        last_hidden = lstm_out[:, -1, :]
        return self.fc(last_hidden)

embedding_dim = 100
hidden_dim = 128
output_dim = 2  # Adjust if you have a different number of classes

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

# Set the device (using 'cuda:2' if available, otherwise CPU)
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Cell 9: Train the Model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for texts, labels in train_loader:
        texts = texts.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * texts.size(0)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    epoch_loss = running_loss / total
    epoch_acc = correct / total
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f} - Accuracy: {epoch_acc:.4f}")

In [None]:
# Cell 10: Evaluate the Model
if y_test is not None:
    model.eval()  
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for texts, labels in test_loader:
            texts = texts.to(device)
            labels = labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, average='weighted')
    rec = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    
    print("LSTM Test Accuracy:", acc)
    print("LSTM Test Precision:", prec)
    print("LSTM Test Recall:", rec)
    print("LSTM Test F1-Score:", f1)
else:
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor.to(device))
        _, predicted = torch.max(predictions, 1)
    print("Sample predictions on test data:", predicted.cpu().numpy()[:5])

Epoch 1/5 - Loss: 0.6655 - Accuracy: 0.5898
Epoch 2/5 - Loss: 0.6697 - Accuracy: 0.5737
Epoch 3/5 - Loss: 0.4905 - Accuracy: 0.7611
Epoch 4/5 - Loss: 0.1975 - Accuracy: 0.9251
Epoch 5/5 - Loss: 0.1041 - Accuracy: 0.9645
