In [1]:
# Cell 1: Imports
import os
import glob
import pandas as pd
import numpy as np
import re
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import csv

In [2]:
# Cell 2 (Modified): Load the Datasets and Fix Test Column Order
import csv

train_files = glob.glob(os.path.join("Data", "Train", "*.csv"))
test_files = glob.glob(os.path.join("Data", "Test", "*.csv"))

if len(train_files) == 0:
    raise FileNotFoundError("No training CSV files found in Data/Train")
if len(test_files) == 0:
    raise FileNotFoundError("No testing CSV files found in Data/Test")

# Read training files (using default CSV settings)
train_dfs = []
for file in train_files:
    try:
        df = pd.read_csv(file, engine='python')
        train_dfs.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")
train_df = pd.concat(train_dfs, ignore_index=True)

# Read testing files
test_dfs = []
for file in test_files:
    try:
        df = pd.read_csv(file, engine='python')
        test_dfs.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")
test_df = pd.concat(test_dfs, ignore_index=True)

# Fix test_df column order if necessary.
# If the first column is "Label", then reorder to have "Statement" first.
if test_df.columns[0].strip().lower() == "label":
    test_df = test_df[["Statement", "Label"]]

print("Training dataset shape:", train_df.shape)
print("Testing dataset shape:", test_df.shape)

Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_000.csv: unexpected end of data
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_001.csv: unexpected end of data
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_002.csv: unexpected end of data
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_003.csv: ',' expected after '"'
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_005.csv: unexpected end of data
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_006.csv: ',' expected after '"'
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_007.csv: Expected 2 fields in line 330, saw 3
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_010.csv: unexpected end of data
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_012.csv: unexpected end of data
Error reading Data/Train/fulltrain_Guardian_Nyt_binary_shuffled_013.csv: ',' expected after '"'
Error reading Data/Train/f

In [3]:
# Cell 3 (Modified): Handle Missing Values Precisely
# Drop rows where the 'Statement' column is empty after stripping whitespace.
train_df = train_df[train_df["Statement"].astype(str).str.strip() != ""]
test_df = test_df[test_df["Statement"].astype(str).str.strip() != ""]

print("Processed Training dataset shape:", train_df.shape)
print("Processed Testing dataset shape:", test_df.shape)

Processed Training dataset shape: (28633, 2)
Processed Testing dataset shape: (7011, 2)


In [4]:
# Cell 4 (Modified): Extract Features and Labels
X_train = train_df["Statement"]
y_train = train_df["Label"]

X_test = test_df["Statement"]
y_test = test_df["Label"]

# (Optional) Print an example to verify the content:
print("Example training statement:", X_train.iloc[0])

Example training statement: Sainsburyâ€™s sales have dropped further after it was forced to cut prices amid heavy competition from discount rivals. Sales at stores open for more than a year fell by 1.1% in the three months to 24 September, compared with the 0.8% decline reported for the previous three months. Shares in Sainsburyâ€™s slid 3.5% to 242.2p in morning trading, as City analysts said the supermarket had slightly underperformed against expectations and appeared to be under more pressure in comparison with Morrisons and Tesco. Mike Coupe, Sainsburyâ€™s chief executive, said the fall in sales was driven by price cuts. Although a 1% dip in prices resulted in a drop in the total value of Sainsburyâ€™s sales in the quarter, the reductions drew more people to Sainsburyâ€™s tills and meant the group sold a higher volume of goods. Coupe said there had been no discernible impact on customer behaviour after the EU referendum, and he thought there would not be any impact on shopping unle

In [5]:
# Cell 4.1: Clean and Shuffle the Data
# Keep only binary labels (0 or 1)
train_df = train_df[train_df['Label'].isin([0, 1])]
test_df = test_df[test_df['Label'].isin([0, 1])]

# Convert the Label columns to integers to ensure numeric type.
train_df['Label'] = train_df['Label'].astype(int)
test_df['Label'] = test_df['Label'].astype(int)

# Shuffle the data
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Re-extract features and labels after cleaning
X_train = train_df['Statement']
y_train = train_df['Label']
X_test = test_df['Statement']
y_test = test_df['Label']

In [6]:
# Cell 5: Preprocess Text for the LSTM
def tokenize(text):
    # Convert text to lowercase and extract word tokens.
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)
    return tokens

# Build vocabulary from the training statements.
all_tokens = []
for text in X_train:
    all_tokens.extend(tokenize(text))
    
# Limit vocabulary to the 10,000 most common words.
max_vocab = 10000
freq = Counter(all_tokens)
vocab = {word: i+2 for i, (word, count) in enumerate(freq.most_common(max_vocab))}
vocab_size = len(vocab) + 2  # Add 2 for reserved tokens: 0 for padding, 1 for unknown.

def text_to_sequence(text, vocab):
    tokens = tokenize(text)
    return [vocab.get(token, 1) for token in tokens]

# Convert texts to sequences of token indices.
X_train_seq = [text_to_sequence(text, vocab) for text in X_train]
X_test_seq = [text_to_sequence(text, vocab) for text in X_test]

# Set fixed maximum sequence length.
max_len = 300

def pad_sequence(seq, max_len):
    if len(seq) < max_len:
        return seq + [0] * (max_len - len(seq))
    else:
        return seq[:max_len]

# Apply padding/truncation.
X_train_pad = [pad_sequence(seq, max_len) for seq in X_train_seq]
X_test_pad = [pad_sequence(seq, max_len) for seq in X_test_seq]

In [7]:
# Cell 6: Convert Sequences to Torch Tensors
X_train_tensor = torch.tensor(X_train_pad, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_pad, dtype=torch.long)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [8]:
# Cell 7: Create a PyTorch Dataset and DataLoader
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

batch_size = 64
train_dataset = NewsDataset(X_train_tensor, y_train_tensor)
test_dataset = NewsDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [9]:
# Cell 8: Define the LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        last_hidden = lstm_out[:, -1, :]
        return self.fc(last_hidden)

embedding_dim = 100
hidden_dim = 128
output_dim = 2  # Adjust if you have a different number of classes

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

# Set the device (using 'cuda:2' if available, otherwise CPU)
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model.to(device)

LSTMClassifier(
  (embedding): Embedding(10002, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [10]:
# Cell 9: Train the Model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 8

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for texts, labels in train_loader:
        texts = texts.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * texts.size(0)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    epoch_loss = running_loss / total
    epoch_acc = correct / total
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f} - Accuracy: {epoch_acc:.4f}")

Epoch 1/9 - Loss: 0.4892 - Accuracy: 0.7623
Epoch 2/9 - Loss: 0.1951 - Accuracy: 0.9225
Epoch 3/9 - Loss: 0.1050 - Accuracy: 0.9621
Epoch 4/9 - Loss: 0.0613 - Accuracy: 0.9784
Epoch 5/9 - Loss: 0.0410 - Accuracy: 0.9857
Epoch 6/9 - Loss: 0.0250 - Accuracy: 0.9920
Epoch 7/9 - Loss: 0.0190 - Accuracy: 0.9941
Epoch 8/9 - Loss: 0.0116 - Accuracy: 0.9966
Epoch 9/9 - Loss: 0.0126 - Accuracy: 0.9965


In [11]:
# Cell 10: Evaluate the Model
if y_test is not None:
    model.eval()  
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for texts, labels in test_loader:
            texts = texts.to(device)
            labels = labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, average='weighted')
    rec = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    
    print("LSTM Test Accuracy:", acc)
    print("LSTM Test Precision:", prec)
    print("LSTM Test Recall:", rec)
    print("LSTM Test F1-Score:", f1)
else:
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor.to(device))
        _, predicted = torch.max(predictions, 1)
    print("Sample predictions on test data:", predicted.cpu().numpy()[:5])

LSTM Test Accuracy: 0.9596348595064899
LSTM Test Precision: 0.9596772338446422
LSTM Test Recall: 0.9596348595064899
LSTM Test F1-Score: 0.9593333390442632
