In [30]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel

import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transform
import torch.nn as nn
from tqdm import tqdm
from torch.optim import Adam
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, ConfusionMatrixDisplay

### Converting csv to a PyTorch Dataset Object

In [27]:
def process_data(df):
    """
    Processes a dataset file to encode categorical variables and convert data into PyTorch tensors.
    """
    print("Processing data")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    
    # Drop useless data
    dropped_columns = ["id", "date"]
    df = df.drop(dropped_columns, axis=1)
    
    textual_columns = ["statement", "justification", "speaker_description", "subject", "state_info", "speaker", "context"]
    
    
    # Tokenize text columns using BERT tokenizer
    for col in df.columns:
        if df[col].dtype == 'object':  # Tokenize only text columns
            df[col] = df[col].fillna("None").astype(str)
            df[col] = df[col].apply(lambda x: tokenizer.encode_plus(
                x,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=256,
                return_tensors="pt",
                return_attention_mask=True)
            )

    return df

In [28]:
class SentimentDataset(Dataset):
    def __init__(self, path):
        self.sentiment = pd.read_csv(path)
        self.sentiment = process_data(self.sentiment)
        
    def __len__(self):
        return len(self.sentiment)
    
    def __getitem__(self, idx):
        data = self.sentiment.iloc[idx]
        label = data["label"]
        data = data.drop("label")
        
        input_ids_list = []
        attention_masks_list = []
        
        for col in data.index:
            if isinstance(data[col], dict):
                encoded = data[col]
                input_ids_list.append(encoded['input_ids'].squeeze(0))
                attention_masks_list.append(encoded['attention_mask'].squeeze(0))
                
        if not input_ids_list:
            raise ValueError(f"No tokenized text found for index {idx}. Check your data preprocessing.")

        
        # Combine BERT input tensors
        input_ids = torch.cat(input_ids_list, dim=0)
        attention_masks = torch.cat(attention_masks_list, dim=0)
        
        # Handle non-textual features (if any remain after processing)
        non_text_features = []
        for col in data.index:
            if not isinstance(data[col], dict):
                value = data[col]
                if isinstance(value, (np.ndarray, list)):
                    non_text_features.append(np.array(value))
                else:
                    non_text_features.append(np.array([value], dtype=np.float32))
        
        # Convert non-text feature vectors to tensor
        if non_text_features:
            feature_tensor = torch.tensor(np.concatenate(non_text_features), dtype=torch.float32)
        else:
            feature_tensor = torch.tensor([], dtype=torch.float32)  # Empty tensor if no non-text features
        
        return input_ids, attention_masks, feature_tensor, torch.tensor(label, dtype=torch.long)

In [29]:
t = transform.Compose([transform.ToTensor()])
vocabs = create_vocabs()
train_dataset = SentimentDataset(path="data/train.csv")
dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
iterator = iter(dataloader)
_, _, additional_features, label = next(iterator)

print(additional_features.shape, label.shape)

Processing data


ValueError: No tokenized text found for index 1742. Check your data preprocessing.

### Training Neural Network

In [None]:
class FakeNewsClassifier(nn.Module):
    def __init__(self, additional_dim=6, num_classes=6):
        super(FakeNewsClassifier, self).__init__()
        
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.input_dim = 768 + additional_dim
        self.fc1 = nn.Linear(self.input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        
    def forward(self, input_ids, attention_mask, additional_features):
        # Get output from DistilRoBERTa
        distilroberta_output = self.distilroberta(input_ids=input_ids, attention_mask=attention_mask)
        # Take the [CLS] token embedding for classification
        pooled_output = distilroberta_output.last_hidden_state[:, 0]  # [CLS] token
        
        # Combine with additional features (like one-hot encodings, etc.)
        combined_input = torch.cat((pooled_output, additional_features), dim=1)
        
        # Feed through the classifier layers
        x = self.relu(self.bn1(self.fc1(combined_input)))
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.fc4(x)
        
        return x

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

vocabs = create_vocabs()
train_dataset = SentimentDataset(path="data/train.csv", vocabs=vocabs)
test_dataset = SentimentDataset(path="data/test.csv", vocabs=vocabs) 
val_dataset = SentimentDataset(path="data/valid.csv", vocabs=vocabs)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=True)

In [None]:
model = FakeNewsClassifierWithBERT().to(device)
optimizer = Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

epochs = 10
training_losses = []
val_losses = []

for epoch in range(1, epochs + 1):
    train_loss = 0
    val_loss = 0
    train_correct_predictions = 0
    train_total_samples = 0
    val_correct_predictions = 0
    val_total_samples = 0
    
    model.train()
    for text_inputs, non_text_inputs, labels in tqdm(train_loader, desc="Training", unit="its"):
        text_inputs = text_inputs.to(device)
        non_text_inputs = non_text_inputs.to(device).float()
        labels = labels.to(device).long()
        
        optimizer.zero_grad()
        outputs = model(text_inputs, non_text_inputs)
        t_loss = criterion(outputs, labels)
        t_loss.backward()
        optimizer.step()
        
        train_loss += t_loss.item()
        _, predicted = torch.max(outputs, 1)
        train_correct_predictions += (predicted == labels).sum().item()
        train_total_samples += labels.size(0)
        
    model.eval()
    for text_inputs, non_text_inputs, labels in tqdm(val_loader, desc="Validating", unit="its"):
        text_inputs = text_inputs.to(device)
        non_text_inputs = non_text_inputs.to(device).float()
        labels = labels.to(device).long()
        
        with torch.no_grad():
            outputs = model(text_inputs, non_text_inputs)
            v_loss = criterion(outputs, labels)
        
        val_loss += v_loss.item()
        _, predicted = torch.max(outputs, 1)
        val_correct_predictions += (predicted == labels).sum().item()
        val_total_samples += labels.size(0)
        
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    training_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracy = train_correct_predictions / train_total_samples * 100
    val_accuracy = val_correct_predictions / val_total_samples * 100
    print(f"Epoch {epoch}/{epochs}: Train Loss: {train_loss: .4f}, Val Loss: {val_loss}, Train Accuracy: {train_accuracy: .2f}, Val Accuracy: {val_accuracy: .2f}")
    
torch.save(model.state_dict(), "trained_model_bert_with_word2vec.pth")

# Plot training/validation loss
plt.plot(range(1, epochs+1), training_losses, label="Train")
plt.plot(range(1, epochs+1), val_losses, label="Validation")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Evolution of loss during training")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
test_model = FakeNewsClassifier().to(device)
test_model.load_state_dict(torch.load("trained_model.pth"))
test_model.eval()

all_predictions = []
all_labels = []

with torch.no_grad():
    for features, labels in tqdm(test_loader, desc="Testing", unit="batch"):
        features = features.to(device).float()
        labels = labels.to(device).long()
        
        outputs = model(features)
        _, predicted = torch.max(outputs, 1)

        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate Metrics
accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions, average="weighted")
recall = recall_score(all_labels, all_predictions, average="weighted")
f1 = f1_score(all_labels, all_predictions, average="weighted")

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(all_labels, all_predictions))

corresponding_labels = ["Pants on Fire", "False", "Barely True", "Half True", "Mostly True", "True"]
ConfusionMatrixDisplay.from_predictions(all_labels, all_predictions, display_labels=corresponding_labels)
plt.title("Confusion Matrix")
plt.save("results/confusion_matrix.png")
plt.show()