## Baseline
This notebook implments a baseline model, which shows you how to handle the data and to provide a first very simple solution to the problem. You may re-use and modify any part of this notebook.

In [None]:
!pip install torchmetrics --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import csv
import torch
import pickle
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm.notebook import tqdm
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torchmetrics import AUROC, F1Score

In [None]:
drive = drive.mount('/content/drive')
data_dir = 'drive/MyDrive/kaggle_data'

Mounted at /content/drive


In [None]:
torch.manual_seed(0)

<torch._C.Generator at 0x7ae44f7125f0>

We first start by defining the dataset class which takes as input the path to the data and the mode (`train`, `val`, or `train`). This fits a count vectorizer using the training set, and uses it on the validation and test sets.

In [None]:
class BaselineDataset(Dataset):
    def __init__(self, data_dir, mode, vectorizer=None):
        super(BaselineDataset, self).__init__()
        assert mode in ['train', 'val', 'test']
        self.mode = mode

        # load the data
        self.data = pd.read_csv(os.path.join(data_dir, f'{mode}_x.csv'), index_col=0)

        # load the labels if not the test set
        if self.mode != 'test':
            self.label = pd.read_csv(os.path.join(data_dir, f'{mode}_y.csv'))

        # train the vectorizer if train set
        if self.mode == 'train':
            self.vectorizer = CountVectorizer()
            self.vectorizer.fit(self.data.values.flatten().tolist())
        # otherwise use the vectorizer given as arguments (which was trained on the train set)
        else:
            self.vectorizer = vectorizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data.iloc[idx, 0]
        x = self.vectorizer.transform([x]).toarray()
        x = torch.tensor(x).float()
        if self.mode == 'test':
            return x, idx
        else:
            y = torch.tensor([self.label.iloc[idx, -2]])
            return x, y, idx

In [None]:
train_dataset = BaselineDataset(data_dir, 'train')
val_dataset = BaselineDataset(data_dir, 'val', train_dataset.vectorizer)

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=256, shuffle=False)

We will define two models, one which will be a simple MLP, and another one which will generate random predictions to use as comparison.

In [None]:
class BaselineClassifier(nn.Module):
    def __init__(self, input_size, hidden_dim):
        super(BaselineClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.sigmoid(self.fc2(x))
        return x

In [None]:
class RandomClassifier(nn.Module):
    def __init__(self):
        super(RandomClassifier, self).__init__()

    def forward(self, x):
        x = torch.rand(len(x))
        return x

Let's check the performance of the random classifier on the validation set.

In [None]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy, with the groups being defined by ['male', 'female', 'LGBTQ',
        'christian', 'muslim', 'other_religions', 'black', 'white'] for positive and negative toxicity.
        arguments:
            prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred)
            y [pandas.DataFrame]: dataframe containing the metadata
        returns:
            wga [float]: worst group accuracy
    """
    y.loc[prediction.index, 'pred'] = prediction.pred

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            group = y.loc[y[category] == label]
            group_accuracy = (group['y'] == (group['pred'] > 0.5)).mean()
            accuracies.append(group_accuracy)
    wga = np.min(accuracies)
    return wga

In [None]:
def evaluate_model(model, dataloader, criterion):
    """
        Evaluate the model on a given dataloader.
        argument:
            model [torch.nn.Module]: model to evaluate
            dataloader [torch.utils.data.DataLoader]: dataloader on which to evaluate
            criterion [torch.nn.modules.loss]: desired loss to compute
        returns:
            dataset_loss [float]: computed loss on the dataset
            dataset_metric [float]: computed metric on the dataset
    """
    model.eval()
    losses, predictions, indices = [], [], []
    for x, y, idx in tqdm(dataloader, leave=False):
        with torch.no_grad():
            pred = model(x)
        loss = criterion(pred.squeeze(), y.squeeze().float())
        losses.extend([loss.item()] * len(y))
        predictions.extend(pred.detach().squeeze().tolist())
        indices.extend(idx.tolist())

    pred_df = pd.DataFrame({'index': indices, 'pred': predictions})
    dataset_loss = np.mean(losses)
    dataset_metric = worst_group_accuracy(pred_df, dataloader.dataset.label)
    return dataset_loss, dataset_metric

In [None]:
model = RandomClassifier()
criterion = nn.BCELoss()
metric = F1Score(task='binary')

In [None]:
random_val_loss, random_val_metric = evaluate_model(model, val_dataloader, criterion)
print(f'Random classifier validation loss {random_val_loss:.4f} WGA {random_val_metric:.4f}')

  0%|          | 0/177 [00:00<?, ?it/s]

Random classifier validation loss 1.0027 WGA 0.4720


Now let's train the MLP baseline classifier.

In [None]:
def train_model(model, optimizer, criterion, dataloader):
    """
        Train a model for one epoch.
        arguments:
            model [torch.nn.Module]: model to evaluate
            oprimizer [torch.optim]: optimizer used for training
            criterion [torch.nn.modules.loss]: desired loss to compute
            dataloader [torch.utils.data.DataLoader]: dataloader used for training
        returns:
            dataset_loss [float]: computed loss on the dataset
            dataset_metric [float]: computed metric on the dataset
    """
    model.train()
    losses, predictions, indices = [], [], []
    for x, y, idx in tqdm(dataloader, leave=False):
        optimizer.zero_grad()
        pred = model(x)
        loss = criterion(pred.squeeze(), y.squeeze().float())
        loss.backward()
        optimizer.step()

        losses.extend([loss.item()] * len(y))
        predictions.extend(pred.detach().squeeze().tolist())
        indices.extend(idx.tolist())

    pred_df = pd.DataFrame({'index': indices, 'pred': predictions})
    dataset_loss = np.mean(losses)
    dataset_metric = worst_group_accuracy(pred_df, y = dataloader.dataset.label)
    return dataset_loss, dataset_metric

In [None]:
input_size = len(train_dataset.vectorizer.get_feature_names_out())
model = BaselineClassifier(input_size, 128)
optimizer = optim.AdamW(model.parameters(), lr=0.05, weight_decay=0.1)

In [None]:
train_loss, train_metric = train_model(model, optimizer, criterion, train_dataloader)
mlp_val_loss, mlp_val_metric = evaluate_model(model, val_dataloader, criterion)
print(f'MLP classifier validation loss {mlp_val_loss:.4f} WGA {mlp_val_metric:.4f}')

  0%|          | 0/1051 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

MLP classifier validation loss 0.2060 WGA 0.7022


Once we are happy with our results, we want to make a prediction on the test set. Your submission `.csv` file should contain 2 columns:
- ID: with the id of each prediction (do not shuffle to not mix things up)
- pred: the prediction of the model (thresholded or not)

In [None]:
#model = RandomClassifier()
test_dataset = BaselineDataset(data_dir, 'test', train_dataset.vectorizer)
test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False)
model.eval()
test_predictions, indices = [], []
for x, idx in tqdm(test_dataloader, leave=False):
    with torch.no_grad():
        pred = (model(x).squeeze() > 0.5).int()
    test_predictions.extend(pred.tolist())
    indices.extend(idx.tolist())

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
pred_df = pd.DataFrame({'ID': indices, 'pred': test_predictions})
pred_df.to_csv('prediction.csv', index=False)

## NEW CODE

In [1]:
import torch
import pandas as pd
import numpy as np
import re
import nltk
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm



# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [13]:
import torch.nn as nn

In [2]:
train_x = pd.read_csv('train_x.csv', index_col=0)
train_y = pd.read_csv('train_y.csv')
val_x = pd.read_csv('val_x.csv', index_col=0)
val_y = pd.read_csv('val_y.csv')
test_x = pd.read_csv('test_x.csv', index_col=0)

In [3]:
# Preprocessing text for tokenization
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        tokens = nltk.word_tokenize(text)
        stop_words = set(nltk.corpus.stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)
    return ""

train_x['processed_comment'] = train_x['string'].apply(preprocess_text)
val_x['processed_comment'] = val_x['string'].apply(preprocess_text)

In [4]:
# Function to compute the worst group accuracy
def worst_group_accuracy(prediction, y):
    """
    Compute the worst group accuracy, with the groups being defined by ['male', 'female', 'LGBTQ',
    'christian', 'muslim', 'other_religions', 'black', 'white'] for positive and negative toxicity.
    arguments:
        prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred)
        y [pandas.DataFrame]: dataframe containing the metadata
    returns:
        wga [float]: worst group accuracy
    """
    # Ensure we assign the predictions to the correct DataFrame and make sure we're comparing scalars
    y.loc[prediction.index, 'pred'] = prediction['y_predictions']

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            # Select the group based on subpopulation
            group = y.loc[y[category] == label]

            # Compute accuracy for this group
            group_accuracy = (group['y'] == (group['pred'] > 0.5)).mean()  # Compare predicted and true values
            accuracies.append(group_accuracy)

    # The worst group accuracy is the minimum accuracy across all groups
    wga = np.min(accuracies)
    return wga


In [5]:
# Tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
    return inputs['input_ids'], inputs['attention_mask']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
train_inputs = tokenize_data(train_x['processed_comment'].tolist())
val_inputs = tokenize_data(val_x['processed_comment'].tolist())

In [7]:
class ToxicityDataset(Dataset):
    def __init__(self, input_ids, attention_mask, toxicity_labels=None, subpopulations=None, y_labels=None, ids=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.toxicity_labels = toxicity_labels  # Shape: (batch_size, num_toxicity_labels)
        self.subpopulations = subpopulations  # Shape: (batch_size, num_subpopulations)
        self.y_labels = y_labels  # Shape: (batch_size, 1) - final target `y`
        self.ids = ids  # The unique ids (for test dataset)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        data = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
        }

        # Add labels only if they are provided
        if self.toxicity_labels is not None:
            data['toxicity_labels'] = self.toxicity_labels[idx]

        if self.subpopulations is not None:
            data['subpopulation_labels'] = self.subpopulations[idx]

        if self.y_labels is not None:
            data['y_labels'] = self.y_labels[idx]

        # Add the id for tracking purposes
        if self.ids is not None:
            data['ids'] = self.ids[idx]

        return data


In [8]:
# Prepare datasets and DataLoaders
train_dataset = ToxicityDataset(
    train_inputs[0],  # Input IDs from tokenized text
    train_inputs[1],  # Attention Mask from tokenized text
    torch.tensor(train_y[['severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']].values),  # Toxicity labels (6)
    train_y[['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white', 'identity_any']].values,  # Subpopulation labels (9)
    torch.tensor(train_y['y'].values)  # Final target variable `y`
)

val_dataset = ToxicityDataset(
    val_inputs[0],  # Input IDs from tokenized text
    val_inputs[1],  # Attention Mask from tokenized text
    torch.tensor(val_y[['severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']].values),  # Toxicity labels (6)
    val_y[['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white', 'identity_any']].values,  # Subpopulation labels (9)
    torch.tensor(val_y['y'].values)  # Final target variable `y`
)


In [9]:
# DataLoader setup
batch_size = 32  # Adjust batch size based on available GPU memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [14]:
class LSTM_BERT_Model(nn.Module):
    def __init__(self, lstm_units=64, dropout=0.3, num_toxicity_labels=6, num_subpopulation_labels=9):
        super(LSTM_BERT_Model, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.gru = nn.GRU(input_size=768, hidden_size=lstm_units, num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)

        # Output layers for toxicity labels, subpopulation labels, and final target `y`
        self.fc_toxicity = nn.Linear(lstm_units * 2, num_toxicity_labels)  # 6 labels for toxicity
        self.fc_subpopulation = nn.Linear(lstm_units * 2, num_subpopulation_labels)  # 9 labels for subpopulation
        self.fc_y = nn.Linear(lstm_units * 2, 1)  # Final target variable `y` (binary)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = bert_output.last_hidden_state
        gru_out, _ = self.gru(hidden_states)
        gru_out = self.dropout(gru_out[:, -1, :])  # Get the output from the last GRU time step

        # Predict toxicity labels (6 outputs)
        toxicity_preds = self.fc_toxicity(gru_out)

        # Predict subpopulation labels (9 outputs)
        subpopulation_preds = self.fc_subpopulation(gru_out)

        # Predict the final target variable `y`
        y_preds = self.fc_y(gru_out)

        return toxicity_preds, subpopulation_preds, y_preds


In [11]:
class MultiOutputLoss(nn.Module):
    def __init__(self):
        super(MultiOutputLoss, self).__init__()

    def forward(self, toxicity_preds, subpopulation_preds, y_preds, toxicity_labels, subpopulation_labels, y_labels):
        # Binary cross-entropy loss for toxicity labels (6 labels)
        toxicity_loss = F.binary_cross_entropy_with_logits(toxicity_preds, toxicity_labels.float(), reduction='none')

        # Binary cross-entropy loss for subpopulation labels (9 labels)
        subpopulation_loss = F.binary_cross_entropy_with_logits(subpopulation_preds, subpopulation_labels.float(), reduction='none')

        # Binary cross-entropy loss for final target `y`
        y_loss = F.binary_cross_entropy_with_logits(y_preds, y_labels.float(), reduction='none')

        # Combine all the losses (mean across all samples)
        total_loss = toxicity_loss.mean() + subpopulation_loss.mean() + y_loss.mean()

        return total_loss


In [15]:
# Initialize model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize model, optimizer, and criterion
model = LSTM_BERT_Model(num_toxicity_labels=6, num_subpopulation_labels = 9).to(device)  # 6 labels for toxicity labels
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = MultiOutputLoss()  # Using multi-output loss function

In [16]:
# Training loop for one epoch
num_epochs = 1
accumulation_steps = 4
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch_idx, batch in enumerate(tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        toxicity_labels = batch['toxicity_labels'].to(device)  # Accessing the correct key
        subpopulation_labels = batch['subpopulation_labels'].to(device)  # Accessing the correct key
        y_labels = batch['y_labels'].to(device).unsqueeze(1)  # Accessing the correct key


        # Forward pass
        toxicity_preds, subpopulation_preds, y_preds = model(input_ids, attention_mask)

        # Compute loss with multi-output loss function
        loss = criterion(toxicity_preds, subpopulation_preds, y_preds, toxicity_labels, subpopulation_labels, y_labels)
        loss.backward()
        if(batch_idx+1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_dataloader)}")


                                                                

Epoch 1/1, Loss: 0.8128514763758253




In [17]:
# Validation loop
model.eval()
val_predictions_toxicity, val_predictions_subpop, val_predictions_y = [], [], []
val_indices = []

with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Validating", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        toxicity_labels = batch['toxicity_labels'].to(device)  # Shape: (batch_size, num_toxicity_labels)
        subpopulation_labels = batch['subpopulation_labels'].to(device)  # Shape: (batch_size, num_subpopulation_labels)
        y_labels = batch['y_labels'].to(device).unsqueeze(1)  # Shape: (batch_size, 1)

        # Forward pass
        toxicity_preds, subpopulation_preds, y_preds = model(input_ids, attention_mask)

        # Convert logits to binary predictions (threshold at 0.5)
        toxicity_preds_binary = (torch.sigmoid(toxicity_preds) > 0.5).int().cpu().tolist()
        subpopulation_preds_binary = (torch.sigmoid(subpopulation_preds) > 0.5).int().cpu().tolist()
        y_preds_binary = (torch.sigmoid(y_preds) > 0.5).int().cpu().flatten().tolist()  # Flatten the list to make it a list of scalars

        # Store predictions for all outputs
        val_predictions_toxicity.extend(toxicity_preds_binary)
        val_predictions_subpop.extend(subpopulation_preds_binary)
        val_predictions_y.extend(y_preds_binary)  # Ensure predictions are scalar (not lists)

        # Store the subpopulation identity indices for evaluation
        val_indices.extend(batch['subpopulation_labels'].cpu().tolist())  # Subpopulation indices

# Combine all predictions into a DataFrame for easier evaluation
val_predictions_df = pd.DataFrame({
    'subpopulation_indices': val_indices,
    'y_predictions': val_predictions_y
})

# Example: Compute worst-group accuracy or a similar metric for evaluation
# val_predictions_df will contain all the necessary predictions for the toxicity labels, subpopulations, and final target
wga = worst_group_accuracy(val_predictions_df, val_y)
print(f"Epoch {epoch + 1}/{num_epochs}, Worst Group Accuracy: {wga}")


                                                               

Epoch 1/1, Worst Group Accuracy: 0.7602905569007264




In [18]:
# Create the test dataset and DataLoader
test_x['processed_comment'] = test_x['string'].apply(preprocess_text)
test_inputs = tokenize_data(test_x['processed_comment'].tolist())

In [19]:
# Create the test dataset with only input_ids and attention_mask (no labels for testing)
test_dataset = ToxicityDataset(
    input_ids=test_inputs[0],  # Tokenized input_ids
    attention_mask=test_inputs[1],  # Tokenized attention mask
    toxicity_labels=None,  # No toxicity labels for testing
    subpopulations=None,  # No subpopulations for testing
    y_labels=None,  # No final target `y` labels for testing
    ids=test_x.index.values  # Pass the ids from test_x
)

# Create the DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize lists to store predictions and indices
test_predictions_y = []  # Only predicting 'y' in the test set
indices = []

# Evaluate the model on the test set
for batch in tqdm(test_dataloader, leave=False):
    # Move the inputs to the correct device (GPU or CPU)
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Forward pass with no gradients
    with torch.no_grad():
        toxicity_preds, subpopulation_preds, y_preds = model(input_ids, attention_mask)

        # Convert logits to binary predictions for final target `y` (threshold at 0.5)
        y_preds_binary = (torch.sigmoid(y_preds) > 0.5).int().cpu().tolist()

    # Flatten predictions and add to the list
    test_predictions_y.extend([item for sublist in y_preds_binary for item in sublist])

    # Store the indices (e.g., from `test_x.index.values`)
    indices.extend(batch['ids'].tolist())


# Combine all predictions into a DataFrame for easier evaluation or further processing
test_predictions_df = pd.DataFrame({
    'ID': indices,
    'pred': test_predictions_y  # Only include predictions for 'y'
})

# Optionally, you can save the predictions to a CSV file for later use:
test_predictions_df.to_csv('test_predictions.csv', index=False)


