In [None]:
import copy
import gc
import os
import time
from collections import defaultdict

import mlflow
import mlflow.pytorch
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from dotenv import load_dotenv
from sklearn.model_selection import KFold
from torch.optim import lr_scheduler, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

#### Setting some constants

In [2]:
# Load variables from a .env file (if present) into the environment first,
# so the lookups below can see them.
load_dotenv()

# Each path is None when its environment variable is not set.
DATA_PATH, RAW_PATH, PROCESSED_PATH, MODEL_PATH = (
    os.getenv(variable)
    for variable in ('DATA_PATH', 'RAW_PATH', 'PROCESSED_PATH', 'MODEL_PATH')
)

In [3]:
# Cross-validation and batching
FOLDS = 5            # n_splits for KFold
BATCH_SIZE = 16
NUM_EPOCHS = 3

# Model and tokenisation
MODEL_NAME = 'FacebookAI/roberta-base'
NUM_CLASSES = 1      # output width of the linear head in JigsawModel
MAX_LEN = 256        # max_length handed to the tokenizer

# Optimisation
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 1e-2
MARGIN = 0.5         # margin used by the ranking loss in `criterion`
T_MAX = 500          # T_max for CosineAnnealingLR
MIN_LR = 1e-6        # eta_min for CosineAnnealingLR

# Directory that receives this experiment's per-fold checkpoints
CURRENT_MODEL_PATH = os.path.join(MODEL_PATH, 'roberta-base-freeze')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Create the checkpoint directories up front; exist_ok makes re-running this cell harmless.
for directory in (MODEL_PATH, CURRENT_MODEL_PATH):
    os.makedirs(directory, exist_ok=True)

In [5]:
# One seed shared by PyTorch and NumPy's global generator (also reused for the KFold shuffle).
SEED = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(SEED)

#### Loading the data

In [6]:
# Read the processed comment pairs; the bare name on the last line renders the frame as the cell output.
train_csv_path = os.path.join(PROCESSED_PATH, 'train_data.csv')
df_train = pd.read_csv(train_csv_path)
df_train

Unnamed: 0,less_toxic,more_toxic
0,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"
...,...,...
15405,Straw poll being conducted on Catholic Church ...,"""\n\n Possible Troll \n\nWhy is the word """"MON..."
15406,Outrageous!!!!! \n\nThis block is outrageous ...,Bloody bots get more annoying every day.... He...
15407,Blink 182\n\nYou ahve 3 Blink 182 CD's???? WOW...,Homosexuality\nPlease attempt to refrain from ...
15408,I'm sorry. I'm not an admin. I will give you t...,get out my large penis


#### Since our data does not have target values, we will use KFold for cross-validation
Also, we won't preprocess the data, since we are using BERT-like models.

In [7]:
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# KFold.split yields *positional* row indices. Collect the fold ids in a
# positional array instead of writing through label-based `.loc`, which is only
# correct while the frame has a default RangeIndex. Building an int array up
# front also avoids creating a float column that has to be cast back to int.
fold_ids = np.full(len(df_train), -1, dtype=int)
for fold, (_, valid_idx) in enumerate(kf.split(df_train)):
    fold_ids[valid_idx] = fold

# Every row must be a validation row of exactly one fold.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"

df_train["kfold"] = fold_ids
df_train.head()

Unnamed: 0,less_toxic,more_toxic,kfold
0,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,0
1,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,4
2,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",3
3,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,0
4,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",4


#### Classes for the dataset and the model

In [8]:
class JigsawDataset(Dataset):
    """Dataset of (more toxic, less toxic) comment pairs.

    Each item is a dict of ``torch.long`` tensors: token ids and attention
    mask for both comments, plus a constant ``target`` of 1.

    Args:
        df: frame with ``more_toxic`` and ``less_toxic`` columns.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_length: length every comment is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Keep the two text columns as arrays for positional lookup in __getitem__.
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize one comment and return ``(input_ids, attention_mask)`` tensors."""
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        ids = torch.tensor(encoded['input_ids'], dtype=torch.long)
        mask = torch.tensor(encoded['attention_mask'], dtype=torch.long)
        return ids, mask

    def __getitem__(self, index):
        more_ids, more_mask = self._encode(self.more_toxic[index])
        less_ids, less_mask = self._encode(self.less_toxic[index])

        return {
            'more_toxic_ids': more_ids,
            'more_toxic_mask': more_mask,
            'less_toxic_ids': less_ids,
            'less_toxic_mask': less_mask,
            # Always 1: the first comment of the pair is the more toxic one.
            'target': torch.tensor(1, dtype=torch.long),
        }

In [9]:
class JigsawModel(nn.Module):
    """Pretrained transformer backbone with dropout and a linear scoring head.

    Args:
        model_name: checkpoint name or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        # return_dict=False makes the backbone return a plain tuple (see forward).
        self.model = AutoModel.from_pretrained(model_name, return_dict=False)
        # Size the head from the backbone's config instead of hard-coding 768,
        # so checkpoints with a different width work; 768 stays the fallback.
        hidden_size = getattr(self.model.config, 'hidden_size', 768)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(hidden_size, NUM_CLASSES)

    def forward(self, ids, mask):
        """Return the head output for a batch of token ids and attention masks."""
        # Only the second element of the backbone's output tuple is used.
        # NOTE(review): for BERT/RoBERTa-style models this is presumably the
        # pooler output; the recorded log reports RoBERTa's pooler weights as
        # newly initialised, so they are trained from scratch here.
        _, pooled = self.model(input_ids=ids, attention_mask=mask)
        return self.fc(self.drop(pooled))

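#### We will use MarginRankingLoss as our loss function, since it allows us to train the model on our pairs of sentences
With the target fixed at $y = 1$, the loss for one pair is $\max(0,\; -(s_{\text{more}} - s_{\text{less}}) + \text{margin})$, so the model is pushed to score the more toxic comment at least `margin` higher than the less toxic one.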

In [None]:
def criterion(outputs1, outputs2, targets):
    """Margin ranking loss between two score tensors, using the global MARGIN.

    Args:
        outputs1: scores that should rank higher where ``targets`` is 1.
        outputs2: scores compared against ``outputs1``.
        targets: tensor of ranking targets, forwarded unchanged to the loss.

    Returns:
        The loss averaged over all elements (the default reduction).
    """
    return nn.functional.margin_ranking_loss(
        outputs1, outputs2, targets, margin=MARGIN
    )

#### Defining functions for training and evaluating the model

In [11]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Run one training pass over ``dataloader`` and return the mean loss.

    Args:
        model: module called as ``model(ids, mask)``.
        optimizer: optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, or None to skip.
        dataloader: yields dicts with the keys produced by ``JigsawDataset``.
        device: device the batch tensors are moved to.
        epoch: zero-based epoch index, used only for the progress-bar label.

    Returns:
        Sample-weighted mean of the batch losses; 0.0 if the dataloader
        yields no batches.
    """
    model.train()

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns 0.0 instead of raising.
    epoch_loss = 0.0

    loop = tqdm(dataloader)
    # The label does not change within an epoch, so set it once.
    loop.set_description(f'Epoch {epoch+1}/{NUM_EPOCHS}')

    for data in loop:
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype=torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype=torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype=torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = more_toxic_ids.size(0)

        # Clear gradients before the backward pass so nothing left over from
        # an interrupted earlier run can leak into this step.
        optimizer.zero_grad()

        # The same model scores both comments of each pair.
        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)

        # unsqueeze(1) adds a trailing dimension to the targets before the loss.
        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets.unsqueeze(1))
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        # Weight each batch by its size so a short final batch is averaged correctly.
        running_loss += loss.item() * batch_size
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        loop.set_postfix(Train_Loss=epoch_loss,
                         LR=optimizer.param_groups[0]['lr'])

    return epoch_loss

def eval_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without gradients and return the mean loss.

    Args:
        model: module called as ``model(ids, mask)``.
        dataloader: yields dicts with the keys produced by ``JigsawDataset``.
        device: device the batch tensors are moved to.
        epoch: zero-based epoch index, used only for the progress-bar label.

    Returns:
        Sample-weighted mean of the batch losses; 0.0 if the dataloader
        yields no batches.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns 0.0 instead of raising.
    epoch_loss = 0.0

    loop = tqdm(dataloader)
    # The label does not change within an epoch, so set it once.
    loop.set_description(f'Epoch {epoch+1}/{NUM_EPOCHS}')

    with torch.no_grad():
        for data in loop:
            more_toxic_ids = data['more_toxic_ids'].to(device, dtype=torch.long)
            more_toxic_mask = data['more_toxic_mask'].to(device, dtype=torch.long)
            less_toxic_ids = data['less_toxic_ids'].to(device, dtype=torch.long)
            less_toxic_mask = data['less_toxic_mask'].to(device, dtype=torch.long)
            targets = data['target'].to(device, dtype=torch.long)

            batch_size = more_toxic_ids.size(0)

            # The same model scores both comments of each pair.
            more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
            less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)

            # unsqueeze(1) adds a trailing dimension to the targets before the loss.
            loss = criterion(more_toxic_outputs, less_toxic_outputs, targets.unsqueeze(1))

            # Weight each batch by its size so a short final batch is averaged correctly.
            running_loss += loss.item() * batch_size
            dataset_size += batch_size
            epoch_loss = running_loss / dataset_size

            loop.set_postfix(Val_Loss=epoch_loss)

    return epoch_loss

#### Functions to get scheduler and loaders

In [12]:
def fetch_scheduler(optimizer, scheduler=None):
    """Build the learning-rate scheduler selected by name.

    Args:
        optimizer: optimizer the scheduler will drive.
        scheduler: ``'CosineAnnealingLR'`` (uses the global T_MAX and MIN_LR),
            ``'LinearLR'`` (library defaults), or None for no scheduler.
            Any non-string object is returned unchanged.

    Returns:
        The scheduler instance, or None when ``scheduler`` is None.

    Raises:
        ValueError: if ``scheduler`` is a string that is not a known name.
            Previously the string itself was returned and only failed later,
            when the training loop called ``.step()`` on it.
    """
    if scheduler is None:
        return None

    if scheduler == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_MAX,
                                              eta_min=MIN_LR)
    if scheduler == 'LinearLR':
        return lr_scheduler.LinearLR(optimizer)

    if isinstance(scheduler, str):
        raise ValueError(
            f"Unknown scheduler {scheduler!r}; expected 'CosineAnnealingLR', 'LinearLR' or None"
        )

    # Non-string objects (e.g. an already-built scheduler) pass through as before.
    return scheduler

In [13]:
def prepare_loaders(fold, df, tokenizer, max_length, batch_size):
    """Split ``df`` on its ``kfold`` column and wrap both parts in DataLoaders.

    Rows whose ``kfold`` equals ``fold`` form the validation set; all other
    rows form the training set. Only the training loader shuffles.

    Returns:
        ``(train_loader, valid_loader)``
    """

    def _loader_for(row_mask, shuffle):
        # Reset the index so the dataset sees a clean 0..n-1 index for its rows.
        subset = df[row_mask].reset_index(drop=True)
        dataset = JigsawDataset(subset, tokenizer=tokenizer, max_length=max_length)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _loader_for(df.kfold != fold, shuffle=True)
    valid_loader = _loader_for(df.kfold == fold, shuffle=False)

    return train_loader, valid_loader

#### Function for main training loop

In [None]:
def run_training(model, scheduler_name, device, num_epochs, fold, train_loader, valid_loader, freeze=False):
    """Train ``model`` for ``num_epochs`` and keep the weights with the best validation loss.

    Args:
        model: a ``JigsawModel``; its backbone is reached through ``model.model``.
        scheduler_name: name forwarded to ``fetch_scheduler``.
        device: device used for training and evaluation.
        num_epochs: number of epochs to run.
        fold: fold index, used only in the checkpoint file name.
        train_loader: training DataLoader.
        valid_loader: validation DataLoader.
        freeze: if True, epoch 0 trains with the backbone frozen at lr=1e-3 and
            later epochs train everything at LEARNING_RATE. If False, the whole
            model trains at LEARNING_RATE from the start.

    Returns:
        ``(model, history)``: the model with its best weights loaded, and a dict
        with per-epoch ``'Train Loss'`` and ``'Valid Loss'`` lists.

    Side effects:
        Logs metrics to MLflow and writes ``model_fold_{fold}.pth`` into
        CURRENT_MODEL_PATH whenever the validation loss does not get worse.
    """
    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    # Previously these were only created inside the `freeze` branches, so
    # freeze=False failed on the first epoch with an unbound `optimizer`.
    optimizer = None
    scheduler = None

    for epoch in range(num_epochs):
        if freeze:
            head_only = epoch == 0

            # Epoch 0 freezes the backbone; every later epoch has it unfrozen.
            for param in model.model.parameters():
                param.requires_grad = not head_only

            # NOTE(review): the optimizer and scheduler are rebuilt on every
            # epoch, not only at the freeze -> unfreeze switch, so AdamW state
            # and the LR schedule restart each epoch. Kept as-is to preserve
            # the recorded results; confirm whether this is intended.
            lr = 1e-3 if head_only else LEARNING_RATE
            optimizer = AdamW(model.parameters(), lr=lr, weight_decay=WEIGHT_DECAY)
            scheduler = fetch_scheduler(optimizer, scheduler_name)
        elif optimizer is None:
            # No freezing: one optimizer/scheduler pair for the whole run.
            optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
            scheduler = fetch_scheduler(optimizer, scheduler_name)

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = eval_one_epoch(model, valid_loader, device=device,
                                        epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Log the metrics against the number of optimizer steps taken so far
        steps = (epoch+1) * len(train_loader)
        mlflow.log_metric("Train Loss", train_epoch_loss, step=steps)
        mlflow.log_metric("Valid Loss", val_epoch_loss, step=steps)

        # Save the best model (ties count as an improvement)
        if val_epoch_loss <= best_epoch_loss:
            print(f'Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})')
            best_epoch_loss = val_epoch_loss
            mlflow.log_metric("Best Loss", best_epoch_loss, step=steps)
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = os.path.join(CURRENT_MODEL_PATH, f'model_fold_{fold}.pth')
            torch.save(model.state_dict(), PATH)
            print('Model Saved')

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [1]:
# Function to log the config
def mlflow_log_config():
    """Log the notebook's hyperparameter constants to MLflow, one ``log_param`` call each."""
    config = {
        'FOLDS': FOLDS,
        'BATCH_SIZE': BATCH_SIZE,
        'NUM_CLASSES': NUM_CLASSES,
        'NUM_EPOCHS': NUM_EPOCHS,
        'LEARNING_RATE': LEARNING_RATE,
        'WEIGHT_DECAY': WEIGHT_DECAY,
        'MAX_LEN': MAX_LEN,
        'MARGIN': MARGIN,
        'T_MAX': T_MAX,
        'MIN_LR': MIN_LR,
        'MODEL_NAME': MODEL_NAME,
    }
    # Dicts keep insertion order, so params are logged in the order listed above.
    for key, value in config.items():
        mlflow.log_param(key, value)

#### Training the model on folds

In [16]:
mlflow.set_experiment('Jigsaw-'+time.strftime("%Y-%m-%d-%H-%M-%S")+'-'+MODEL_NAME)
# Presumably here to close a run left active by an earlier, interrupted execution.
mlflow.end_run()

# The tokenizer is the same for every fold, so load it once instead of per fold.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
scheduler_name = 'CosineAnnealingLR'

for fold in range(0, FOLDS):
    print(f'Training fold: {fold}')

    # The context manager ends the run even if training raises, so a failed
    # fold cannot leave an active run behind for the next one.
    with mlflow.start_run(run_name=f"Fold-{fold}"):
        mlflow_log_config()

        # Create Dataloaders
        train_loader, valid_loader = prepare_loaders(fold=fold, df=df_train,
                                                     tokenizer=tokenizer,
                                                     max_length=MAX_LEN, batch_size=BATCH_SIZE)
        # A fresh model per fold, so no weights carry over between folds.
        model = JigsawModel(MODEL_NAME)
        model.to(DEVICE)
        print("Model Created")

        model, history = run_training(model, scheduler_name, DEVICE, NUM_EPOCHS, fold,
                                      train_loader, valid_loader, freeze=True)

    # Drop this fold's objects and release cached GPU memory before the next fold.
    del model, history, train_loader, valid_loader
    gc.collect()
    torch.cuda.empty_cache()

    print()

2025/03/20 16:05:48 INFO mlflow.tracking.fluent: Experiment with name 'Jigsaw-2025-03-20-16-05-48-FacebookAI/roberta-base' does not exist. Creating a new experiment.


Training fold: 0


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Created


Epoch 1/3: 100%|██████████| 771/771 [01:25<00:00,  9.01it/s, LR=0.000566, Train_Loss=0.496]
Epoch 1/3: 100%|██████████| 193/193 [00:20<00:00,  9.25it/s, Val_Loss=0.494]


Validation Loss Improved (inf ---> 0.4935037189880646)
Model Saved



Epoch 2/3: 100%|██████████| 771/771 [04:24<00:00,  2.92it/s, LR=1.17e-5, Train_Loss=0.443]
Epoch 2/3: 100%|██████████| 193/193 [00:21<00:00,  9.11it/s, Val_Loss=0.44] 


Validation Loss Improved (0.4935037189880646 ---> 0.44035555363320283)
Model Saved



Epoch 3/3: 100%|██████████| 771/771 [04:23<00:00,  2.93it/s, LR=1.17e-5, Train_Loss=0.433]
Epoch 3/3: 100%|██████████| 193/193 [00:21<00:00,  9.02it/s, Val_Loss=0.441]



Training complete in 0h 11m 17s
Best Loss: 0.4404

Training fold: 1


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Created


Epoch 1/3: 100%|██████████| 771/771 [01:25<00:00,  8.97it/s, LR=0.000566, Train_Loss=0.494]
Epoch 1/3: 100%|██████████| 193/193 [00:20<00:00,  9.45it/s, Val_Loss=0.492]


Validation Loss Improved (inf ---> 0.49194475020161704)
Model Saved



Epoch 2/3: 100%|██████████| 771/771 [04:24<00:00,  2.92it/s, LR=1.17e-5, Train_Loss=0.441]
Epoch 2/3: 100%|██████████| 193/193 [00:20<00:00,  9.24it/s, Val_Loss=0.438]


Validation Loss Improved (0.49194475020161704 ---> 0.43764511276807666)
Model Saved



Epoch 3/3: 100%|██████████| 771/771 [04:25<00:00,  2.90it/s, LR=1.17e-5, Train_Loss=0.434]
Epoch 3/3: 100%|██████████| 193/193 [00:21<00:00,  9.03it/s, Val_Loss=0.436]


Validation Loss Improved (0.43764511276807666 ---> 0.4364924729643976)
Model Saved

Training complete in 0h 11m 20s
Best Loss: 0.4365

Training fold: 2


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Created


Epoch 1/3: 100%|██████████| 771/771 [01:29<00:00,  8.63it/s, LR=0.000566, Train_Loss=0.496]
Epoch 1/3: 100%|██████████| 193/193 [00:20<00:00,  9.59it/s, Val_Loss=0.494]


Validation Loss Improved (inf ---> 0.4940065833595494)
Model Saved



Epoch 2/3: 100%|██████████| 771/771 [04:18<00:00,  2.99it/s, LR=1.17e-5, Train_Loss=0.44] 
Epoch 2/3: 100%|██████████| 193/193 [00:21<00:00,  8.89it/s, Val_Loss=0.443]


Validation Loss Improved (0.4940065833595494 ---> 0.4429621871688937)
Model Saved



Epoch 3/3: 100%|██████████| 771/771 [04:23<00:00,  2.92it/s, LR=1.17e-5, Train_Loss=0.43] 
Epoch 3/3: 100%|██████████| 193/193 [00:21<00:00,  9.09it/s, Val_Loss=0.451]



Training complete in 0h 11m 15s
Best Loss: 0.4430

Training fold: 3


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Created


Epoch 1/3: 100%|██████████| 771/771 [01:29<00:00,  8.65it/s, LR=0.000566, Train_Loss=0.495]
Epoch 1/3: 100%|██████████| 193/193 [00:21<00:00,  9.08it/s, Val_Loss=0.492]


Validation Loss Improved (inf ---> 0.492076423321352)
Model Saved



Epoch 2/3: 100%|██████████| 771/771 [04:33<00:00,  2.82it/s, LR=1.17e-5, Train_Loss=0.447]
Epoch 2/3: 100%|██████████| 193/193 [00:21<00:00,  9.00it/s, Val_Loss=0.421]


Validation Loss Improved (0.492076423321352 ---> 0.4214677869848119)
Model Saved



Epoch 3/3: 100%|██████████| 771/771 [04:24<00:00,  2.92it/s, LR=1.17e-5, Train_Loss=0.435]
Epoch 3/3: 100%|██████████| 193/193 [00:21<00:00,  8.92it/s, Val_Loss=0.424]



Training complete in 0h 11m 32s
Best Loss: 0.4215

Training fold: 4


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Created


Epoch 1/3: 100%|██████████| 771/771 [01:26<00:00,  8.92it/s, LR=0.000566, Train_Loss=0.495]
Epoch 1/3: 100%|██████████| 193/193 [00:19<00:00,  9.81it/s, Val_Loss=0.492]


Validation Loss Improved (inf ---> 0.49169483049824053)
Model Saved



Epoch 2/3: 100%|██████████| 771/771 [04:16<00:00,  3.01it/s, LR=1.17e-5, Train_Loss=0.444]
Epoch 2/3: 100%|██████████| 193/193 [00:20<00:00,  9.42it/s, Val_Loss=0.421]


Validation Loss Improved (0.49169483049824053 ---> 0.42094164626463754)
Model Saved



Epoch 3/3: 100%|██████████| 771/771 [04:14<00:00,  3.03it/s, LR=1.17e-5, Train_Loss=0.437]
Epoch 3/3: 100%|██████████| 193/193 [00:20<00:00,  9.41it/s, Val_Loss=0.425]



Training complete in 0h 10m 59s
Best Loss: 0.4209



#### Plots from mlflow

![image.png](attachment:image.png)

![image.png](attachment:image.png)

![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)

#### Submission results

![image-2.png](attachment:image-2.png)

As we can see, we made 5 submissions.  
All of them except the 4th one were made with bert-base-uncased with some tweaks. 

The best bert-base-uncased model was the 2nd one, with a private score of 0.77952 and a public score of 0.74594. It used a batch size of 24 and 2 epochs. The 1st epoch was trained with a learning rate of 1e-3 and the BERT base model frozen. The 2nd epoch was trained with a learning rate of 2e-5 and the BERT base model unfrozen.  

The best model overall was the 4th one - roberta-base, with a private score of 0.76989 and a public score of 0.78628. It used a batch size of 16 and 3 epochs. The 1st epoch was trained with a learning rate of 1e-3 and the RoBERTa base model frozen. Epochs 2-3 were trained with a learning rate of 2e-5 and the RoBERTa base model unfrozen.

The difference between the public and private scores is quite large, but this is expected, since the public set is only about 5% of the whole dataset.

#### Conclusion
We chose the Jigsaw Rate Severity of Toxic Comments competition.  
We used BERT-like models to solve the problem (bert-base-uncased and roberta-base).
We used MarginRankingLoss as our loss function, since it allows us to train the model on our pairs of sentences.  
We trained models on 5 folds and made 5 submissions.
The best model was the 4th one - roberta-base with freezing strategy. It achieved a private score of 0.76989 and a public score of 0.78628.

Possible improvements:
- Use more complex models/pretrained models on similar tasks
- Experiment with different hyperparameters
- Use more data for training
- Use some preprocessing techniques