In [1]:
import os
import pandas as pd
import torch
import torch_directml
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Select the DirectML backend device; the model and every batch tensor are
# moved onto it with `.to(device)` in the cells below.
device = torch_directml.device()
# Bare expression so the notebook echoes which device was picked.
device

device(type='privateuseone', index=0)

In [7]:
# Load label mapping: each non-empty line of mapping.txt is "<int index>\t<label>".
label_map = {}
with open('data/task2/mapping.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on tuple unpacking.
            continue
        idx, label = line.split('\t')
        label_map[int(idx)] = label

# Load training labels and turn the bare file names into paths relative to
# the notebook's working directory.
train_labels = pd.read_csv('data/task2/train/labels.csv', index_col=0)
train_labels['file'] = 'data/task2/train/' + train_labels['file'].astype(str)

# Load test files.
# NOTE(review): the order is lexicographic (sorted on the path string); the
# submission cell uses the position in this list as the row Id, so confirm
# that this ordering is what the submission format expects.
test_files = sorted(['data/task2/test/' + f for f in os.listdir('data/task2/test/') if f.endswith('.txt')])
test_df = pd.DataFrame({'file': test_files})

# Setup tokenizer (loaded once; shared by the train and test datasets).
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")



In [11]:
# Hyper-parameters read by the dataset, the data loader, the optimizer and the
# training loop.
config = dict(
    batch_size=8,           # samples per DataLoader batch
    learning_rate=5e-5,     # AdamW learning rate
    epochs=3,               # upper bound on training epochs
    max_length=256,         # tokenizer padding / truncation length
    # Consecutive epochs without an F1 improvement before training stops.
    # With patience equal to `epochs`, early stopping cannot end training
    # before the epoch limit is reached anyway.
    early_stop_patience=3,
)

In [21]:
class TextDataset(Dataset):
    """Map-style dataset that reads one text file per row and tokenizes it.

    Args:
        dataframe: Frame with a 'file' column holding the path of each text
            file and, unless ``is_test`` is true, a 'label' column holding a
            value convertible to ``int``.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')`` and returning a mapping of
            tensors with a leading batch dimension of size 1.
        is_test: When true, items carry the source path under 'file' instead
            of a 'labels' tensor.
        max_length: Padding/truncation length passed to the tokenizer. When
            left as ``None`` the module-level ``config['max_length']`` is
            looked up at item-access time, which is the original behaviour.
    """

    def __init__(self, dataframe, tokenizer, is_test=False, max_length=None):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.is_test = is_test
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (one sample per row)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Read, tokenize and return the sample at positional index ``idx``.

        Returns:
            dict: the tokenizer outputs with the batch dimension squeezed
            away, plus either 'labels' (``torch.long`` scalar) for training
            data or 'file' (the path string) for test data.
        """
        # Fetch the row once; both the path and the label come from it.
        row = self.data.iloc[idx]
        file_path = row['file']
        # errors='ignore' drops undecodable bytes rather than raising.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()

        # Resolve the length lazily so a later change to the global config is
        # still honoured when no explicit max_length was given.
        max_length = self.max_length if self.max_length is not None else config['max_length']

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch dimension of 1;
        # drop it so the DataLoader can stack samples itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}

        if not self.is_test:
            item['labels'] = torch.tensor(int(row['label']), dtype=torch.long)
        else:
            item['file'] = file_path

        return item
    

# Wrap the labelled training frame and the unlabelled test frame.
train_dataset = TextDataset(dataframe=train_labels, tokenizer=tokenizer)
test_dataset = TextDataset(dataframe=test_df, tokenizer=tokenizer, is_test=True)

# roberta-base with a freshly initialised 2-class classification head,
# placed on the DirectML device.
device = torch_directml.device()
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)

# Shuffled mini-batches for training, and AdamW over every model parameter.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['learning_rate'])


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model, score it after every epoch, and keep the weights of the
# epoch with the highest weighted F1.
precisions, recalls, f1s = [], [], []
best_f1 = 0.0
epochs_no_improve = 0
# Snapshot of the best weights so far; stays None if F1 never exceeds 0.0.
best_model_state = None


for epoch in range(config['epochs']):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients *before* the backward pass so that gradients left
        # over from an interrupted earlier run of this cell are never applied.
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Sum (not mean) of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

    # NOTE(review): the metrics below are computed on `train_loader`, i.e. on
    # the same data the model was just trained on, so they measure fit rather
    # than generalisation and early stopping is driven by training F1.
    # Consider a held-out validation split.
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

    print(f"Epoch {epoch+1} Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        epochs_no_improve = 0
        # Save best model state. `state_dict()` hands back references to the
        # live parameter tensors, which later optimizer steps keep mutating,
        # so take an independent CPU copy of every tensor.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epoch(s)")

    if epochs_no_improve >= config['early_stop_patience']:
        print(f"Early stopping triggered after {epoch+1} epochs.")
        break

# Load best model state after training / early stopping.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print("Loaded best model state based on highest F1 score.")
else:
    # No epoch ever scored F1 > 0, so there is no snapshot to restore.
    print("No epoch improved on the initial F1 of 0.0; keeping the final model weights.")


  torch._foreach_lerp_(device_exp_avgs, device_grads, 1 - beta1)
Epoch 1:  43%|████▎     | 657/1513 [06:57<09:02,  1.58it/s]

In [None]:
# Persist the labels/predictions collected during the most recent evaluation
# pass of the training cell (the last epoch that ran, which is not necessarily
# the epoch whose weights were restored as "best").
df_preds = pd.DataFrame({
    'label': all_labels,
    'prediction': all_preds
})
df_preds.to_csv('train_predictions.csv', index=False)
print("Saved predictions and labels to train_predictions.csv")

# Plot per-epoch metrics. The x axis is derived from the number of epochs that
# actually ran: early stopping can leave fewer entries than config['epochs'],
# and plotting against the configured count would then fail with a
# length-mismatch error.
epochs_run = range(1, len(f1s) + 1)
plt.plot(epochs_run, precisions, label='Precision')
plt.plot(epochs_run, recalls, label='Recall')
plt.plot(epochs_run, f1s, label='F1 Score')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('Training Metrics')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Set model to evaluation mode (disables dropout) and make sure it is on the
# inference device.
model.eval()
model.to(device)

# The per-epoch training metrics (precisions / recalls / f1s) are deliberately
# left untouched here: this cell does not use them, and resetting them would
# wipe the history needed to re-run the plotting cell.

# No shuffling, so predictions come out in the same order as `test_df`.
# The batch size follows the shared config instead of a separate literal.
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'])

# Collect predictions (one class index per test file).
all_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())


Predicting: 100%|██████████| 250/250 [02:16<00:00,  1.84it/s]

Predictions saved to 'submission.csv'





In [None]:
# Build the submission table: 'Id' is the 0-based position of each prediction
# in `all_preds`, and 'predictions' is the predicted class index.
# NOTE(review): Ids are positional, so they follow whatever order the test
# files were listed in; confirm this matches the expected submission format.
submission = pd.DataFrame(
    data={
        'Id': [position for position, _ in enumerate(all_preds)],
        'predictions': all_preds,
    }
)

# Write without the DataFrame index so the file has exactly the two columns.
submission.to_csv(path_or_buf='submission.csv', index=False)

Fixed submission.csv generated!
