In [None]:
import os
import pandas as pd
import torch
import torch_directml
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from torch.nn import CrossEntropyLoss
from sklearn.metrics import precision_recall_fscore_support
from torch.nn import CrossEntropyLoss

In [41]:
device = torch_directml.device()
device

device(type='privateuseone', index=0)

In [42]:
# Load label mapping
label_map = {}
with open('data/task2/mapping.txt', 'r') as f:
    for line in f:
        idx, label = line.strip().split('\t')
        label_map[int(idx)] = label

# Load training labels and files
train_labels = pd.read_csv('data/task2/train/labels.csv', index_col=0)
train_labels['file'] = 'data/task2/train/' + train_labels['file'].astype(str)

# Load test files
test_files = sorted(['data/task2/test/' + f for f in os.listdir('data/task2/test/') if f.endswith('.txt')])
test_df = pd.DataFrame({'file': test_files})

# Setup tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Step 3: Load label mapping (optional, useful for later)
label_map = {}
with open('data/task2/mapping.txt', 'r') as f:
    for line in f:
        idx, label = line.strip().split('\t')
        label_map[int(idx)] = label

# Step 4: Define dataset class
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")



In [43]:
config = {
    'batch_size': 10,
    'learning_rate': 2e-5,
    'epochs': 3,
    'max_length': 160,
    'early_stop_patience': 3
}

In [44]:
# label mapping
label_map = {}
with open('data/task2/mapping.txt', 'r') as f:
    for line in f:
        idx, label = line.strip().split('\t')
        label_map[int(idx)] = label

# Load training labels and files
train_labels = pd.read_csv('data/task2/train/labels.csv', index_col=0)
train_labels['file'] = 'data/task2/train/' + train_labels['file'].astype(str)

# Load test files
test_files = sorted(['data/task2/test/' + f for f in os.listdir('data/task2/test/') if f.endswith('.txt')])
test_df = pd.DataFrame({'file': test_files})

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [45]:
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, is_test=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.is_test = is_test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]['file']
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=config['max_length'],
            return_tensors='pt'
        )

        item = {key: val.squeeze(0) for key, val in encoding.items()}

        if not self.is_test:
            item['labels'] = torch.tensor(int(self.data.iloc[idx]['label']), dtype=torch.long)
        else:
            item['file'] = file_path

        return item
    

train_dataset = TextDataset(train_labels, tokenizer)
test_dataset = TextDataset(test_df, tokenizer, is_test=True)

device = torch_directml.device()
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)  
optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])           


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
class_counts = train_labels['label'].value_counts()
weight_0 = class_counts[1] / (class_counts[0] + class_counts[1])
weight_1 = class_counts[0] / (class_counts[0] + class_counts[1])
class_weights = torch.tensor([weight_0, weight_1], dtype=torch.float).to(device)

criterion = CrossEntropyLoss(weight=class_weights)

In [47]:
count_0 = (train_labels['label'] == 0).sum()
count_1 = (train_labels['label'] == 1).sum()
weight_0 = 1.0
weight_1 = count_0 / count_1
weights = torch.tensor([weight_0, weight_1], dtype=torch.float).to(device)

criterion = CrossEntropyLoss(weight=weights)

optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])

# Early stopping variables
best_f1 = 0
patience_counter = 0


history = {
    'precision': [],
    'recall': [],
    'f1': []
}

In [49]:


for epoch in range(config['epochs']):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} Training Loss: {total_loss:.4f}")

    
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
    
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
    
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())


    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )

    print(f"Epoch {epoch+1} Eval -> Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    history['precision'].append(precision)
    history['recall'].append(recall)
    history['f1'].append(f1)

    if f1 > best_f1:
        best_f1 = f1
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= config['early_stop_patience']:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

model.eval()
predictions = []
with torch.no_grad():
    for item in tqdm(test_dataset, desc="Predicting Test Set"):
        input_ids = item['input_ids'].unsqueeze(0).to(device)
        attention_mask = item['attention_mask'].unsqueeze(0).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred_label = torch.argmax(outputs.logits, dim=1).item()
        predictions.append(pred_label)


Epoch 1: 100%|██████████| 1210/1210 [11:25<00:00,  1.77it/s]


Epoch 1 Training Loss: 780.1271
Epoch 1 Eval -> Precision: 0.0000, Recall: 0.0000, F1: 0.0000


Epoch 2: 100%|██████████| 1210/1210 [11:32<00:00,  1.75it/s]


Epoch 2 Training Loss: 840.3703
Epoch 2 Eval -> Precision: 0.0000, Recall: 0.0000, F1: 0.0000


Epoch 3:  85%|████████▌ | 1031/1210 [09:42<01:41,  1.77it/s]


KeyboardInterrupt: 

In [50]:
test_filenames = [os.path.basename(f) for f in test_df['file']]
output_df = pd.DataFrame({'file': test_filenames, 'label': predictions})
output_df.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")
metrics_df = pd.DataFrame(history)
metrics_df.to_csv('training_metrics.csv', index=False)
print("Training metrics saved to training_metrics.csv")

NameError: name 'predictions' is not defined

In [11]:
print(train_labels['label'].value_counts())


label
0    8460
1    3640
Name: count, dtype: int64


In [8]:
# Set model to evaluation mode
model.eval()
model.to(device)

precisions, recalls, f1s = [], [], []

test_loader = DataLoader(test_dataset, batch_size=8)

# Collect predictions
all_preds = []

for batch in tqdm(test_loader, desc="Predicting"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

    all_preds.extend(preds.cpu().numpy())


Predicting: 100%|██████████| 250/250 [00:09<00:00, 25.67it/s]


In [9]:
submission = pd.DataFrame({
    'Id': list(range(len(all_preds))),
    'predictions': all_preds
})

submission.to_csv('submission.csv', index=False)