In [1]:
# Import required libraries
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch_directml

In [2]:
# Step 1: Load and prepare training data
# Single root for every task-2 path so the dataset location is changed in one place.
DATA_DIR = './data/task2'
TRAIN_DIR = DATA_DIR + '/train/'
TEST_DIR = DATA_DIR + '/test/'

train_labels = pd.read_csv(TRAIN_DIR + 'labels.csv', index_col=0)
# Prefix every entry of the 'file' column with the train directory so the
# dataset can open the paths directly.
train_labels['file'] = TRAIN_DIR + train_labels['file'].astype(str)

# Step 2: Load test file paths
# Sorted so the prediction order is stable across runs and platforms.
test_files = sorted(TEST_DIR + name for name in os.listdir(TEST_DIR) if name.endswith('.txt'))
test_df = pd.DataFrame({'file': test_files})

# Step 3: Load label mapping (class index -> label name), one "<idx>\t<label>" per line
label_map = {}
with open(DATA_DIR + '/mapping.txt', 'r', encoding='utf-8') as f:
    for line in f:
        entry = line.strip()
        if not entry:
            # Blank lines (e.g. a trailing newline at end of file) carry no mapping;
            # without this guard the tuple unpacking below raises ValueError.
            continue
        idx, label = entry.split('\t')
        label_map[int(idx)] = label

# Step 4: Define dataset class
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

class TextDataset(Dataset):
    """Dataset that reads one text file per row and tokenizes it on access.

    Args:
        dataframe: Table with a 'file' column holding file paths. A 'label'
            column is also read unless ``is_test`` is True.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``; it must
            return a mapping whose values support ``.squeeze(0)``.
        is_test: When True, each item carries the source path under 'file'
            instead of a 'labels' entry.
        max_length: Length passed to the tokenizer for padding/truncation.
            Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, dataframe, tokenizer, is_test=False, max_length=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.is_test = is_test
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (one file per row)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Read, tokenize and return the sample at positional index ``idx``.

        Returns:
            dict: the tokenizer outputs with their leading dimension squeezed,
            plus either 'labels' (int) for training data or 'file' (the path
            that was read) for test data.
        """
        # Positional lookup, fetched once and reused for both 'file' and 'label'.
        row = self.data.iloc[idx]
        file_path = row['file']

        # Undecodable bytes are dropped rather than raising, so one malformed
        # file does not abort a whole epoch.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension; drop it so the
        # collate step can stack samples itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}

        if self.is_test:
            item['file'] = file_path
        else:
            item['labels'] = int(row['label'])

        return item

# Step 5: Prepare datasets
train_dataset = TextDataset(train_labels, tokenizer)
test_dataset = TextDataset(test_df, tokenizer, is_test=True)

# Step 6: Define model and training setup
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,   # training loss is reported every 10 optimizer steps
    save_strategy="no",
    eval_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Step 7: Train the model
trainer.train()

# Step 8: Predict on test data
model.eval()
# Trainer may have moved the model to an accelerator; inputs must live on the
# same device as the weights or the forward pass fails with a device mismatch.
device = next(model.parameters()).device
test_loader = DataLoader(test_dataset, batch_size=8)

all_predictions = []

for batch in tqdm(test_loader, desc="Predicting"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Softmax is monotonic, so argmax over the raw logits picks the same class.
        preds = torch.argmax(outputs.logits, dim=1)

    # .tolist() brings the class ids back to the host as plain Python ints.
    for file, class_id in zip(batch['file'], preds.tolist()):
        all_predictions.append({
            'file': os.path.basename(file),
            'prediction': class_id,
            'label_name': label_map[class_id]
        })

# Step 9: Export predictions to CSV
# Explicit columns keep the table (and the CSV header) well-formed even when the
# test directory contained no files.
pred_df = pd.DataFrame(all_predictions, columns=['file', 'prediction', 'label_name'])
pred_df.to_csv('./predictions.csv', index=False)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.6544
20,0.6027
30,0.6106
40,0.6653


KeyboardInterrupt: 