In [1]:
# Import required libraries
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score
from datasets import Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Read the label table for the training set (first CSV column is the index)
# and prefix every listed file name with the training directory.
train_files = pd.read_csv('data/task2/train/labels.csv', index_col=0).assign(
    file=lambda table: ['data/task2/train/' + name for name in table['file']]
)

In [3]:
# Load one randomly chosen training file as a sample.
# The path is picked once and never changes, so the file is read a single
# time; re-reading it in a loop would only repeat identical work.
file_path = train_files.file.sample(1).iloc[0]
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
except Exception as e:
    # Report the actual failure type rather than labelling every error
    # (missing file, permissions, ...) as a decode error.
    print(f"{type(e).__name__} in file: {file_path}")
    print(f"Error details: {e}")

In [4]:
# Build a one-column table of test file paths, sorted by path.
import os
test_files = pd.DataFrame({
    'file': sorted('data/task2/test/' + entry for entry in os.listdir('data/task2/test/'))
})
test_files.head()

Unnamed: 0,file
0,data/task2/test/0000.txt
1,data/task2/test/0001.txt
2,data/task2/test/0002.txt
3,data/task2/test/0003.txt
4,data/task2/test/0004.txt


In [7]:
# Load the pretrained 'roberta-base' tokenizer; tokenize_function uses it
# as a module-level global.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def load_data(file_list, labels=None):
    """Read every file in ``file_list`` as UTF-8 text and wrap the texts in a ``Dataset``.

    Args:
        file_list: Iterable of file paths to read.
        labels: Optional labels. When given and non-empty they are stored in
            a "label" column next to the "text" column.

    Returns:
        A ``Dataset`` with a "text" column, plus "label" when labels were supplied.
    """
    def read_text(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    columns = {"text": [read_text(path) for path in file_list]}
    # Only labelled (training) data gets the extra "label" column.
    if labels is not None and len(labels) != 0:
        columns["label"] = labels
    return Dataset.from_dict(columns)
        

# Training set: file texts paired with their labels.
train_data = load_data(file_list=train_files['file'], labels=train_files['label'])

# Test set: texts only, no labels are passed.
test_data = load_data(file_list=test_files['file'])

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padding/truncating to 512 tokens.

    Uses the module-level ``tokenizer``.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples['text'], **encoding_options)

# Tokenize the training set, then expose the model inputs and labels in torch format.
train_data = train_data.map(tokenize_function, batched=True)
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Same for the test set, which was built without a label column.
test_data = test_data.map(tokenize_function, batched=True)
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Pretrained 'roberta-base' sequence classifier configured for two labels.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

def compute_metrics(p):
    """Compute the F1 score for one evaluation pass of the ``Trainer``.

    Args:
        p: Evaluation result exposing ``predictions`` (per-class scores,
            one row per example) and ``label_ids`` (true labels).

    Returns:
        dict with the single key ``'f1'``.
    """
    logits = p.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Trainer hands predictions over as NumPy arrays, which torch.argmax
    # rejects, so take the arg-max with NumPy instead.
    preds = np.argmax(logits, axis=1)
    return {'f1': f1_score(p.label_ids, preds)}

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    learning_rate=5e-6
)

# Hold out part of the labelled data for evaluation. The test set has no
# labels, so the F1 used to select the best checkpoint cannot be computed on
# it. A fixed seed keeps the split reproducible across runs.
split_data = train_data.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_data['train'],
    eval_dataset=split_data['test'],
    compute_metrics=compute_metrics
)

trainer.train()

# Predict on the unlabelled test set with the best checkpoint.
predictions = trainer.predict(test_data)

# Trainer.predict returns the logits as a NumPy array, so take the arg-max
# with NumPy (torch.argmax does not accept NumPy input).
pred_labels = np.argmax(predictions.predictions, axis=1)

pd.DataFrame(pred_labels, columns=['predictions']).to_csv('submission.csv', index=False)


Map: 100%|██████████| 12100/12100 [00:06<00:00, 1907.71 examples/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1740.05 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`