In [1]:
# Import required libraries
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score
from datasets import Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Read the label table for the training set and turn the bare file names in
# its `file` column into paths relative to the notebook's working directory.
train_dir = 'data/task2/train/'
train_files = pd.read_csv(train_dir + 'labels.csv', index_col=0)
train_files['file'] = train_files['file'].map(lambda name: train_dir + name)

In [40]:
# Sanity-check that the training files can be read as UTF-8.
#
# Each file is read exactly once (the previous version sampled a single path
# and then re-read that same file 10000 times). The scan stops at the first
# file that cannot be read, leaving `file_path` pointing at the offending file;
# `content` holds the text of the last file that was read successfully.
for file_path in train_files['file']:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

    except UnicodeDecodeError as e:
        # The file exists but is not valid UTF-8.
        print(f"UnicodeDecodeError in file: {file_path}")
        print(f"Error details: {e}")
        break

    except OSError as e:
        # Missing/unreadable file: report it as such instead of mislabelling
        # it as a decoding problem.
        print(f"Could not read file: {file_path}")
        print(f"Error details: {e}")
        break

In [None]:
# Build the table of test files: every directory entry under the test folder,
# prefixed with the folder path and sorted by the resulting path string.
import os  # also imported in the first cell; kept so this cell runs on its own
test_dir = 'data/task2/test/'
test_files = pd.DataFrame(
    {'file': sorted(test_dir + entry for entry in os.listdir(test_dir))}
)
test_files.head()

In [None]:
# Tokenizer for the `roberta-base` checkpoint; used by `tokenize_function`.
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='roberta-base',
)

def load_data(file_list, labels=None):
    """Read text files into a `datasets.Dataset`.

    Args:
        file_list: Iterable of file paths; each file is read in full as UTF-8.
        labels: Optional iterable of labels, one per file and in the same
            order as `file_list`. When omitted (e.g. for the unlabeled test
            set) the resulting dataset has no ``label`` column.

    Returns:
        A `Dataset` with a ``text`` column and, if `labels` was given, a
        ``label`` column.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    for path in file_list:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(path, 'r', encoding='utf-8') as f:
            texts.append(f.read())

    columns = {"text": texts}
    # Only add the label column when labels exist; passing `None` as a column
    # to `Dataset.from_dict` is not a valid (empty) column.
    if labels is not None:
        columns["label"] = list(labels)
    return Dataset.from_dict(columns)

# Labeled training set: file paths together with their labels.
train_data = load_data(
    file_list=train_files['file'],
    labels=train_files['label'],
)

# Test set: only file paths, `labels` is left at its default.
test_data = load_data(file_list=test_files['file'])

def tokenize_function(examples):
    """Tokenize the ``text`` field of `examples` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: Mapping with a ``text`` entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples['text'], **encoding_options)

# Tokenize both datasets batch-wise.
train_data = train_data.map(function=tokenize_function, batched=True)
test_data = test_data.map(function=tokenize_function, batched=True)

# Expose only the model inputs (plus the label for training) as torch tensors.
model_input_columns = ['input_ids', 'attention_mask']
train_data.set_format(type='torch', columns=model_input_columns + ['label'])
test_data.set_format(type='torch', columns=model_input_columns)

# RoBERTa with a two-class sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)

def compute_metrics(p):
    """Compute the F1 score for a `Trainer` evaluation pass.

    Args:
        p: Evaluation result exposing ``predictions`` (per-class scores, one
            row per example) and ``label_ids`` (the reference labels).

    Returns:
        ``{'f1': <score>}`` computed by `f1_score` with its default settings.
    """
    # The Trainer hands predictions over as NumPy arrays, so take the argmax
    # with NumPy; `torch.argmax` rejects ndarray input.
    preds = np.argmax(p.predictions, axis=1)
    return {'f1': f1_score(p.label_ids, preds)}

# Hold out part of the labeled data for the per-epoch evaluation. The test set
# has no labels, so it cannot be used to compute F1 / select the best model.
split_data = train_data.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    # `load_best_model_at_end` requires checkpoints to be saved on the same
    # schedule as evaluation; the default save strategy is step-based.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    learning_rate=5e-6,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_data['train'],
    eval_dataset=split_data['test'],
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on the unlabeled test set with the best checkpoint.
predictions = trainer.predict(test_data)

# `predictions.predictions` is a NumPy array of per-class scores.
pred_labels = np.argmax(predictions.predictions, axis=1)

# Keep the DataFrame itself (`to_csv` returns None when given a path) and
# write the predicted labels rather than the raw prediction output.
# NOTE(review): only a `predictions` column is written, as in the original;
# confirm whether the submission format also expects the file names.
submission = pd.DataFrame({'predictions': pred_labels})
submission.to_csv('submission.csv', index=False)

# `files` is not defined anywhere in this notebook (presumably it is meant to
# be `google.colab.files` - confirm). Skip the browser download when missing.
try:
    files.download('submission.csv')
except NameError:
    print("`files` is not defined; submission.csv was written locally only.")