In [1]:
import os
import pandas as pd
import torch
import torch_directml
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import get_scheduler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.metrics import f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch_directml.device()
device

device(type='privateuseone', index=0)

In [3]:
# Load labeled list of training files:
train_files = pd.read_csv('data/task2/train/labels.csv', index_col=0)
train_files['file'] = ['data/task2/train/' + s for s in train_files['file']]
test_files = ['data/task2/test/' + s for s in os.listdir('data/task2/test/')]
test_files.sort()
test_files = pd.DataFrame({'file': test_files})

In [9]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

class TextDataset(Dataset):
    def __init__(self, files, labels=None):
        self.texts = []
        self.labels = labels
        for path in files:
            with open(path, 'r', encoding='utf-8') as f:
                self.texts.append(f.read())

        self.encodings = tokenizer(self.texts, truncation=True, padding=True, return_tensors='pt', max_length=512)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = TextDataset(train_files['file'], train_files['label'].tolist())
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [10]:
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 4
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

best_f1 = 0
patience = 3
patience_counter = 0

In [11]:
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in DataLoader(train_dataset, batch_size=8):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            labels.extend(batch['labels'].cpu().numpy())

    f1 = f1_score(labels, preds)
    print(f"Epoch {epoch+1}, F1 Score: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

  6%|▌         | 88/1513 [00:52<14:12,  1.67it/s]


KeyboardInterrupt: 

In [8]:
predictions = []

# Predict:
for file in tqdm(test_files.file):
  with open(file, 'r', encoding="utf-8") as f:
    text = f.read()

  # Tokenize the text:
  inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
  inputs = {k: v.to(device) for k, v in inputs.items()}

  # Get the model predictions:
  with torch.no_grad():
    outputs = model(**inputs)

  # Get the predicted class:
  pred = outputs.logits.argmax(dim=1).cpu().numpy()[0]
  predictions.append(pred)
# Save predictions to CSV:
pd.DataFrame(predictions, columns=['predictions']).to_csv('submission.csv')

100%|██████████| 2000/2000 [00:15<00:00, 126.44it/s]
