In [1]:
!pip install datasets
!pip install peft transformers

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 474.3/474.3 kB 3.0 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 3.9 MB/s eta 0:00:00
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)

In [2]:
from datasets import load_dataset
import pandas as pd

In [3]:
# Load the QEvasion dataset from the Hugging Face Hub
ds = load_dataset("ailsntua/QEvasion")

# Fine-grained annotation label -> clarity class
# 0 -> Clear Reply
# 1 -> Ambivalent Reply
# 2 -> Clear Non-Reply
label_dict = {'1.1 Explicit': 0,
              '1.2 Implicit': 1,
              '2.1 Dodging': 1,
              '2.4 General': 1,
              '2.2 Deflection': 1,
              '2.3 Partial/half-answer': 1,
              '2.6 Declining to answer': 2,
              '2.7 Claims ignorance': 2,
              '2.8 Clarification': 2}

# Convert the train split to pandas and keep only the useful columns
df_train = ds["train"].to_pandas()[["question", "interview_answer",
                                    "label", "annotator_id"]]

# Series.map yields NaN for every value that is not a key of label_dict, which
# would silently turn the label column into floats. Fail here, with the
# offending values, instead of much later inside the training loop.
mapped_labels = df_train["label"].map(label_dict)
if mapped_labels.isna().any():
    unknown_labels = df_train.loc[mapped_labels.isna(), "label"].unique().tolist()
    raise ValueError(f"Labels without an entry in label_dict: {unknown_labels}")

# Build the final frame in one step (no in-place column assignment):
#   label -> integer clarity class
#   sQA   -> question and answer joined by a single space
df_train = df_train.assign(
    label=mapped_labels.astype("int64"),
    sQA=df_train["question"] + " " + df_train["interview_answer"],
)

# Create dictionary with key as the annotator and value the DataFrame with
# only the corresponding sQAs. Each group keeps the row index of df_train.
split_dfs = {
    category: group[["sQA", "label"]]
    for category, group in df_train.groupby('annotator_id')
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Small working subset taken from the rows of annotator 85.
# NOTE(review): .loc slices by index *label* and includes both end points, and
# the groups in split_dfs keep the row index of df_train. This therefore selects
# annotator 85's rows whose original index lies in [0, 100], not "the first 100
# rows" -- confirm that this is the intended subset.
temp_train_set = split_dfs[85].loc[0:100]
temp_train_set

Unnamed: 0,sQA,label
0,How would you respond to the accusation that t...,0
1,Do you think President Xi is being sincere abo...,1
2,1. Q1: Do you believe the country's slowdown a...,1
3,2. Q2: Are you worried about the meeting betwe...,1
4,Is the President's engagement with Asian coun...,0
5,Is there a danger of a cold war? \nThe Presid...,1
6,When will the President meet Mr. Xi? \nThe Pre...,1
7,1. How concerned are you about this lack of co...,1
8,1. Concerns about the lack of communication be...,0
9,2. Inquiry about the reaction of Kyiv regardin...,0


In [5]:
from transformers import RobertaTokenizer
from torch.utils.data import Dataset
import torch

In [6]:
def tokenize_dataset(df, tokenizer, max_length=512):
    """Tokenize the ``sQA`` column of ``df`` and bundle it with its labels.

    Args:
        df: DataFrame with an ``sQA`` text column and a ``label`` column
            holding integer class ids.
        tokenizer: Callable that accepts a list of strings plus the keyword
            arguments ``max_length``, ``return_attention_mask``,
            ``return_tensors``, ``padding`` and ``truncation`` and returns a
            mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        dict with the keys ``'input_ids'``, ``'attention_masks'`` (note the
        plural) and ``'labels'``; ``'labels'`` is always a ``torch.long``
        tensor.

    Raises:
        ValueError: If the ``label`` column contains missing values.
    """
    # Validate before tokenizing so a bad frame fails fast.
    if df['label'].isna().any():
        raise ValueError("df['label'] contains missing values; "
                         "every row needs an integer class id")

    tokenized_texts = tokenizer(
        df['sQA'].tolist(),
        max_length=max_length,
        return_attention_mask=True,
        return_tensors='pt',
        padding='max_length',
        truncation=True
    )

    # Force int64: a float or object label column would otherwise produce a
    # non-long tensor, which classification losses do not accept as targets.
    labels = torch.tensor(df['label'].astype('int64').to_numpy(),
                          dtype=torch.long)

    return {
        'input_ids': tokenized_texts['input_ids'],
        'attention_masks': tokenized_texts['attention_mask'],
        'labels': labels
    }

class CostumDataset(Dataset):
    """Map-style dataset over already-tokenized inputs.

    Wraps a dict with the keys ``'input_ids'``, ``'attention_masks'`` and
    ``'labels'`` (the layout returned by ``tokenize_dataset``) and serves one
    example per index.
    """

    def __init__(self, dataset):
        """Store the three entries of ``dataset`` as attributes."""
        self.input_ids, self.attention_masks, self.labels = (
            dataset[key] for key in ('input_ids', 'attention_masks', 'labels')
        )

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict.

        The mask is stored under ``attention_masks`` but exposed here under
        the singular key ``'attention_mask'``.
        """
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.labels[idx],
        )

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# Seed PyTorch before anything random happens (new classifier head, LoRA
# weights, dropout, DataLoader shuffling) so that runs are comparable.
SEED = 42
torch.manual_seed(SEED)

# Load pre-trained Roberta model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,     # Task type: sequence classification
    r=8,                            # Rank of the low-rank adaptation
    lora_alpha=16,                  # Scaling factor
    lora_dropout=0.1,               # Dropout probability for LoRA layers
    # Alternative tried earlier: target_modules=["query", "value"]
    target_modules=["dense", "out_proj"]
)

# Add LoRA to the model
model = get_peft_model(model, lora_config)

# Report how many parameters are actually trainable after wrapping
model.print_trainable_parameters()

# Define the dataset and data loader.
# The dataset yields dicts with 'input_ids', 'attention_mask' and 'labels'.
# Reuse the tokenizer loaded above instead of loading it a second time.
dataset = CostumDataset(tokenize_dataset(temp_train_set, tokenizer))
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# Move model to the appropriate device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimise only the parameters that are still trainable
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
# NOTE: the loop below uses the loss the model returns when `labels` is
# passed; `loss_fn` is not used by it.
loss_fn = nn.CrossEntropyLoss()

# Training loop
NUM_EPOCHS = 5  # adjust the number of epochs as needed
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # One line per epoch: mean loss over the batches of this epoch
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')

# Training is finished: switch to evaluation mode once, after the last epoch
model.eval()

# Save the model with LoRA adapters
model.save_pretrained("roberta-lora")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
