# Fine Tune
- RoBERTa
- No need to speed up inference with DistilBERT, since the dataset is very small
- Hyperparameter tuning using Hugging Face's hyperparameter search
- Group k-fold cross-validation (grouped by user) for prediction

## Several conditions:
- (spell corrected and) expanded prompts
- raw conversational part


In [22]:
import torch

# Probe the MPS backend once, show the result, and fall back to the CPU otherwise.
mps_available = torch.backends.mps.is_available()
print(mps_available)
device = torch.device("mps") if mps_available else torch.device("cpu")

True


In [23]:
import sqlite3
from contextlib import closing

import pandas as pd

# Read the whole `expanded_roberta_prompts` table into a DataFrame.
# `closing` guarantees the connection is closed even if the query raises;
# sqlite3's own context manager only commits/rolls back and does not close.
with closing(sqlite3.connect('../../giicg.db')) as conn:
    all_prompts = pd.read_sql("Select * from expanded_roberta_prompts", conn)

## Check data

In [24]:
# Number of distinct `user_id` values within each `gender` group.
users_per_gender = (
    all_prompts
    .groupby('gender')['user_id']
    .nunique()
    .reset_index(name='num_users')
)
users_per_gender

Unnamed: 0,gender,num_users
0,Man (cisgender),15
1,Woman (cisgender),12


In [25]:
# Number of distinct `message_id` values contributed by each `user_id`.
messages_per_user = (
    all_prompts
    .groupby('user_id')['message_id']
    .nunique()
    .reset_index(name='num_messages')
)
messages_per_user

Unnamed: 0,user_id,num_messages
0,6,9
1,8,2
2,11,11
3,15,3
4,16,25
5,25,4
6,28,22
7,31,5
8,34,66
9,46,5


## Check max sample length (in tokens)

In [26]:
# Token-length statistics for the `conversational` texts.
# NOTE(review): this cell reads `prompts` and `tokenizer`, neither of which is
# defined in a visible cell above this one (`tokenizer` is created in the
# "Set up Model" cell further down) — run that cell first or move this check
# below it, and confirm where `prompts` is built.
texts = prompts['conversational'].tolist()
if not texts:
    raise ValueError("`prompts` contains no texts to tokenize")

# add_special_tokens=True so each count includes the tokenizer's special tokens.
token_counts = [len(tokenizer.encode(text, add_special_tokens=True)) for text in texts]

max_tokens = max(token_counts)
min_tokens = min(token_counts)
avg_tokens = sum(token_counts) / len(token_counts)

print(f"Max tokens: {max_tokens}")
print(f"Min tokens: {min_tokens}")
print(f"Avg tokens: {avg_tokens:.1f}")


Max tokens: 407
Min tokens: 4


## Set up Model

In [27]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Checkpoint name shared by the tokenizer and the classifier.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label mapping is read from disk; its size fixes the number of output classes.
with open("finetune/label2id.json") as label_file:
    label2id = json.load(label_file)
num_labels = len(label2id)

def model_init():
    """Load the `model_name` checkpoint as a sequence classifier with `num_labels` outputs.

    Handed to `Trainer(model_init=...)` so the Trainer can build the model itself.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    return classifier

# Padding is applied by this collator, not at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    """Tokenize the "conversational" field of `examples` with truncation and no padding."""
    encoded = tokenizer(
        examples["conversational"],
        padding=False,  # padding is left to the data collator
        truncation=True,
    )
    return encoded

def compute_metrics(pred):
    """Compute accuracy and weighted-average F1 / precision / recall for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores; the argmax over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0 yields the same numbers as scikit-learn's default
    # ("warn" also scores an undefined metric as 0) but does not emit a warning
    # on every evaluation when a class has no true or no predicted samples.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

## Cross Validation

- Selected hyperparameters: learning rate 3.2e-5, batch size 8, 10 epochs (note: the training cell below currently sets `num_train_epochs=5`)

In [28]:
import gc

import torch
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from sklearn.model_selection import GroupKFold


def _release_accelerator_memory():
    """Run garbage collection, then ask PyTorch to release its cached MPS/CUDA memory."""
    gc.collect()
    if torch.backends.mps.is_available() and hasattr(torch, "mps"):
        torch.mps.empty_cache()
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()


# Grouped k-fold CV with `user_id` as the group key, so the rows of one user
# are never split between the training and validation side of a fold.
# NOTE(review): `prompts` is not defined in any visible cell above — confirm where it is built.
# NOTE(review): the best checkpoint is chosen on the same validation fold whose
# metrics are reported below, so the per-fold scores are optimistic estimates.
groups = prompts['user_id'].values
texts = prompts['conversational'].tolist()
labels = prompts['label'].tolist()
n_splits = 5  # e.g. 5-fold CV

gkf = GroupKFold(n_splits=n_splits)

all_results = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(texts, labels, groups)):
    print(f"Fold {fold + 1}")

    # Drop the previous trainer (from the last fold or an earlier run of this
    # cell) before a new model is loaded; the recorded run of this cell ended
    # with an MPS out-of-memory error.
    trainer = None
    _release_accelerator_memory()

    train_prompts = prompts.iloc[train_idx]
    val_prompts = prompts.iloc[val_idx]

    train_dataset = Dataset.from_pandas(train_prompts[['conversational', 'label']])
    val_dataset = Dataset.from_pandas(val_prompts[['conversational', 'label']])

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # Evaluate and checkpoint once per epoch; the checkpoint with the best
    # validation accuracy is reloaded when training ends.
    training_args = TrainingArguments(
        output_dir=f"./second_run/fold_{fold+1}_results",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f"./fold_{fold+1}_logs",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        learning_rate=3.2e-5,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_steps=50,
        logging_strategy="steps",
    )

    # model_init gives every fold its own freshly loaded model.
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_metrics = trainer.evaluate()
    print(f"Fold {fold + 1} metrics:", eval_metrics)
    all_results.append(eval_metrics)

print(all_results)




Fold 1


Map: 100%|██████████| 453/453 [00:00<00:00, 25372.50 examples/s]
Map: 100%|██████████| 114/114 [00:00<00:00, 23944.65 examples/s]
  trainer = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6335,1.412525,0.324561,0.468703,0.9829,0.324561
2,0.4341,1.723869,0.342105,0.48882,0.982912,0.342105
3,0.3073,2.339745,0.491228,0.64386,0.965215,0.491228




RuntimeError: MPS backend out of memory (MPS allocated: 8.46 GiB, other allocations: 9.67 GiB, max allowed: 18.13 GiB). Tried to allocate 38.16 MiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [28]:
# One row per entry of `all_results`; describe() summarizes every metric column across them.
# NOTE(review): the recorded table covers 5 folds, while the recorded run of the
# cross-validation cell stopped with an error during fold 1 — presumably this
# output comes from an earlier run; re-run both cells to confirm.
results = pd.DataFrame(data=all_results)
results.describe()


Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,1.387722,0.636502,0.69652,0.834506,0.636502,0.41684,278.3538,36.3542,5.0
std,0.52683,0.093424,0.060681,0.104036,0.093424,0.068596,48.261568,6.589979,0.0
min,0.817659,0.5625,0.607909,0.713605,0.5625,0.3332,224.335,29.779,5.0
25%,1.133371,0.578947,0.684969,0.770444,0.578947,0.365,245.91,30.739,5.0
50%,1.154529,0.578947,0.695783,0.808108,0.578947,0.4268,267.093,35.144,5.0
75%,1.685343,0.681416,0.718226,0.912435,0.681416,0.4555,312.31,41.093,5.0
max,2.147707,0.780702,0.775714,0.967936,0.780702,0.5037,342.121,45.016,5.0
