# Fine Tune
- RoBERTa
- No need for inference speed up using distil bert since dataset is very small
- Hyperparameter tuning using huggingfaces hyperparameter search
- group k fold cross validation for prediction

## Several conditions:
- (spell corrected and) expanded prompts
- raw conversational part


In [15]:
import torch
print(torch.backends.mps.is_available())
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

True


In [16]:
import sqlite3
import pandas as pd

conn  = sqlite3.connect('../../giicg.db')
all_prompts = pd.read_sql("Select * from expanded_roberta_prompts", conn)
conn.close()

## Check data

In [17]:
users_per_gender = all_prompts.groupby('gender')['user_id'].nunique().reset_index(name='num_users')
users_per_gender

Unnamed: 0,gender,num_users
0,Man (cisgender),15
1,Woman (cisgender),12


In [18]:
messages_per_user = all_prompts.groupby('user_id')['message_id'].nunique().reset_index(name='num_messages')
messages_per_user

Unnamed: 0,user_id,num_messages
0,6,9
1,8,2
2,11,11
3,15,3
4,16,25
5,25,4
6,28,22
7,31,5
8,34,66
9,46,5


## Combine prompts per user

In [19]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

user_prompts = (
    all_prompts.groupby(['user_id', 'gender', 'label'])['conversational']
    .apply(' '.join)
    .reset_index()    # Reset index to create a DataFrame
)

user_prompts.columns = ['user_id', 'gender', 'label', 'combined_prompts']
user_prompts = user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
user_prompts['combined_prompts'] = user_prompts['combined_prompts'].apply(remove_punctuation_and_newlines)
user_prompts['combined_prompts'] = user_prompts['combined_prompts'].apply(remove_capitalization)

user_prompts

Unnamed: 0,index,user_id,gender,label,combined_prompts
0,0,6,Man (cisgender),0,parsing data from python iterator how it could...
1,1,8,Man (cisgender),0,i am working on the problem of reconstruc...
2,2,11,Woman (cisgender),1,can you adapt the following code so that inste...
3,3,15,Man (cisgender),0,setalltables action is currently not fetching ...
4,4,16,Woman (cisgender),1,i want to use dummy hot encoding to replace th...
5,5,25,Man (cisgender),0,what is the best way to encode and compress a ...
6,6,28,Woman (cisgender),1,i have a pandas dataframe like this i want to...
7,7,31,Man (cisgender),0,how can i make use of an observablehqdatabasec...
8,8,34,Man (cisgender),0,blender and python i have a collection of hund...
9,9,46,Man (cisgender),0,how to run a python future without blocking ie...


## Set up Model

In [22]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

with open("finetune/label2id.json", "r") as f:
    label2id = json.load(f)

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(label2id)

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(
        examples["combined_prompts"],
        truncation=True,
        max_length=512, # trncate to max sample size to avoid index errors
        padding=False # padding is handled in the data collator
    )

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## Check max sample size

In [24]:
texts = user_prompts['combined_prompts'].tolist()

token_counts = [len(tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)) for text in texts]

max_tokens = max(token_counts)
min_tokens = min(token_counts)
avg_tokens = sum(token_counts) / len(token_counts)

print(f"Max tokens: {max_tokens}")
print(f"Min tokens: {min_tokens}")


Max tokens: 512
Min tokens: 31


## Cross Validation

- selected hyperparameters: lr 3.2e-5, batchsizes 8, epochs 5

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from sklearn.model_selection import GroupKFold

groups = user_prompts['user_id'].values
texts = user_prompts['combined_prompts'].tolist()
labels = user_prompts['label'].tolist()
n_splits = 5  # e.g. 5-fold CV

gkf = GroupKFold(n_splits=n_splits)

all_results = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(texts, labels, groups)):
    print(f"Fold {fold + 1}")

    train_prompts = user_prompts.iloc[train_idx]
    val_prompts = user_prompts.iloc[val_idx]

    train_dataset = Dataset.from_pandas(train_prompts[['combined_prompts', 'label']])
    val_dataset = Dataset.from_pandas(val_prompts[['combined_prompts', 'label']])

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # Set up Trainer with model/tokenizer/data_collator as before
    trainer = Trainer(
        model_init=model_init,
        args=TrainingArguments(
            output_dir=f"./finetune/cross_validation/run_1/fold_{fold+1}_results",
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir=f"./fold_{fold+1}_logs",
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            learning_rate=3.2e-5,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            logging_steps=50,
            logging_strategy="steps",
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_metrics = trainer.evaluate()
    print(f"Fold {fold + 1} metrics:", eval_metrics)
    all_results.append(eval_metrics)

print(all_results)




Fold 1


Map: 100%|██████████| 21/21 [00:00<00:00, 677.31 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 1179.72 examples/s]
  trainer = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG13GFamilyCommandBuffer: 0x174f42d20>
    label = <none> 
    device = <AGXG13GDevice: 0x1090acc00>
        name = Apple M1 
    commandQueue = <AGXG13GFamilyCommandQueue: 0x140c79200>
        label = <none> 
        device = <AGXG13GDevice: 0x1090acc00>
            name = Apple M1 
    retainedReferences = 1


In [None]:
results = pd.DataFrame(all_results)
stats = results.describe()
with open("roberta_per_user_stats.tex", "w") as f:
    stats.to_latex(f)


In [None]:
stats

In [None]:
trainer.save_model("finetune/best_model_per_user")