# Fine Tune
- RoBERTa
- No need for inference speed up using distil bert since dataset is very small
- Hyperparameter tuning using huggingfaces hyperparameter search
- group k fold cross validation for prediction

## Several conditions:
- (spell corrected and) expanded prompts
- raw conversational part


In [38]:
import torch
print(torch.backends.mps.is_available())
device = torch.device("cpu")

True


In [39]:
from helpers.normalization import remove_newlines
import sqlite3
import pandas as pd

conn  = sqlite3.connect('../../data/giicg.db')
all_prompts = pd.read_sql("Select * from expanded_roberta_prompts", conn)
all_prompts['conversational'] = all_prompts['conversational'].apply(remove_newlines)
conn.close()

## Check data

In [40]:
users_per_gender = all_prompts.groupby('gender')['user_id'].nunique().reset_index(name='num_users')
users_per_gender

Unnamed: 0,gender,num_users
0,Man (cisgender),15
1,Woman (cisgender),12


In [41]:
messages_per_user = all_prompts.groupby('user_id')['message_id'].nunique().reset_index(name='num_messages')
messages_per_user

Unnamed: 0,user_id,num_messages
0,6,9
1,8,2
2,11,11
3,15,3
4,16,25
5,25,4
6,28,22
7,31,5
8,34,66
9,46,5


## Combine prompts per user

In [42]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

user_prompts = (
    all_prompts.groupby(['user_id', 'gender', 'label'])['conversational']
    .apply('\n '.join)
    .reset_index()    # Reset index to create a DataFrame
)

user_prompts.columns = ['user_id', 'gender', 'label', 'combined_prompts']
user_prompts = user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
user_prompts

Unnamed: 0,index,user_id,gender,label,combined_prompts
0,0,6,Man (cisgender),0,"parsing data from python iterator, how it coul..."
1,1,8,Man (cisgender),0,I am working on the problem of reconstruc...
2,2,11,Woman (cisgender),1,Can you adapt the following code so that inste...
3,3,15,Man (cisgender),0,SET_ALL_TABLES action is currently not fetchin...
4,4,16,Woman (cisgender),1,I want to use Dummy Hot encoding to replace th...
5,5,25,Man (cisgender),0,what is the best way to encode and compress a ...
6,6,28,Woman (cisgender),1,I have a pandas dataframe like this: I want t...
7,7,31,Man (cisgender),0,How can I make use of an ObservableHQDatabaseC...
8,8,34,Man (cisgender),0,Blender and Python. I have a collection of hun...
9,9,46,Man (cisgender),0,"how to run a Python future without blocking, i..."


In [43]:
print(user_prompts[['combined_prompts', 'label']].head())
print(user_prompts[['combined_prompts']].nunique())


                                    combined_prompts  label
0  parsing data from python iterator, how it coul...      0
1       I am working on the problem of reconstruc...      0
2  Can you adapt the following code so that inste...      1
3  SET_ALL_TABLES action is currently not fetchin...      0
4  I want to use Dummy Hot encoding to replace th...      1
combined_prompts    27
dtype: int64


In [44]:
print(user_prompts['label'].dtype)


int64


## Set up Model

In [45]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

with open("finetune/label2id.json", "r") as f:
    label2id = json.load(f)

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(label2id)

config = AutoConfig.from_pretrained(
    model_name,
    hidden_dropout_prob=0.3,        # Increase from 0.1 to 0.3+
    attention_probs_dropout_prob=0.3
)

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(
        examples["combined_prompts"],
        truncation=True,
        max_length=512, # trncate to max sample size to avoid index errors
        padding=False # padding is handled in the data collator
    )

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## Check max sample size

In [46]:
texts = user_prompts['combined_prompts'].tolist()

token_counts = [len(tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)) for text in texts]

max_tokens = max(token_counts)
min_tokens = min(token_counts)
avg_tokens = sum(token_counts) / len(token_counts)

print(f"Max tokens: {max_tokens}")
print(f"Min tokens: {min_tokens}")


Max tokens: 512
Min tokens: 34


## Cross Validation

- selected hyperparameters: lr 3.2e-5, batchsizes 8, epochs 5

In [47]:
from sklearn.model_selection import KFold, StratifiedKFold
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset

texts = user_prompts['combined_prompts'].tolist()
labels = user_prompts['label'].tolist()
n_splits = 5  # e.g. 5-fold CV

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)



all_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"Fold {fold + 1}")

    train_prompts = user_prompts.iloc[train_idx]
    val_prompts = user_prompts.iloc[val_idx]
    print("train prompts: ", train_prompts)
    print("val prompts: ",val_prompts)

    train_dataset = Dataset.from_pandas(train_prompts[['combined_prompts', 'label']])
    val_dataset = Dataset.from_pandas(val_prompts[['combined_prompts', 'label']])

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    print("  Train label counts:", train_prompts['label'].value_counts().to_dict())
    print("  Val   label counts:", val_prompts['label'].value_counts().to_dict())

    # Set up Trainer with model/tokenizer/data_collator as before
    trainer = Trainer(
        model_init=model_init,
        args=TrainingArguments(
            output_dir=f"./finetune/cross_validation/run_1/fold_{fold+1}_results",
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir=f"./fold_{fold+1}_logs",
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            learning_rate=1e-5,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            logging_steps=1,
            logging_strategy="steps",
            weight_decay=0.05,
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_metrics = trainer.evaluate()
    print(f"Fold {fold + 1} metrics:", eval_metrics)
    all_results.append(eval_metrics)

print(all_results)




Fold 1
train prompts:      index  user_id             gender  label  \
0       0        6    Man (cisgender)      0   
1       1        8    Man (cisgender)      0   
2       2       11  Woman (cisgender)      1   
4       4       16  Woman (cisgender)      1   
5       5       25    Man (cisgender)      0   
7       7       31    Man (cisgender)      0   
8       8       34    Man (cisgender)      0   
11     11       48  Woman (cisgender)      1   
12     12       55  Woman (cisgender)      1   
13     13       56    Man (cisgender)      0   
14     14       60  Woman (cisgender)      1   
16     16       65  Woman (cisgender)      1   
17     17       73  Woman (cisgender)      1   
18     18       77    Man (cisgender)      0   
20     20       81    Man (cisgender)      0   
21     21       83    Man (cisgender)      0   
22     22       88    Man (cisgender)      0   
23     23       89  Woman (cisgender)      1   
24     24       90  Woman (cisgender)      1   
25     25       9

Map: 100%|██████████| 21/21 [00:00<00:00, 3603.94 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 1680.19 examples/s]

  Train label counts: {0: 12, 1: 9}
  Val   label counts: {0: 3, 1: 3}



  trainer = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6599,0.696399,0.5,0.333333,0.25,0.5
2,0.713,0.696399,0.5,0.333333,0.25,0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG16XFamilyCommandBuffer: 0x15b4ab0e0>
    label = <none> 
    device = <AGXG16SDevice: 0x1098f5200>
        name = Apple M4 Pro 
    commandQueue = <AGXG16XFamilyCommandQueue: 0x105179400>
        label = <none> 
        device = <AGXG16SDevice: 0x1098f5200>
            name = Apple M4 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG16XFamilyCommandBuffer: 0x15c326ad0>
    label = <none> 
    device = <AGXG16SDevice: 0x1098f5200>
        name = Apple M4 Pro 
    commandQueue = <AGX

KeyboardInterrupt: 

In [None]:
results = pd.DataFrame(all_results)
stats = results.describe()
with open("roberta_per_user_stats.tex", "w") as f:
    stats.to_latex(f)


In [None]:
stats

In [None]:
trainer.save_model("finetune/best_model_per_user")

In [None]:
few = user_prompts.sample(8, random_state=42)
train_ds = Dataset.from_pandas(few[['combined_prompts', 'label']])
train_ds = train_ds.map(tokenize_function, batched=True)
trainer = Trainer(
    model=model_init(),
    args=TrainingArguments(output_dir="./tmp", per_device_train_batch_size=2, num_train_epochs=30, logging_steps=1),
    train_dataset=train_ds,
    eval_dataset=train_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
print(trainer.evaluate())