# Fine Tune
- RoBERTa
- No need for an inference speed-up via DistilBERT, since the dataset is very small
- Hyperparameter tuning using Hugging Face's hyperparameter search
- Group k-fold cross-validation for prediction

## Several conditions:
- (spell corrected and) expanded prompts
- raw conversational part


In [2]:
import torch
# Use Apple's Metal (MPS) backend when PyTorch reports it as available; otherwise fall back to the CPU.
mps_available = torch.backends.mps.is_available()
print(mps_available)
device = torch.device("mps") if mps_available else torch.device("cpu")

True


In [3]:
import sqlite3
import pandas as pd

# Read the full expanded-prompt table into a DataFrame.
# try/finally guarantees the SQLite connection is released even if the query fails
# (e.g. the table is missing), instead of leaking the handle in the kernel.
conn = sqlite3.connect('../../../data/giicg.db')
try:
    all_prompts = pd.read_sql("Select * from expanded_roberta_prompts", conn)
finally:
    conn.close()

## Check data

In [4]:
# Count the distinct users within each gender group.
user_ids_by_gender = all_prompts.groupby('gender')['user_id']
users_per_gender = user_ids_by_gender.nunique().reset_index(name='num_users')
users_per_gender

Unnamed: 0,gender,num_users
0,Man (cisgender),15
1,Woman (cisgender),12


In [5]:
# Count the distinct messages contributed by each user.
message_ids_by_user = all_prompts.groupby('user_id')['message_id']
messages_per_user = message_ids_by_user.nunique().reset_index(name='num_messages')
messages_per_user

Unnamed: 0,user_id,num_messages
0,6,9
1,8,2
2,11,11
3,15,3
4,16,25
5,25,4
6,28,22
7,31,5
8,34,66
9,46,5


## Set up Model

In [16]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load the label mapping stored next to the prediction code (presumably label name -> class id;
# only its length is used in this cell).
# JSON is UTF-8 by specification, so read it with an explicit encoding rather than
# depending on the platform's default locale encoding.
with open("../../prediction/finetune/label2id.json", "r", encoding="utf-8") as f:
    label2id = json.load(f)

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# One classifier output per entry in the label mapping.
num_labels = len(label2id)

def model_init():
    """Build a fresh sequence-classification model for a new training run.

    Loads the pretrained ``model_name`` checkpoint and sizes the classification
    head to ``num_labels`` outputs.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    return fresh_model

# Batches are padded by this collator at collation time; tokenize_function
# deliberately leaves padding disabled for that reason.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    """Tokenize the ``conversational`` field of a batch of examples.

    Over-long inputs are truncated; no padding is applied here because the
    data collator pads each batch later.
    """
    conversational_texts = examples["conversational"]
    encoded = tokenizer(conversational_texts, truncation=True, padding=False)
    return encoded

def compute_metrics(pred):
    """Compute accuracy and support-weighted F1/precision/recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 produces the same numbers as scikit-learn's default ("warn"),
    # but without an UndefinedMetricWarning whenever a class is never predicted
    # or does not occur in the evaluated fold.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

## Check maximum sample length (in tokens)

In [17]:
texts = all_prompts['conversational'].tolist()

# Token count per prompt, special tokens included, i.e. the length the model actually sees.
token_counts = list(map(
    lambda prompt: len(tokenizer.encode(prompt, add_special_tokens=True)),
    texts,
))

max_tokens, min_tokens = max(token_counts), min(token_counts)
avg_tokens = sum(token_counts) / len(token_counts)

print(f"Max tokens: {max_tokens}")
print(f"Min tokens: {min_tokens}")


Max tokens: 407
Min tokens: 4


## Cross Validation

- Selected hyperparameters: learning rate 3.2e-5, batch size 8, 5 epochs

In [18]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from sklearn.model_selection import GroupKFold

# Group-aware k-fold cross-validation: the user id is the grouping key, so the
# prompts of one user are never split between a fold's training and validation set.
groups = all_prompts['user_id'].values
texts = all_prompts['conversational'].tolist()
labels = all_prompts['label'].tolist()
n_splits = 5  # e.g. 5-fold CV

gkf = GroupKFold(n_splits=n_splits)

# One metrics dict (as returned by trainer.evaluate()) per fold.
all_results = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(texts, labels, groups)):
    print(f"Fold {fold + 1}")

    train_prompts = all_prompts.iloc[train_idx]
    val_prompts = all_prompts.iloc[val_idx]

    train_dataset = Dataset.from_pandas(train_prompts[['conversational', 'label']])
    val_dataset = Dataset.from_pandas(val_prompts[['conversational', 'label']])

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # A new Trainer per fold; model_init makes every fold start from the
    # pretrained checkpoint instead of continuing from the previous fold.
    trainer = Trainer(
        model_init=model_init,
        args=TrainingArguments(
            output_dir=f"./finetune/cross_validation/run_4/fold_{fold+1}_results",
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir=f"./fold_{fold+1}_logs",
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            learning_rate=3.2e-5,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            logging_steps=50,
            logging_strategy="steps",
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # `tokenizer=` is deprecated on Trainer and triggers a warning at this call;
        # `processing_class=` is its replacement.
        processing_class=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    # NOTE(review): load_best_model_at_end picks the checkpoint by accuracy on
    # val_dataset, and the metrics below are measured on that same val_dataset,
    # so the per-fold scores are optimistic estimates.
    eval_metrics = trainer.evaluate()
    print(f"Fold {fold + 1} metrics:", eval_metrics)
    all_results.append(eval_metrics)

print(all_results)




Fold 1


Map: 100%|██████████| 453/453 [00:00<00:00, 66587.92 examples/s]
Map: 100%|██████████| 114/114 [00:00<00:00, 32898.77 examples/s]
  trainer = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6335,1.39664,0.324561,0.468703,0.9829,0.324561
2,0.4361,1.646291,0.350877,0.498677,0.982918,0.350877
3,0.313,2.516136,0.45614,0.611588,0.963846,0.45614
4,0.1341,2.321332,0.561404,0.704024,0.967456,0.561404
5,0.1499,3.167446,0.438596,0.594857,0.963081,0.438596




Fold 1 metrics: {'eval_loss': 2.3213324546813965, 'eval_accuracy': 0.5614035087719298, 'eval_f1': 0.7040240461293094, 'eval_precision': 0.9674561403508772, 'eval_recall': 0.5614035087719298, 'eval_runtime': 0.4168, 'eval_samples_per_second': 273.492, 'eval_steps_per_second': 35.986, 'epoch': 5.0}
Fold 2


Map: 100%|██████████| 454/454 [00:00<00:00, 64314.17 examples/s]
Map: 100%|██████████| 113/113 [00:00<00:00, 29260.18 examples/s]
  trainer = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6794,0.805562,0.353982,0.274157,0.668,0.353982
2,0.6675,0.632077,0.672566,0.623761,0.612031,0.672566
3,0.5431,0.856883,0.672566,0.6863,0.789035,0.672566
4,0.3403,1.003524,0.637168,0.653008,0.741009,0.637168
5,0.2301,1.11414,0.646018,0.661695,0.744885,0.646018




Fold 2 metrics: {'eval_loss': 0.6320773363113403, 'eval_accuracy': 0.672566371681416, 'eval_f1': 0.6237607411202007, 'eval_precision': 0.6120305456588643, 'eval_recall': 0.672566371681416, 'eval_runtime': 0.4662, 'eval_samples_per_second': 242.407, 'eval_steps_per_second': 32.178, 'epoch': 5.0}
Fold 3


Map: 100%|██████████| 453/453 [00:00<00:00, 59737.78 examples/s]
Map: 100%|██████████| 114/114 [00:00<00:00, 31113.40 examples/s]
  trainer = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6727,0.969304,0.45614,0.472962,0.697931,0.45614
2,0.461,1.133397,0.578947,0.607909,0.713605,0.578947
3,0.258,2.107358,0.526316,0.554896,0.703083,0.526316
4,0.2455,2.748728,0.517544,0.545838,0.699297,0.517544
5,0.1017,2.864187,0.535088,0.56386,0.706794,0.535088




Fold 3 metrics: {'eval_loss': 1.1333969831466675, 'eval_accuracy': 0.5789473684210527, 'eval_f1': 0.6079093902460049, 'eval_precision': 0.7136051335234638, 'eval_recall': 0.5789473684210527, 'eval_runtime': 0.3635, 'eval_samples_per_second': 313.616, 'eval_steps_per_second': 41.265, 'epoch': 5.0}
Fold 4


Map: 100%|██████████| 453/453 [00:00<00:00, 53884.45 examples/s]
Map: 100%|██████████| 114/114 [00:00<00:00, 36891.49 examples/s]
  trainer = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6817,0.599319,0.710526,0.695902,0.760671,0.710526
2,0.4979,0.548933,0.763158,0.760932,0.773338,0.763158
3,0.2891,0.867413,0.780702,0.776838,0.801587,0.780702
4,0.2508,1.210691,0.780702,0.776838,0.801587,0.780702
5,0.1944,1.316027,0.754386,0.75317,0.759498,0.754386




Fold 4 metrics: {'eval_loss': 0.8674131035804749, 'eval_accuracy': 0.7807017543859649, 'eval_f1': 0.7768381489311722, 'eval_precision': 0.8015873015873016, 'eval_recall': 0.7807017543859649, 'eval_runtime': 0.3534, 'eval_samples_per_second': 322.549, 'eval_steps_per_second': 42.441, 'epoch': 5.0}
Fold 5


Map: 100%|██████████| 455/455 [00:00<00:00, 58543.72 examples/s]
Map: 100%|██████████| 112/112 [00:00<00:00, 30826.30 examples/s]
  trainer = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.642,1.383812,0.196429,0.276603,0.906995,0.196429
2,0.5021,1.107049,0.526786,0.655293,0.909306,0.526786
3,0.2964,1.641585,0.517857,0.647657,0.908464,0.517857
4,0.3293,3.34346,0.401786,0.539513,0.894404,0.401786
5,0.0866,3.331898,0.428571,0.566042,0.898283,0.428571




Fold 5 metrics: {'eval_loss': 1.1070492267608643, 'eval_accuracy': 0.5267857142857143, 'eval_f1': 0.6552927978629208, 'eval_precision': 0.9093063186813186, 'eval_recall': 0.5267857142857142, 'eval_runtime': 0.4726, 'eval_samples_per_second': 236.99, 'eval_steps_per_second': 29.624, 'epoch': 5.0}
[{'eval_loss': 2.3213324546813965, 'eval_accuracy': 0.5614035087719298, 'eval_f1': 0.7040240461293094, 'eval_precision': 0.9674561403508772, 'eval_recall': 0.5614035087719298, 'eval_runtime': 0.4168, 'eval_samples_per_second': 273.492, 'eval_steps_per_second': 35.986, 'epoch': 5.0}, {'eval_loss': 0.6320773363113403, 'eval_accuracy': 0.672566371681416, 'eval_f1': 0.6237607411202007, 'eval_precision': 0.6120305456588643, 'eval_recall': 0.672566371681416, 'eval_runtime': 0.4662, 'eval_samples_per_second': 242.407, 'eval_steps_per_second': 32.178, 'epoch': 5.0}, {'eval_loss': 1.1333969831466675, 'eval_accuracy': 0.5789473684210527, 'eval_f1': 0.6079093902460049, 'eval_precision': 0.7136051335234638

In [19]:
# Summarise the per-fold metrics and export the summary table as LaTeX.
results = pd.DataFrame(all_results)
stats = results.describe()
with open("stats.tex", "w") as tex_file:
    tex_file.write(stats.to_latex())


In [20]:
# Display the describe() summary of the per-fold evaluation metrics.
stats

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,1.212254,0.624081,0.673565,0.800797,0.624081,0.4145,277.8108,36.2988,5.0
std,0.652469,0.10285,0.068407,0.143862,0.10285,0.055651,39.43963,5.568248,0.0
min,0.632077,0.526786,0.607909,0.612031,0.526786,0.3534,236.99,29.624,5.0
25%,0.867413,0.561404,0.623761,0.713605,0.561404,0.3635,242.407,32.178,5.0
50%,1.107049,0.578947,0.655293,0.801587,0.578947,0.4168,273.492,35.986,5.0
75%,1.133397,0.672566,0.704024,0.909306,0.672566,0.4662,313.616,41.265,5.0
max,2.321332,0.780702,0.776838,0.967456,0.780702,0.4726,322.549,42.441,5.0


In [22]:
# NOTE(review): `trainer` is whatever the last iteration of the cross-validation loop
# left bound (fold 5 when the loop ran to completion), so the weights written to
# "finetune/best_model" come from that fold rather than from the best-scoring fold.
# Confirm this is intended.
trainer.save_model("finetune/best_model")

## Get Validation set from best fold (4th)

In [10]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from sklearn.model_selection import GroupKFold

# Rebuild the same group-aware split as in the cross-validation run.
groups = all_prompts['user_id'].values
texts = all_prompts['conversational'].tolist()
labels = all_prompts['label'].tolist()
n_splits = 5

gkf = GroupKFold(n_splits=n_splits)

# Walk the folds and stop at the 4th one (zero-based index 3).
numbered_splits = enumerate(gkf.split(texts, labels, groups))
fold, (train_idx, val_idx) = next(split for split in numbered_splits if split[0] == 3)

# Keep only the text and label columns of that fold's validation rows.
val_prompts_4th = all_prompts.iloc[val_idx]
val_dataset_4th = val_prompts_4th[['conversational', 'label']]
val_dataset_4th


Unnamed: 0,conversational,label
48,what is the best way to encode and compress a ...,0
49,"does lzstring also work in the browser, client...",0
50,Is there a way in typescript to cast to a type...,0
51,I would like to distill a type based on an inc...,0
150,can you create Photoshop Scripts?,0
...,...,...
412,can you update directly the folder iterative c...,1
413,"super, now can you give me latex formula for t...",1
414,yes latex code please,1
515,I am working on the problem of reconstruc...,0


## save to db

In [13]:
# Persist the 4th-fold validation rows, replacing any existing "validation_set" table.
# The connection is closed in `finally` so the database handle is not left open
# in the kernel, even when the write fails.
conn = sqlite3.connect('../../../data/giicg.db')
try:
    rows_written = val_dataset_4th.to_sql("validation_set", conn, if_exists="replace", index=False)
finally:
    conn.close()
# Show the value returned by to_sql, as the original cell did.
rows_written

114