# Hyperparameter Search

- optimizing for accuracy since classes are balanced (12:15)

first search run:
- learning_rate between  5e-6, 5e-5, log=True
- num_train_epochs between 2, 5
- per_device_train_batch_size between 4, 8
- per_device_eval_batch_size 4, 8
- Best hyperparameters: {'learning_rate': 1.752433329903465e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 8}
- Best eval accuracy: 0.6153846153846154

second search run:
- batch sizes 8
- epochs 5
- learning rate between 5e-6, 2e-5
- Best hyperparameters: {'learning_rate': 1.2308237496976495e-05}
- Best eval accuracy: 0.6837606837606838

third run:
- batch sizes 4
- learning rate between 5e-6, 2e-5
- Best hyperparameters: {'learning_rate': 1.2665150015950181e-05}
- Best eval accuracy: 0.6239316239316239

fourth run:
- batch sizes 8
- learning_rate between 1e-5, 3e-5
- highest accuracy 0.726496
- learning_rate: 2.8213598460702224e-05

fifth run:
- batch sizes 8
- learning_rate between 3e-5, 4e-5
- highest accuracy 0.760684
- learning_rate: 3.035495167103403e-05

sixth run:
- batch sizes 8
- learning_rate between 2.5e-5, 3.5e-5
- highest accuracy 0.803419	at 3.20605942472665e-05 at epoch 3
- also good: 0.77777 at 3.2759208826863756e-05 at epoch 3
- 0.726496 at  2.9592151393562346e-05 at epoch 3
- 0.752137	3.443498945690748e-05 at epoch 3

7th run:
- batch sizes 8
- learning_rate between 3.1e-5, 3.4e-5
- highest accuracy 0.752137	at 3.246309190194653e-05 at epoch 3
- and at 3.186004390546374e-05 at epoch 2

8th run:
- batch sizes 8
- learning_rate between 1e-5, 1.5e-5
- highest accuracy 0.69	at 1.25e-5 at epoch 5


In [None]:
import torch
import sqlite3
import pandas as pd


# Report whether the MPS backend is available and select the device to match
# (falls back to the CPU otherwise).
mps_available = torch.backends.mps.is_available()
print(mps_available)
device = torch.device("mps" if mps_available else "cpu")

# Read the whole expanded_roberta_prompts table into a DataFrame. The
# connection is closed in `finally` so it is released even if the query fails.
conn = sqlite3.connect('../../giicg.db')
try:
    prompts = pd.read_sql("Select * from expanded_roberta_prompts", conn)
finally:
    conn.close()
prompts

## Build dataset
- group-aware split: prompts from the same user never occur in both sets
- build the dataset in Hugging Face format

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from datasets import Dataset

# Split by user_id so that all prompts written by one user end up on the same
# side of the train/validation split (train_size=0.8, fixed seed).
groups = prompts['user_id']
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

# n_splits=1, so the first (train, validation) index pair is the one used.
train_idx, val_idx = next(gss.split(prompts, groups=groups))
train_prompts, val_prompts = prompts.iloc[train_idx], prompts.iloc[val_idx]

# Only the text column and its label are carried over into the datasets.
train_dataset = Dataset.from_pandas(train_prompts[['conversational', 'label']])
val_dataset = Dataset.from_pandas(val_prompts[['conversational', 'label']])

train_dataset

## Model, Tokenizer & Data Collator

In [1]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

# Load the label2id mapping; its length determines how many output classes the
# classifier gets. JSON is UTF-8 by specification, so the encoding is pinned
# instead of being left to the platform default.
with open("finetune/label2id.json", "r", encoding="utf-8") as f:
    label2id = json.load(f)

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# One class per entry in label2id.
num_labels = len(label2id)

def model_init():
    """Build a fresh sequence-classification model from ``model_name``.

    Trainer's hyperparameter search needs a factory like this so it can
    re-initialize the model for each trial instead of reusing weights that
    were already fine-tuned by a previous one.

    Returns:
        The result of ``AutoModelForSequenceClassification.from_pretrained``
        configured with ``num_labels`` output classes.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    return fresh_model


# Padding is applied here, at collation time, which is why the tokenization
# step itself runs with padding disabled.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)


def tokenize_function(examples):
    """Tokenize the ``conversational`` text of the given examples.

    Args:
        examples: Mapping with a ``"conversational"`` entry holding the text
            to encode.

    Returns:
        The tokenizer output for that text, truncated but not padded.
    """
    texts = examples["conversational"]
    # No padding at this stage: the data collator handles it.
    return tokenizer(texts, truncation=True, padding=False)


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'label2id' is not defined

## Tokenize

In [None]:
# Tokenize both splits in batched mode, train first and then validation.
train_dataset = train_dataset.map(function=tokenize_function, batched=True)
val_dataset = val_dataset.map(function=tokenize_function, batched=True)

# Show the tokenized validation split as the cell output.
val_dataset

## Trainer

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for an evaluation.

    Args:
        pred: Evaluation output exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0 yields the same value scikit-learn already falls back to
    # when a class has no predicted (or no true) samples, but without raising
    # an UndefinedMetricWarning on every such evaluation.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }


training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings; weight_decay and warmup_steps are not set here.
    learning_rate=3.2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save once per epoch, then reload the checkpoint with the
    # best accuracy when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Step-based logging, every 50 steps.
    logging_strategy="steps",
    logging_steps=50,
)


# A model factory (model_init) is passed instead of a model instance so the
# model can be rebuilt for each hyperparameter-search trial.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)



## Search

In [None]:
def hp_space(trial):
    """Define the hyperparameter search space for one trial.

    Args:
        trial: Trial object providing ``suggest_float``.

    Returns:
        dict mapping ``"learning_rate"`` to a value sampled on a log scale
        between 1e-5 and 1.5e-5.
    """
    sampled_lr = trial.suggest_float("learning_rate", 1e-5, 1.5e-5, log=True)
    return {"learning_rate": sampled_lr}


# Run 10 trials over hp_space, maximizing the evaluation accuracy.
best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    n_trials=10,
    direction="maximize",
    compute_objective=lambda eval_metrics: eval_metrics["eval_accuracy"],
)

# Report the winning configuration and its objective value.
print("Best hyperparameters:", best_run.hyperparameters)
print("Best eval accuracy:", best_run.objective)
