# Fine Tune
- RoBERTa
- No need for inference speed up using distil bert since dataset is very small

## Several conditions:
- (spell corrected and) expanded prompts
- raw conversational part


In [1]:
import torch
print(torch.backends.mps.is_available())
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")



True


In [2]:
import sqlite3
import pandas as pd

conn  = sqlite3.connect('../../giicg.db')
prompts = pd.read_sql("Select * from expanded_prompts", conn)
conn.close()
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
755,724,31,user,import pandas as pd\nimport numpy as np\nfrom ...,Please replace my retrieval pipeline here with...,import pandas as pd\nimport numpy as np\nfrom ...,You are tasked with separating user prompts in...,Man (cisgender),92,en
756,726,31,user,"please update my code accordingly, no comments...","please update my code accordingly, no comments...",,,Man (cisgender),92,en
757,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,"I want to tune optimal thresholds. Currently, ...",import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
758,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",I want to use an LLM for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en


## Filter and clean

In [4]:
from helpers.normalization import remove_newlines

prompts = prompts[prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
prompts['conversational']  = prompts['conversational'].apply(remove_newlines)
prompts

Unnamed: 0,level_0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...,...,...
748,748,755,724,31,user,import pandas as pd\nimport numpy as np\nfrom ...,Please replace my retrieval pipeline here with...,import pandas as pd\nimport numpy as np\nfrom ...,You are tasked with separating user prompts in...,Man (cisgender),92,en
749,749,756,726,31,user,"please update my code accordingly, no comments...","please update my code accordingly, no comments...",,,Man (cisgender),92,en
750,750,757,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,"I want to tune optimal thresholds. Currently, ...",import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
751,751,758,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",I want to use an LLM for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en


## Create label mapping

In [5]:
import json

labels = prompts['gender'].astype('category')
prompts['label'] = labels.cat.codes
label2id = dict(enumerate(labels.cat.categories))
label2id


with open("finetune/label2id.json", "w") as f:
    json.dump(label2id, f)



## Build dataset
- group aware split: no prompts from the same user will occur in both sets
- build dataset in huggingface format

In [6]:
from sklearn.model_selection import GroupShuffleSplit
from datasets import Dataset

gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
groups = prompts['user_id']

train_idx, val_idx = next(gss.split(prompts, groups=groups))
train_prompts = prompts.iloc[train_idx]
val_prompts = prompts.iloc[val_idx]


train_dataset = Dataset.from_pandas(train_prompts[['conversational', 'label']])
val_dataset = Dataset.from_pandas(val_prompts[['conversational', 'label']])

train_dataset

Dataset({
    features: ['conversational', 'label', '__index_level_0__'],
    num_rows: 333
})

## Model, Tokenizer & Data Collator

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(examples):
    return tokenizer(
        examples["conversational"],
        truncation=True,
        padding=False # padding is handled in the data collator
    )


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize

In [9]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
val_dataset

Map: 100%|██████████| 333/333 [00:00<00:00, 11578.50 examples/s]
Map: 100%|██████████| 420/420 [00:00<00:00, 31021.32 examples/s]


Dataset({
    features: ['conversational', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 420
})

## Trainer


In [10]:
import torch
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}


training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    #weight_decay=0.0,
    #warmup_steps = 10,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=50,         
    logging_strategy="steps",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_metrics,
)



  trainer = Trainer(


## Train

In [11]:
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.647527,0.652381
2,0.688300,0.832852,0.67381
3,0.444800,1.024136,0.647619
4,0.220200,1.189306,0.683333
5,0.091100,1.34958,0.671429




TrainOutput(global_step=210, training_loss=0.34968053102493285, metrics={'train_runtime': 227.9171, 'train_samples_per_second': 7.305, 'train_steps_per_second': 0.921, 'total_flos': 48587670260220.0, 'train_loss': 0.34968053102493285, 'epoch': 5.0})

In [None]:
print(trainer.evaluate())