# Experiment example usage

In [1]:
# Make the repository root the working directory so that the project packages imported
# below (`core`, `basic`, `intermediate`, `advanced`) and relative paths such as
# "advanced/dict.txt" resolve.
# No machine-specific absolute path is used: the cell only steps up one level when the
# kernel was started inside the `intermediate/` folder, so re-running it is harmless.
# NOTE(review): assumes the kernel starts either in `intermediate/` (where this notebook
# appears to live) or already at the repository root - confirm for your setup.
import os

if os.path.basename(os.getcwd()) == "intermediate":
    os.chdir(os.pardir)
print(os.getcwd())

C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop


In [2]:
from core.base_model import BaseModel
from basic.huggingface_model import HuggingFaceModel
from basic.llm_model import LLMModel
from intermediate.llm_size_experiment import LLMSizeExperiment
# from tmp.custom_model import EmotionBERT, EmotionClassifier 
from advanced.custom_model import EmotionBERT, EmotionClassifier

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.utils.data import DataLoader
from typing import Union
from datasets import Dataset, load_dataset
import pandas as pd
import random
# Lift pandas' display limits: no cap on the number of columns shown, and no fixed
# output width (None lets pandas work the width out itself).
pd.options.display.max_columns = None
pd.options.display.width = None

## Load data

In [3]:
# Load the "simplified" configuration of the go_emotions dataset. Its "train",
# "validation" and "test" splits are converted to pandas frames further down.
go_emotions_simplified_data = load_dataset("go_emotions", "simplified")

In [4]:
# Label vocabulary: one emotion name per line. The 0-based line position is the integer
# label id that `prepare_df` looks up via `emotions[idx]`, so line order matters and
# blank lines are deliberately NOT dropped (that would shift the ids).
# The encoding is stated explicitly so the result does not depend on the platform's
# default locale encoding.
with open("advanced/dict.txt", "r", encoding="utf-8") as f:
    emotions = [line.strip() for line in f]

def prepare_df(df, emotions):
    """Reduce a dataset split to its text plus one emotion name per row.

    Args:
        df: DataFrame with a ``text`` column and a ``labels`` column whose entries are
            non-empty, indexable collections of integer label ids.
        emotions: Sequence that maps a label id (by position) to its emotion name.

    Returns:
        A new DataFrame with exactly the columns ``text`` and ``sentiment`` and the
        same index as ``df``. Only the first label id of each row is used; any
        additional labels on a row are ignored.

    Raises:
        IndexError: If a row has no labels or a label id is outside ``emotions``.

    The input frame is not modified: no helper columns are written back to ``df``.
    """
    # Resolve the first label id of every row straight to its emotion name.
    sentiment = df["labels"].apply(lambda label_ids: emotions[label_ids[0]])
    # `df[["text"]]` is already a fresh frame, and `assign` returns another new one.
    return df[["text"]].assign(sentiment=sentiment)

In [5]:
# Convert each dataset split to a (text, sentiment) frame, in train/val/test order.
train_df, val_df, test_df = (
    prepare_df(go_emotions_simplified_data[split_name].to_pandas(), emotions)
    for split_name in ("train", "validation", "test")
)

# Report the size of every split, then preview the first training rows.
for _caption, _frame in (
    ('len(train_df)', train_df),
    ('len(val_df)', val_df),
    ('len(test_df)', test_df),
):
    print(_caption, len(_frame))
print(train_df.head())

len(train_df) 43410
len(val_df) 5426
len(test_df) 5427
                                                text  sentiment
0  My favourite food is anything I didn't have to...    neutral
1  Now if he does off himself, everyone will thin...    neutral
2                     WHY THE FUCK IS BAYLESS ISOING      anger
3                        To make her feel threatened       fear
4                             Dirty Southern Wankers  annoyance


In [6]:
# Plain Python lists of texts (x_*) and emotion names (y_*) for every split.
# `x_train` has to be defined here: it is passed to `experiment.run(...)` further down,
# so leaving this line commented out raises NameError on a fresh kernel.
x_train = train_df['text'].tolist()
y_train = train_df['sentiment'].tolist()
x_val = val_df['text'].tolist()
y_val = val_df['sentiment'].tolist()
x_test = test_df['text'].tolist()
y_test = test_df['sentiment'].tolist()

# Scalar settings forwarded unchanged to `experiment.run(...)`.
# Number of leading test samples shown to each LLM as (text -> classifier label) examples.
max_samples_for_llm_train = 200
# Name recorded with the results.
dataset_name = 'go_emotions'
# The task the LLM is asked to guess, and keywords related to it.
# NOTE(review): presumably the keywords are used to judge the guessed concept - confirm
# in LLMSizeExperiment.
concept = "sentiment analysis"
concept_keywords = ["sentiment", "emotion"]
# Number of leading test samples used when the LLM guesses the concept.
max_samples_for_concept = 50

# LLMs compared by the experiment, keyed by the short name that shows up in the run log
# and in the `llm` column of the result tables. Entries are built in the order listed.
#
# Smaller alternative set kept for quick runs:
#         "opt-125m": LLMModel(model_name="facebook/opt-125m"),
#         "opt-350m": LLMModel(model_name="facebook/opt-350m"),
#         "gpt-neo-1.3B": LLMModel(model_name="EleutherAI/gpt-neo-1.3B"),
#
# NOTE(review): the saved output of this cell lists opt-125m, opt-350m and gpt-neo-1.3B
# being loaded on cpu, i.e. it was produced with the smaller set, and it suggests each
# model is loaded as soon as its LLMModel is constructed. Confirm that all checkpoints
# below (up to opt-66b) fit in memory at the same time before running this as-is.
llm_models = {
    short_name: LLMModel(model_name=checkpoint)
    for short_name, checkpoint in (
        ("opt-125m", "facebook/opt-125m"),
        ("opt-350m", "facebook/opt-350m"),
        ("opt-1.3b", "facebook/opt-1.3b"),
        ("opt-2.7b", "facebook/opt-2.7b"),
        ("opt-6.7b", "facebook/opt-6.7b"),
        ("opt-13b", "facebook/opt-13b"),
        ("opt-30b", "facebook/opt-30b"),
        ("opt-66b", "facebook/opt-66b"),
    )
}

# Prompt templates forwarded to `experiment.run(...)`. Each prompt is described by a
# header, a per-example content line and a tail.
# NOTE(review): the {x_test}/{y_test}/{x_train}/{y_train} placeholders are presumably
# filled in per example by LLMSizeExperiment - confirm there.

# Concept guessing: the LLM is shown examples and asked to name the task.
prompt_header_llm_concept, prompt_content_llm_concept, prompt_tail_llm_concept = (
    "In 2 words I will guess, what the task is.\n",
    "{x_test} -> {y_test}\n",
    "In 2 words the task is:",
)

# "Training" prompt: example lines of the form text -> label, with an empty tail.
prompt_header_llm_train, prompt_content_llm_train, prompt_tail_llm_train = (
    "I am a classificator. I will find sentiment and answer in exactly 1 word.\n",
    "{x_train} -> {y_train}\n",
    "",
)

# Simulation prompt: the text to classify, left open for the LLM to complete.
prompt_llm_simulation = "{x_test} -> "

Loading model: facebook/opt-125m
Model loaded successfully on cpu
Loading model: facebook/opt-350m
Model loaded successfully on cpu
Loading model: EleutherAI/gpt-neo-1.3B
Model loaded successfully on cpu


In [7]:
# ______________________________________________________
# Temporary for testing
# x_train = x_train[:64]
# y_train = y_train[:64]
# x_val = x_val[:16]
# y_val = y_val[:16]
# x_test = x_test[:20]
# y_test = y_test[:20]
# max_samples_for_llm_train=10
# max_samples_for_concept=20
# ______________________________________________________

## Run experiment
1. The classifier is trained (if `train_classifier` is set to true) on the train and validation data. The classifier then predicts on the test data.
2. For each LLM, the following steps are run:
    1. The LLM predicts the concept from the first `max_samples_for_concept` samples of `x_test` and the classifier's predictions for them.
    2. The LLM is trained on the first `max_samples_for_llm_train` samples of `x_test` and the classifier's predictions for them.
    3. The LLM simulates the classifier by predicting the sentiment.

In [8]:
# Experiment driver; its `run(...)` method is called below once per classifier.
experiment = LLMSizeExperiment()

In [9]:
# Accumulators for the results of all runs. Each one starts out as its own, separate
# empty frame and is extended after every `experiment.run(...)` call.
model_statistics, prediction_statistics, data_statistics = (
    pd.DataFrame() for _ in range(3)
)

### EmotionClassifier

In [10]:
# Project-defined classifier, created on the CPU. It is handed to `experiment.run(...)`
# below with train_classifier=True, so it is trained as part of that run.
custom_classifier = EmotionClassifier(device='cpu')

In [11]:
# Experiment with the custom classifier: it is trained during the run, predicts on the
# test texts, and every LLM then guesses the task and imitates those predictions.
(
    model_statistics_tmp,
    prediction_statistics_tmp,
    data_statistics_tmp,
) = experiment.run(
    # Data splits
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    x_test=x_test,
    y_test=y_test,
    dataset_name=dataset_name,

    # Classifier under study
    classifier_name="custom_model",
    classifier=custom_classifier,
    train_classifier=True,
    classifier_train_arguments={"epochs": 2, "batch_size": 2},

    # LLMs and how many samples each stage may use
    llm_models=llm_models,
    max_samples_for_llm_train=max_samples_for_llm_train,
    max_samples_for_concept=max_samples_for_concept,

    # Concept the LLMs are expected to guess
    concept=concept,
    concept_keywords=concept_keywords,

    # Prompt templates: concept guessing
    prompt_header_llm_concept=prompt_header_llm_concept,
    prompt_content_llm_concept=prompt_content_llm_concept,
    prompt_tail_llm_concept=prompt_tail_llm_concept,

    # Prompt templates: LLM "training" examples
    prompt_header_llm_train=prompt_header_llm_train,
    prompt_content_llm_train=prompt_content_llm_train,
    prompt_tail_llm_train=prompt_tail_llm_train,

    # Prompt template: simulation
    prompt_llm_simulation=prompt_llm_simulation,
)

Training classifier...
Labels setup: 20 unique labels found: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disapproval', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'neutral', 'optimism', 'sadness', 'surprise']
Epoch 1/2
  Train Loss: 2.8215
  Val Accuracy: 0.2500
  Val F1 Macro: 0.0364
  Val ROC AUC (Macro OVR): 0.0000
  New best model saved with F1 Macro: 0.0364
Epoch 2/2
  Train Loss: 2.2004
  Val Accuracy: 0.2500
  Val F1 Macro: 0.0364
  Val ROC AUC (Macro OVR): 0.0000
  New best model saved with F1 Macro: 0.0364
Training finished. Loading best model state.
Classifier is predicting sentiments...
Classifying task done.

Running experiment for LLM: opt-125m -----------------
LLM guessing context...guessed:  what task.
Training LLM based on classifier's inputs and outputs...
Training completed with 10 examples
LLM simulating classifier...
true label: sadness  | classifier label: neutral  | LLM label

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


true label: gratitude  | classifier label: neutral  | LLM label: i�
Simulation done.

Running experiment for LLM: gpt-neo-1.3B -----------------
LLM guessing context...guessed:  1. I need
Training LLM based on classifier's inputs and outputs...
Training completed with 10 examples
LLM simulating classifier...
true label: sadness  | classifier label: neutral  | LLM label: neutral
true label: admiration  | classifier label: neutral  | LLM label: neutral
true label: excitement  | classifier label: neutral  | LLM label: neutral
true label: gratitude  | classifier label: neutral  | LLM label: neutral
true label: neutral  | classifier label: neutral  | LLM label: neutral
true label: gratitude  | classifier label: neutral  | LLM label: neutral
true label: gratitude  | classifier label: neutral  | LLM label: neutral
true label: gratitude  | classifier label: neutral  | LLM label: neutral
true label: remorse  | classifier label: neutral  | LLM label: neutral
true label: sadness  | classifier lab

In [12]:
# Append the tables returned by the latest run to the accumulated ones. The pairs are
# captured before any name is rebound, and the row index is renumbered.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([collected, latest], ignore_index=True)
    for collected, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

### roberta-base-go_emotions

In [13]:
# Classifier wrapping the "SamLowe/roberta-base-go_emotions" checkpoint, with GPU use
# switched off. It is passed to `experiment.run(...)` below with train_classifier=False,
# i.e. it is used as-is without further training.
HGF = HuggingFaceModel(model_name="SamLowe/roberta-base-go_emotions", use_gpu=False)

In [14]:
# Experiment with the roberta-base-go_emotions classifier: no training, the classifier
# only predicts on the test texts before the LLMs guess the task and imitate it.
(
    model_statistics_tmp,
    prediction_statistics_tmp,
    data_statistics_tmp,
) = experiment.run(
    # Data splits
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    x_test=x_test,
    y_test=y_test,
    dataset_name=dataset_name,

    # Classifier under study
    classifier_name="roberta-base-go_emotions",
    classifier=HGF,
    train_classifier=False,
    classifier_train_arguments={},

    # LLMs and how many samples each stage may use
    llm_models=llm_models,
    max_samples_for_llm_train=max_samples_for_llm_train,
    max_samples_for_concept=max_samples_for_concept,

    # Concept the LLMs are expected to guess
    concept=concept,
    concept_keywords=concept_keywords,

    # Prompt templates: concept guessing
    prompt_header_llm_concept=prompt_header_llm_concept,
    prompt_content_llm_concept=prompt_content_llm_concept,
    prompt_tail_llm_concept=prompt_tail_llm_concept,

    # Prompt templates: LLM "training" examples
    prompt_header_llm_train=prompt_header_llm_train,
    prompt_content_llm_train=prompt_content_llm_train,
    prompt_tail_llm_train=prompt_tail_llm_train,

    # Prompt template: simulation
    prompt_llm_simulation=prompt_llm_simulation,
)

Classifier is predicting sentiments...


Device set to use cpu


Classifying task done.

Running experiment for LLM: opt-125m -----------------
LLM guessing context...guessed:  to get rid of someone
Training LLM based on classifier's inputs and outputs...
Training completed with 10 examples
LLM simulating classifier...
true label: sadness  | classifier label: remorse  | LLM label: > remorse
true label: admiration  | classifier label: admiration  | LLM label: ~~i
true label: excitement  | classifier label: optimism  | LLM label: - thank
true label: gratitude  | classifier label: gratitude  | LLM label: > thank
true label: neutral  | classifier label: neutral  | LLM label: >neutral
true label: gratitude  | classifier label: gratitude  | LLM label: you‘
true label: gratitude  | classifier label: gratitude  | LLM label: “
true label: gratitude  | classifier label: admiration  | LLM label: “
true label: remorse  | classifier label: remorse  | LLM label: ~~bl
true label: sadness  | classifier label: sadness  | LLM label: ~s
true label: annoyance  | classi

In [15]:
# Append the tables returned by the latest run to the accumulated ones. The pairs are
# captured before any name is rebound, and the row index is renumbered.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([collected, latest], ignore_index=True)
    for collected, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

## View results

In [16]:
# Preview the first rows of the accumulated per-classifier / per-LLM summary table.
model_statistics.head()

Unnamed: 0,run_id,dataset_name,classifier,llm,classifier_accuracy,classifier_precision,classifier_recall,classifier_f1,classifier_balanced_accuracy,classifier_cohen_kappa,classifier_mcc,prompt_header_llm_concept,prompt_content_llm_concept,prompt_tail_llm_concept,prompt_header_llm_train,prompt_content_llm_train,prompt_tail_llm_train,prompt_llm_simulation,llm_predicted_concept
0,1,go_emotions,custom_model,opt-125m,0.05,0.004545,0.090909,0.008658,0.090909,0.0,0.0,"In 2 words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In 2 words the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,what task.
1,1,go_emotions,custom_model,opt-350m,0.05,0.004545,0.090909,0.008658,0.090909,0.0,0.0,"In 2 words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In 2 words the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,I'm so sorry
2,1,go_emotions,custom_model,gpt-neo-1.3B,0.05,0.004545,0.090909,0.008658,0.090909,0.0,0.0,"In 2 words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In 2 words the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,1. I need
3,2,go_emotions,roberta-base-go_emotions,opt-125m,0.7,0.5,0.535897,0.486325,0.633333,0.662921,0.676221,"In 2 words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In 2 words the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,to get rid of someone
4,2,go_emotions,roberta-base-go_emotions,opt-350m,0.7,0.5,0.535897,0.486325,0.633333,0.662921,0.676221,"In 2 words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In 2 words the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,>I am


In [17]:
# Preview the first 20 rows of the accumulated per-sample prediction table.
prediction_statistics.head(20)

Unnamed: 0,run_id,dataset_name,classifier_name,llm_name,x_test,y_test,classifier_predicted_label,classifier_predicted_label_confidence,x_test_present_in_prompt,llm_simulation_label_correct,llm_simulation_predicted_label
0,1,go_emotions,custom_model,opt-125m,I’m really sorry about your situation :( Altho...,sadness,neutral,0.385499,True,True,neutral
1,1,go_emotions,custom_model,opt-125m,It's wonderful because it's awful. At not with.,admiration,neutral,0.336687,True,True,neutral
2,1,go_emotions,custom_model,opt-125m,"Kings fan here, good luck to you guys! Will be...",excitement,neutral,0.51582,True,True,neutral :)
3,1,go_emotions,custom_model,opt-125m,"I didn't know that, thank you for teaching me ...",gratitude,neutral,0.499218,True,True,neutral
4,1,go_emotions,custom_model,opt-125m,They got bored from haunting earth for thousan...,neutral,neutral,0.425343,True,True,~~neutral~~
5,1,go_emotions,custom_model,opt-125m,Thank you for asking questions and recognizing...,gratitude,neutral,0.346359,True,True,~~neutral~~
6,1,go_emotions,custom_model,opt-125m,You’re welcome,gratitude,neutral,0.194501,True,True,neutral
7,1,go_emotions,custom_model,opt-125m,100%! Congrats on your job too!,gratitude,neutral,0.539394,True,True,~~neutral~~
8,1,go_emotions,custom_model,opt-125m,I’m sorry to hear that friend :(. It’s for the...,remorse,neutral,0.395173,True,True,~~neutral~~
9,1,go_emotions,custom_model,opt-125m,"Girlfriend weak as well, that jump was pathetic.",sadness,neutral,0.377418,True,False,~~girlfriend


In [18]:
# Preview the first rows of the accumulated dataset-statistics table.
data_statistics.head()

Unnamed: 0,run_id,dataset_name,classifier_name,partition,num_samples,label_counts,label_proportions,avg_text_length,avg_word_count
0,1,go_emotions,custom_model,train,64,"{'neutral': 24, 'anger': 5, 'fear': 1, 'annoya...","{'neutral': 0.38, 'anger': 0.08, 'fear': 0.02,...",56.6,10.5
1,1,go_emotions,custom_model,val,16,"{'neutral': 4, 'anger': 1, 'fear': 1, 'annoyan...","{'neutral': 0.25, 'anger': 0.06, 'fear': 0.06,...",74.1,13.4
2,1,go_emotions,custom_model,test,20,"{'sadness': 3, 'admiration': 2, 'excitement': ...","{'sadness': 0.15, 'admiration': 0.1, 'exciteme...",70.0,12.6
3,2,go_emotions,roberta-base-go_emotions,train,64,"{'neutral': 24, 'anger': 5, 'fear': 1, 'annoya...","{'neutral': 0.38, 'anger': 0.08, 'fear': 0.02,...",56.6,10.5
4,2,go_emotions,roberta-base-go_emotions,val,16,"{'neutral': 4, 'anger': 1, 'fear': 1, 'annoyan...","{'neutral': 0.25, 'anger': 0.06, 'fear': 0.06,...",74.1,13.4


In [19]:
# Write each accumulated result table to a CSV file in the current working directory
# (default `to_csv` settings, so the row index is written as well).
for _table, _csv_name in (
    (model_statistics, 'model_statistics.csv'),
    (prediction_statistics, 'prediction_statistics.csv'),
    (data_statistics, 'data_statistics.csv'),
):
    _table.to_csv(_csv_name)