# Experiment

In [1]:
# NOTE(review): this is an absolute, machine-specific path, so the cell only works
# where this exact directory exists. The two steps leave the working directory one
# level above `intermediate` (presumably the repository root — confirm), which the
# relative paths used later in the notebook rely on.
%cd C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
%cd ..

C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop


In [2]:
from core.base_model import BaseModel
from basic.huggingface_model import HuggingFaceModel
from basic.llm_model import LLMModel
from intermediate.llm_size_experiment import LLMSizeExperiment
from tmp.custom_model import EmotionBERT, EmotionClassifier 
# from advanced.custom_model import EmotionBERT, EmotionClassifier

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.utils.data import DataLoader
import torch
from typing import Union
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
import random
# Display configuration: no cap on the number of columns shown, and no fixed
# character width for printed frames.
pd.options.display.max_columns = None
pd.options.display.width = None

## Load data

In [3]:
# Fetch the "simplified" configuration of the GoEmotions dataset.
# The returned object is indexed by split name ("train", "validation", "test") below.
go_emotions_simplified_data = load_dataset(path="go_emotions", name="simplified")

In [4]:
# Read the label names, one per line; a label's position in this list is the
# integer id that `prepare_df` uses to look it up.
with open("advanced/dict.txt", "r") as f:
    emotions = [entry.strip() for entry in f]

def prepare_df(df, emotions):
    """Reduce a GoEmotions frame to a single-label (text, sentiment) frame.

    Each row of ``df["labels"]`` is a sequence of integer label ids. Only the
    FIRST id of each row is kept; any additional labels on multi-label rows
    are discarded. The id is translated to its name via ``emotions``.

    Unlike the previous implementation, the input frame is not modified:
    no helper columns are added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"text"`` column and a ``"labels"`` column whose
        entries are non-empty, indexable sequences of integer ids.
    emotions : sequence of str
        Label names, indexed by label id.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns ``["text", "sentiment"]`` and
        the same index as ``df``.

    Raises
    ------
    IndexError
        If a row has no labels, or an id is out of range for ``emotions``.
    """
    # `assign` returns a new frame, so the caller's DataFrame stays untouched.
    sentiment = df["labels"].map(lambda label_ids: emotions[label_ids[0]])
    return df.assign(sentiment=sentiment)[["text", "sentiment"]]

In [5]:
# Build one (text, sentiment) frame per dataset split, in train/validation/test order.
train_df, val_df, test_df = (
    prepare_df(go_emotions_simplified_data[split_name].to_pandas(), emotions)
    for split_name in ("train", "validation", "test")
)

# Report the size of every split and preview the training frame.
print('len(train_df)', train_df.shape[0])
print('len(val_df)', val_df.shape[0])
print('len(test_df)', test_df.shape[0])
print(train_df.head())

len(train_df) 43410
len(val_df) 5426
len(test_df) 5427
                                                text  sentiment
0  My favourite food is anything I didn't have to...    neutral
1  Now if he does off himself, everyone will thin...    neutral
2                     WHY THE FUCK IS BAYLESS ISOING      anger
3                        To make her feel threatened       fear
4                             Dirty Southern Wankers  annoyance


In [6]:
# Split every frame into parallel Python lists of strings:
# x_* holds the texts, y_* holds the matching sentiment names.
x_train, y_train = train_df['text'].tolist(), train_df['sentiment'].tolist()
x_val, y_val = val_df['text'].tolist(), val_df['sentiment'].tolist()
x_test, y_test = test_df['text'].tolist(), test_df['sentiment'].tolist()

## Custom model setup

In [7]:
# # Prepare subset for training
# x_train_subset = x_train[:1000]
# y_train_subset = y_train[:1000]
# x_val_subset = x_val[:500]
# y_val_subset = y_val[:500]
# device = 'cpu'
# emotions_subset = np.unique(y_train)

# print('for y_train_subset: ', np.unique(y_train_subset), ' emotion count: ', len(np.unique(y_train_subset)))
# print('for y_val_subset: ', np.unique(y_val_subset), ' emotion count: ', len(np.unique(y_val_subset)))

In [8]:
# # Train the model
# custom_classifier = EmotionClassifier(device=device)
# custom_classifier.train(x_train_subset, y_train_subset, x_val_subset, y_val_subset)

## Make subset for experiment

In [9]:
# Used only for training classifiers
# x_train = x_train[:1000]
# y_train = y_train[:1000]
# x_val = x_val[:500]
# y_val = y_val[:500]

# Used for testing classifiers, training LLMs and testing LLMs:
# keep only the first 20 test examples.
x_test, y_test = x_test[:20], y_test[:20]

# Show which emotions occur in each partition and how many distinct ones there are.
print('for y_train: ', np.unique(y_train), ' emotion count: ', np.unique(y_train).size)
print('for y_val: ', np.unique(y_val), ' emotion count: ', np.unique(y_val).size)
print('for y_test: ', np.unique(y_test), ' emotion count: ', np.unique(y_test).size)

for y_train:  ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 'surprise']  emotion count:  28
for y_val:  ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 'surprise']  emotion count:  28
for y_test:  ['admiration' 'amusement' 'annoyance' 'desire' 'disapproval' 'excitement'
 'fear' 'gratitude' 'neutral' 'remorse' 'sadness']  emotion count:  11


## Experiment parameters

In [10]:
# Scalar experiment settings; all of them are forwarded unchanged to `experiment.run`.
max_samples_for_llm_train = 10
dataset_name = 'go_emotions'
concept = "sentiment analysis"
max_samples_for_concept = 10
# Left empty for this run. Example of a non-empty mapping:
# {"admiration": "joy", "amusement": "sadness"}
switched_classifier_prediction_labels = dict()
force_balanced_llm_train = True

# LLMs to evaluate, keyed by the display name used in logs and result tables.
# Only gpt-neo-1.3B is enabled for this run; each LLMModel is constructed right here.
llm_models = {
    "gpt-neo-1.3B": LLMModel(model_name="EleutherAI/gpt-neo-1.3B"),
    # Disabled alternatives (uncomment to include them in the run):
    # "opt-125m": LLMModel(model_name="facebook/opt-125m"),
    # "opt-350m": LLMModel(model_name="facebook/opt-350m"),
    # "opt-1.3b": LLMModel(model_name="facebook/opt-1.3b"),
    # "opt-2.7b": LLMModel(model_name="facebook/opt-2.7b"),
    # "opt-6.7b": LLMModel(model_name="facebook/opt-6.7b"),
    # "opt-13b": LLMModel(model_name="facebook/opt-13b"),
    # "opt-30b": LLMModel(model_name="facebook/opt-30b"),
    # "opt-66b": LLMModel(model_name="facebook/opt-66b"),
}

# Prompt pieces for the "guess the concept" step.
prompt_header_llm_concept = "In 2 words I will guess, what the task is.\n"
prompt_content_llm_concept = "{x_test} -> {y_test}\n"
prompt_tail_llm_concept = "In 2 words the task is:"

# Prompt pieces for the LLM "training" step.
prompt_header_llm_train = "I am a classificator. I will find sentiment and answer in exactly 1 word.\n"
prompt_content_llm_train = "{x_train} -> {y_train}\n"
prompt_tail_llm_train = ""

# Prompt for the simulation step; the trailing space after the arrow is intentional
# and must be kept.
prompt_llm_simulation = "{x_test} -> "

Loading model: EleutherAI/gpt-neo-1.3B
Model loaded successfully on cpu


## Run experiment
1. The classifier is trained on the train and val data (only if training is enabled), then it predicts labels for the test data.
2. For each LLM, the following steps are run:
    1. The LLM guesses the concept from the first `max_samples_for_concept` examples of x_test paired with the classifier's predictions.
    2. The LLM is trained on the first `max_samples_for_llm_train` examples of x_test paired with the classifier's predictions.
    3. The LLM simulates the classifier by predicting the sentiment.

In [11]:
# Experiment runner; its `run` method is called once per classifier further down.
experiment = LLMSizeExperiment()

In [12]:
# Three independent, initially empty accumulators; the results of every
# experiment run are appended to them.
model_statistics, prediction_statistics, data_statistics = (
    pd.DataFrame() for _ in range(3)
)

### EmotionClassifier

In [13]:
# # Train model in the experiment
# custom_classifier = EmotionClassifier(device='cpu')

# train_classifier = True
# classifier_train_arguments = {"epochs": 2, "batch_size": 2}

In [14]:
# Load a previously trained model instead of training inside the experiment.
device = 'cpu'
custom_classifier = EmotionClassifier(device=device)
custom_classifier.model = EmotionBERT(n_classes=len(emotions)).to(device)
# Labels are registered in sorted order — presumably the order used when the
# checkpoint was trained; verify against the training code.
custom_classifier._setup_labels(sorted(emotions))
# `map_location` remaps the stored tensors onto `device`. Without it, a checkpoint
# saved from a GPU fails to load on a CPU-only machine.
# NOTE(review): `torch.load` unpickles the file; only load checkpoints you trust
# (consider `weights_only=True` if the installed PyTorch supports it).
custom_classifier.model.load_state_dict(
    torch.load('tmp/best_model_state.pth', map_location=device)
)
# NOTE(review): the model is not switched to eval mode here; confirm that the
# classifier's prediction code does so before inference.

# The classifier is already trained, so the experiment must not retrain it.
train_classifier = False

Labels setup: 28 unique labels found: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']


In [15]:
# Run the experiment with the custom EmotionBERT-based classifier.
# NOTE(review): the training flag is hard-coded below; the `train_classifier`
# variable set in the previous cell is not consulted.
model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    # Data partitions
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    x_val=x_val,
    y_val=y_val,
    # General experiment settings
    max_samples_for_llm_train=max_samples_for_llm_train,
    dataset_name=dataset_name,
    concept=concept,
    max_samples_for_concept=max_samples_for_concept,
    force_balanced_llm_train=force_balanced_llm_train,
    switched_classifier_prediction_labels=switched_classifier_prediction_labels,
    # Classifier parameters
    classifier_name="custom_classifier",
    classifier=custom_classifier,
    train_classifier=False,
    classifier_train_arguments={},
    # LLMs under test
    llm_models=llm_models,
    # Prompts: concept guessing
    prompt_header_llm_concept=prompt_header_llm_concept,
    prompt_content_llm_concept=prompt_content_llm_concept,
    prompt_tail_llm_concept=prompt_tail_llm_concept,
    # Prompts: LLM training
    prompt_header_llm_train=prompt_header_llm_train,
    prompt_content_llm_train=prompt_content_llm_train,
    prompt_tail_llm_train=prompt_tail_llm_train,
    # Prompt: classifier simulation
    prompt_llm_simulation=prompt_llm_simulation,
)

Classifier is predicting sentiments...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Classifying task done.

Running experiment for LLM: gpt-neo-1.3B -----------------
LLM guessing context...guessed:  How can we get you
Training LLM based on classifier's inputs and outputs...
Training completed with 10 examples
LLM simulating classifier...
sentence:  0 / 20  | true label: sadness  | classifier label: gratitude  | LLM prediction: irritation
but  | LLM label: No label
sentence:  1 / 20  | true label: admiration  | classifier label: love  | LLM prediction: love  | LLM label: love
sentence:  2 / 20  | true label: excitement  | classifier label: admiration  | LLM prediction: [quote  | LLM label: No label
sentence:  3 / 20  | true label: gratitude  | classifier label: gratitude  | LLM prediction: gratitude (this  | LLM label: gratitude
sentence:  4 / 20  | true label: neutral  | classifier label: neutral  | LLM prediction: disg  | LLM label: No label
sentence:  5 / 20  | true label: gratitude  | classifier label: gratitude  | LLM prediction: you  | LLM label: No label
senten

In [16]:
# Append the latest run's tables to the accumulated ones, renumbering rows from 0.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

### roberta-base-go_emotions

In [17]:
# Second classifier under test: a pretrained GoEmotions model referenced by its
# Hugging Face model name, with GPU use disabled.
HGF = HuggingFaceModel(model_name="SamLowe/roberta-base-go_emotions", use_gpu=False)

In [18]:
# Run the same experiment with the pretrained roberta-base-go_emotions classifier.
model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    # Data partitions
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    x_val=x_val,
    y_val=y_val,
    # General experiment settings
    max_samples_for_llm_train=max_samples_for_llm_train,
    dataset_name=dataset_name,
    concept=concept,
    max_samples_for_concept=max_samples_for_concept,
    force_balanced_llm_train=force_balanced_llm_train,
    switched_classifier_prediction_labels=switched_classifier_prediction_labels,
    # Classifier parameters (the model is pretrained, so no training is requested)
    classifier_name="roberta-base-go_emotions",
    classifier=HGF,
    train_classifier=False,
    classifier_train_arguments={},
    # LLMs under test
    llm_models=llm_models,
    # Prompts: concept guessing
    prompt_header_llm_concept=prompt_header_llm_concept,
    prompt_content_llm_concept=prompt_content_llm_concept,
    prompt_tail_llm_concept=prompt_tail_llm_concept,
    # Prompts: LLM training
    prompt_header_llm_train=prompt_header_llm_train,
    prompt_content_llm_train=prompt_content_llm_train,
    prompt_tail_llm_train=prompt_tail_llm_train,
    # Prompt: classifier simulation
    prompt_llm_simulation=prompt_llm_simulation,
)

Classifier is predicting sentiments...


Device set to use cpu


Classifying task done.

Running experiment for LLM: gpt-neo-1.3B -----------------
LLM guessing context...guessed:  1. to find
Training LLM based on classifier's inputs and outputs...
Training completed with 10 examples
LLM simulating classifier...
sentence:  0 / 20  | true label: sadness  | classifier label: remorse  | LLM prediction: regret  | LLM label: No label
sentence:  1 / 20  | true label: admiration  | classifier label: admiration  | LLM prediction: admiration  | LLM label: admiration
sentence:  2 / 20  | true label: excitement  | classifier label: optimism  | LLM prediction: cheer
you  | LLM label: No label
sentence:  3 / 20  | true label: gratitude  | classifier label: gratitude  | LLM prediction: gratitude:  | LLM label: gratitude
sentence:  4 / 20  | true label: neutral  | classifier label: neutral  | LLM prediction: neutral (i  | LLM label: neutral
sentence:  5 / 20  | true label: gratitude  | classifier label: gratitude  | LLM prediction: interest
how  | LLM label: No la

In [19]:
# Append the roberta run's tables to the accumulated ones, renumbering rows from 0.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

## View results

In [20]:
# One row per run: classifier metrics, the prompt templates used and the concept the LLM guessed.
model_statistics.head()

Unnamed: 0,run_id,dataset_name,classifier,llm,classifier_accuracy,classifier_precision,classifier_recall,classifier_f1,classifier_balanced_accuracy,classifier_cohen_kappa,classifier_mcc,prompt_header_llm_concept,prompt_content_llm_concept,prompt_tail_llm_concept,prompt_header_llm_train,prompt_content_llm_train,prompt_tail_llm_train,prompt_llm_simulation,llm_predicted_concept
0,1,go_emotions,custom_classifier,gpt-neo-1.3B,0.45,0.173413,0.291667,0.205556,0.318182,0.367816,0.390971,"In 2 words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In 2 words the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,How can we get you
1,2,go_emotions,roberta-base-go_emotions,gpt-neo-1.3B,0.7,0.5,0.535897,0.486325,0.633333,0.662921,0.676221,"In 2 words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In 2 words the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,1. to find


In [21]:
# Per-sentence results: true label, classifier label, raw and parsed LLM prediction, correctness flags.
prediction_statistics.head(20)

Unnamed: 0,run_id,dataset_name,classifier_name,llm_name,x_test,y_test,classifier_predicted_label,classifier_predicted_label_after_switch,llm_simulation_predicted_label,llm_simulation_raw_prediction,classifier_predicted_label_confidence,x_test_present_in_train_prompt,classifier_predicted_label_correct,llm_simulation_label_correct,llm_direct_label_correct,llm_simulation_label_correct_after_switch
0,1,go_emotions,custom_classifier,gpt-neo-1.3B,I’m really sorry about your situation :( Altho...,sadness,gratitude,gratitude,No label,irritation\nbut,0.175715,False,False,False,False,False
1,1,go_emotions,custom_classifier,gpt-neo-1.3B,It's wonderful because it's awful. At not with.,admiration,love,love,love,love,0.118051,True,False,True,False,True
2,1,go_emotions,custom_classifier,gpt-neo-1.3B,"Kings fan here, good luck to you guys! Will be...",excitement,admiration,admiration,No label,[quote,0.819836,True,False,False,False,False
3,1,go_emotions,custom_classifier,gpt-neo-1.3B,"I didn't know that, thank you for teaching me ...",gratitude,gratitude,gratitude,gratitude,gratitude (this,0.689541,True,True,True,True,True
4,1,go_emotions,custom_classifier,gpt-neo-1.3B,They got bored from haunting earth for thousan...,neutral,neutral,neutral,No label,disg,0.656525,False,True,False,False,False
5,1,go_emotions,custom_classifier,gpt-neo-1.3B,Thank you for asking questions and recognizing...,gratitude,gratitude,gratitude,No label,you,0.648427,False,True,False,False,False
6,1,go_emotions,custom_classifier,gpt-neo-1.3B,You’re welcome,gratitude,gratitude,gratitude,neutral,neutral,0.441028,False,True,False,False,False
7,1,go_emotions,custom_classifier,gpt-neo-1.3B,100%! Congrats on your job too!,gratitude,gratitude,gratitude,No label,good,0.481056,True,True,False,False,False
8,1,go_emotions,custom_classifier,gpt-neo-1.3B,I’m sorry to hear that friend :(. It’s for the...,remorse,gratitude,gratitude,No label,appreciation\nif,0.117892,False,False,False,False,False
9,1,go_emotions,custom_classifier,gpt-neo-1.3B,"Girlfriend weak as well, that jump was pathetic.",sadness,amusement,amusement,neutral,neutral,0.163717,False,False,False,False,False


In [22]:
# Per-partition summary for each run: sample counts, label distribution, average text length.
data_statistics.head()

Unnamed: 0,run_id,dataset_name,concept,classifier_name,partition,is_llm_train_balanced,num_samples,label_counts,label_proportions,avg_text_length,avg_word_count
0,1,go_emotions,sentiment analysis,custom_classifier,train,True,43410,"{'neutral': 12823, 'anger': 1547, 'fear': 510,...","{'neutral': 0.3, 'anger': 0.04, 'fear': 0.01, ...",68.4,12.8
1,1,go_emotions,sentiment analysis,custom_classifier,val,True,5426,"{'neutral': 1592, 'approval': 355, 'sadness': ...","{'neutral': 0.29, 'approval': 0.07, 'sadness':...",68.2,12.8
2,1,go_emotions,sentiment analysis,custom_classifier,test,True,20,"{'sadness': 3, 'admiration': 2, 'excitement': ...","{'sadness': 0.15, 'admiration': 0.1, 'exciteme...",70.0,12.6
3,2,go_emotions,sentiment analysis,roberta-base-go_emotions,train,True,43410,"{'neutral': 12823, 'anger': 1547, 'fear': 510,...","{'neutral': 0.3, 'anger': 0.04, 'fear': 0.01, ...",68.4,12.8
4,2,go_emotions,sentiment analysis,roberta-base-go_emotions,val,True,5426,"{'neutral': 1592, 'approval': 355, 'sadness': ...","{'neutral': 0.29, 'approval': 0.07, 'sadness':...",68.2,12.8


In [24]:
# Persist all three result tables as CSV files in the `intermediate` directory
# (paths are relative to the current working directory).
model_statistics.to_csv(path_or_buf='intermediate/model_statistics.csv')
prediction_statistics.to_csv(path_or_buf='intermediate/prediction_statistics.csv')
data_statistics.to_csv(path_or_buf='intermediate/data_statistics.csv')