# Experiment

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch to the workshop project directory on the mounted Drive; the project-local
# imports in the next cell (core, basic, intermediate, tmp) presumably resolve relative to it.
%cd /content/drive/MyDrive/Colab Notebooks/llm-workshop

/content/drive/MyDrive/Colab Notebooks/llm-workshop


In [None]:
from core.base_model import BaseModel
from basic.huggingface_model import HuggingFaceModel
from basic.llm_model import LLMModel
from intermediate.llm_size_experiment import LLMSizeExperiment
from tmp.custom_model import EmotionBERT, EmotionClassifier
# from advanced.custom_model import EmotionBERT, EmotionClassifier

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.utils.data import DataLoader
import torch
from typing import Union
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
import random
# Display options: no limit on the number of columns shown, and no fixed line width
# (None leaves the width for pandas to determine).
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

## Load data

In [None]:
# Load the "simplified" configuration of the go_emotions dataset.
# The returned object is indexed by split name ("train", "validation", "test") further down.
go_emotions_simplified_data = load_dataset("go_emotions", "simplified")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Read the label vocabulary: one emotion name per line, in file order,
# with surrounding whitespace removed from each line.
with open("advanced/dict.txt", "r") as label_file:
    emotions = [raw_line.strip() for raw_line in label_file]

def prepare_df(df, emotions):
    """Reduce a dataset split to its text and one emotion name per row.

    Args:
        df: DataFrame with a "text" column and a "labels" column whose values
            are non-empty, indexable sequences of integer label ids.
        emotions: Sequence of emotion names, indexed by label id.

    Returns:
        A new DataFrame with exactly the columns "text" and "sentiment".
        "sentiment" is the emotion name of the FIRST label id of each row;
        any further label ids on the row are ignored.

    Raises:
        IndexError: If a row has no labels, or a label id is outside
            ``emotions``.
    """
    sentiment = df["labels"].map(lambda label_ids: emotions[label_ids[0]])
    # assign() returns a new frame, so the caller's DataFrame is not modified
    # (no helper columns are written back into ``df``).
    return df[["text"]].assign(sentiment=sentiment)

In [None]:
# Convert each dataset split to pandas and reduce it to (text, sentiment)
train_df, val_df, test_df = (
    prepare_df(go_emotions_simplified_data[split_name].to_pandas(), emotions)
    for split_name in ("train", "validation", "test")
)

# Report the size of every split, then preview the training frame
for frame_name, frame in (("train_df", train_df), ("val_df", val_df), ("test_df", test_df)):
    print(f'len({frame_name})', len(frame))
print(train_df.head())

len(train_df) 43410
len(val_df) 5426
len(test_df) 5427
                                                text  sentiment
0  My favourite food is anything I didn't have to...    neutral
1  Now if he does off himself, everyone will thin...    neutral
2                     WHY THE FUCK IS BAYLESS ISOING      anger
3                        To make her feel threatened       fear
4                             Dirty Southern Wankers  annoyance


In [None]:
# Split every frame into two parallel Python lists of strings:
# x_* holds the input texts, y_* holds the sentiment labels.
x_train, y_train = train_df['text'].tolist(), train_df['sentiment'].tolist()
x_val, y_val = val_df['text'].tolist(), val_df['sentiment'].tolist()
x_test, y_test = test_df['text'].tolist(), test_df['sentiment'].tolist()

## Custom model setup

In [None]:
# # Prepare subset for training
# x_train_subset = x_train[:1000]
# y_train_subset = y_train[:1000]
# x_val_subset = x_val[:500]
# y_val_subset = y_val[:500]
# device = 'cpu'
# emotions_subset = np.unique(y_train)

# print('for y_train_subset: ', np.unique(y_train_subset), ' emotion count: ', len(np.unique(y_train_subset)))
# print('for y_val_subset: ', np.unique(y_val_subset), ' emotion count: ', len(np.unique(y_val_subset)))

In [None]:
# # Train the model
# custom_classifier = EmotionClassifier(device=device)
# custom_classifier.train(x_train_subset, y_train_subset, x_val_subset, y_val_subset)

## Experiment parameters

In [None]:
# Used only for training classifiers
# x_train = x_train[:1000]
# y_train = y_train[:1000]
# x_val = x_val[:500]
# y_val = y_val[:500]

# Test-set budget, used for testing classifiers and for training and testing LLMs:
# only the first `llm_test_and_train_sample_size` test examples are kept.
llm_test_and_train_sample_size = 1000
x_test, y_test = (
    x_test[:llm_test_and_train_sample_size],
    y_test[:llm_test_and_train_sample_size],
)
max_samples_for_llm_train = 56
max_samples_for_concept = 56

# Show which emotions (and how many distinct ones) are present in each split
for split_label, split_targets in (("y_train", y_train), ("y_val", y_val), ("y_test", y_test)):
    distinct_emotions = np.unique(split_targets)
    print(f'for {split_label}: ', distinct_emotions, ' emotion count: ', len(distinct_emotions))

dataset_name = 'go_emotions'
concept = "sentiment analysis"
switched_classifier_prediction_labels = {}  # e.g. {"admiration": "joy", "amusement":"sadness"}
force_balanced_llm_train = True

llm_model_names = [
    "EleutherAI/gpt-neo-1.3B",
    "EleutherAI/gpt-j-6b",
]

# Prompt pieces for the concept-guessing step
prompt_header_llm_concept = "In few words I will guess, what the task is.\n"
prompt_content_llm_concept = "{x_test} -> {y_test}\n"
prompt_tail_llm_concept = "In the task is:"

# Prompt pieces for the LLM training step
prompt_header_llm_train = "I am a classificator. I will find sentiment and answer in exactly 1 word from the words listed below.\n"
prompt_content_llm_train = "{x_train} -> {y_train}\n"
prompt_tail_llm_train = ""

# Prompt for the simulation step
prompt_llm_simulation = "{x_test} -> "

for y_train:  ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 'surprise']  emotion count:  28
for y_val:  ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 'surprise']  emotion count:  28
for y_test:  ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 

## Run experiment
1. The classifier is trained on the train and validation data (only if `train_classifier` is set to true), then predicts labels for the test data.
2. For each LLM, the following steps are run:
    1. The LLM guesses the concept from the first `max_samples_for_concept` test inputs (`x_test`) and the classifier's predictions for them.
    2. The LLM is trained on the first `max_samples_for_llm_train` test inputs (`x_test`) and the classifier's predictions for them.
    3. The LLM simulates the classifier by predicting the sentiment.

In [None]:
# Experiment runner; its run() method is called once per classifier further down.
experiment = LLMSizeExperiment()

In [None]:
# Start with three independent, empty result tables; each experiment run
# appends its own rows to them later in the notebook.
model_statistics, prediction_statistics, data_statistics = (
    pd.DataFrame() for _ in range(3)
)

### EmotionClassifier

In [None]:
# # Train model in the experiment
# custom_classifier = EmotionClassifier(device='cpu')

# train_classifier = True
# classifier_train_arguments = {"epochs": 2, "batch_size": 2}

In [None]:
# # Load trained model
# device='cpu'
# custom_classifier = EmotionClassifier(device=device)
# custom_classifier.model = EmotionBERT(n_classes=len(emotions)).to(device)
# custom_classifier._setup_labels(sorted(emotions))
# custom_classifier.model.load_state_dict(torch.load('tmp/best_model_state.pth'))

# train_classifier = False

In [None]:
# model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
#     x_train=x_train,
#     y_train=y_train,
#     x_test=x_test,
#     y_test=y_test,
#     x_val=x_val,
#     y_val=y_val,
#     max_samples_for_llm_train=max_samples_for_llm_train,
#     dataset_name=dataset_name,
#     concept=concept,
#     max_samples_for_concept=max_samples_for_concept,
#     force_balanced_llm_train=force_balanced_llm_train,
#     switched_classifier_prediction_labels=switched_classifier_prediction_labels,

#     # Classifier parameters
#     classifier_name="custom_classifier",
#     classifier=custom_classifier,
#     train_classifier=False,
#     classifier_train_arguments = {},

#     llm_model_names=llm_model_names,

#     prompt_header_llm_concept    = prompt_header_llm_concept,
#     prompt_content_llm_concept   = prompt_content_llm_concept,
#     prompt_tail_llm_concept      = prompt_tail_llm_concept,

#     prompt_header_llm_train      = prompt_header_llm_train,
#     prompt_content_llm_train     = prompt_content_llm_train,
#     prompt_tail_llm_train        = prompt_tail_llm_train,

#     prompt_llm_simulation        = prompt_llm_simulation
# )

In [None]:
# model_statistics = pd.concat([model_statistics, model_statistics_tmp], ignore_index=True)
# prediction_statistics = pd.concat([prediction_statistics, prediction_statistics_tmp], ignore_index=True)
# data_statistics = pd.concat([data_statistics, data_statistics_tmp], ignore_index=True)

### roberta-base-go_emotions

In [None]:
# Classifier wrapper around the "SamLowe/roberta-base-go_emotions" model; it is passed to
# experiment.run() with train_classifier=False, i.e. used without further training.
# NOTE(review): use_gpu=False is requested here, yet the recorded run output reports
# "Device set to use cuda:0" — confirm that HuggingFaceModel honours this flag.
HGF = HuggingFaceModel(model_name="SamLowe/roberta-base-go_emotions", use_gpu=False)

In [None]:
# Data splits, sampling limits and bookkeeping labels for this run
run_data_arguments = dict(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    x_val=x_val,
    y_val=y_val,
    max_samples_for_llm_train=max_samples_for_llm_train,
    dataset_name=dataset_name,
    concept=concept,
    max_samples_for_concept=max_samples_for_concept,
    force_balanced_llm_train=force_balanced_llm_train,
    switched_classifier_prediction_labels=switched_classifier_prediction_labels,
)

# Classifier parameters: the classifier is not trained here, so no training arguments are given
classifier_arguments = dict(
    classifier_name="roberta-base-go_emotions",
    classifier=HGF,
    train_classifier=False,
    classifier_train_arguments={},
)

# Prompt templates for the concept, training and simulation steps
prompt_arguments = dict(
    prompt_header_llm_concept=prompt_header_llm_concept,
    prompt_content_llm_concept=prompt_content_llm_concept,
    prompt_tail_llm_concept=prompt_tail_llm_concept,
    prompt_header_llm_train=prompt_header_llm_train,
    prompt_content_llm_train=prompt_content_llm_train,
    prompt_tail_llm_train=prompt_tail_llm_train,
    prompt_llm_simulation=prompt_llm_simulation,
)

# run() returns three result tables: model-level, prediction-level and data-level statistics
model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    **run_data_arguments,
    **classifier_arguments,
    llm_model_names=llm_model_names,
    **prompt_arguments,
)

Classifier is predicting sentiments...


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Classifying task done.

Loading model: EleutherAI/gpt-neo-1.3B


tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model loaded successfully on cuda
Running experiment for LLM: EleutherAI/gpt-neo-1.3B -----------------
LLM guessing context...guessed:  What is it that is
Training LLM based on classifier's inputs and outputs...
Training completed with 56 examples
LLM simulating classifier...
sentence:  1 / 1000  | true label: sadness  | classifier label: remorse  | LLM prediction: regret
like  | LLM label: No label
sentence:  2 / 1000  | true label: admiration  | classifier label: admiration  | LLM prediction: panic
please  | LLM label: No label
sentence:  3 / 1000  | true label: excitement  | classifier label: optimism  | LLM prediction: inspiration
it  | LLM label: No label
sentence:  4 / 1000  | true label: gratitude  | classifier label: gratitude  | LLM prediction: gratitude.  | LLM label: gratitude
sentence:  5 / 1000  | true label: neutral  | classifier label: neutral  | LLM prediction: disg  | LLM label: No label
sentence:  6 / 1000  | true label: gratitude  | classifier label: gratitude  | LL

tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Some weights of the model checkpoint at EleutherAI/gpt-j-6b were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

Model loaded successfully on cuda
Running experiment for LLM: EleutherAI/gpt-j-6b -----------------
LLM guessing context...guessed:  (1) to pick
Training LLM based on classifier's inputs and outputs...
Training completed with 56 examples
LLM simulating classifier...
sentence:  1 / 1000  | true label: sadness  | classifier label: remorse  | LLM prediction: thank  | LLM label: No label
sentence:  2 / 1000  | true label: admiration  | classifier label: admiration  | LLM prediction: thank  | LLM label: No label
sentence:  3 / 1000  | true label: excitement  | classifier label: optimism  | LLM prediction: no response generated  | LLM label: No label
sentence:  4 / 1000  | true label: gratitude  | classifier label: gratitude  | LLM prediction: [  | LLM label: No label
sentence:  5 / 1000  | true label: neutral  | classifier label: neutral  | LLM prediction: "you  | LLM label: No label
sentence:  6 / 1000  | true label: gratitude  | classifier label: gratitude  | LLM prediction: no response g

In [None]:
# Append the rows of the latest run to each accumulated table, renumbering the index
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

## View results

In [None]:
# Preview the summary rows (one per classifier / LLM combination) collected so far.
model_statistics.head(5)

Unnamed: 0,run_id,dataset_name,classifier,llm,classifier_accuracy,classifier_precision,classifier_recall,classifier_f1,classifier_balanced_accuracy,classifier_cohen_kappa,classifier_mcc,prompt_header_llm_concept,prompt_content_llm_concept,prompt_tail_llm_concept,prompt_header_llm_train,prompt_content_llm_train,prompt_tail_llm_train,prompt_llm_simulation,llm_predicted_concept
0,1,go_emotions,roberta-base-go_emotions,EleutherAI/gpt-neo-1.3B,0.593,0.483662,0.48427,0.469951,0.48427,0.531984,0.533173,"In few words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,What is it that is
1,1,go_emotions,roberta-base-go_emotions,EleutherAI/gpt-j-6b,0.593,0.483662,0.48427,0.469951,0.48427,0.531984,0.533173,"In few words I will guess, what the task is.\n",{x_test} -> {y_test}\n,In the task is:,I am a classificator. I will find sentiment an...,{x_train} -> {y_train}\n,,{x_test} ->,(1) to pick


In [None]:
# Preview the per-sentence records: true label, classifier label and LLM prediction.
prediction_statistics.head(5)

Unnamed: 0,run_id,dataset_name,classifier_name,llm_name,x_test,y_test,classifier_predicted_label,classifier_predicted_label_after_switch,llm_simulation_predicted_label,llm_simulation_raw_prediction,classifier_predicted_label_confidence,x_test_present_in_train_prompt,classifier_predicted_label_correct,llm_simulation_label_correct,llm_direct_label_correct,llm_simulation_label_correct_after_switch
0,1,go_emotions,roberta-base-go_emotions,EleutherAI/gpt-neo-1.3B,I’m really sorry about your situation :( Altho...,sadness,remorse,remorse,No label,regret\nlike,0.67822,False,False,False,False,False
1,1,go_emotions,roberta-base-go_emotions,EleutherAI/gpt-neo-1.3B,It's wonderful because it's awful. At not with.,admiration,admiration,admiration,No label,panic\nplease,0.661048,False,True,False,False,False
2,1,go_emotions,roberta-base-go_emotions,EleutherAI/gpt-neo-1.3B,"Kings fan here, good luck to you guys! Will be...",excitement,optimism,optimism,No label,inspiration\nit,0.549641,True,False,False,False,False
3,1,go_emotions,roberta-base-go_emotions,EleutherAI/gpt-neo-1.3B,"I didn't know that, thank you for teaching me ...",gratitude,gratitude,gratitude,gratitude,gratitude.,0.98302,False,True,True,True,True
4,1,go_emotions,roberta-base-go_emotions,EleutherAI/gpt-neo-1.3B,They got bored from haunting earth for thousan...,neutral,neutral,neutral,No label,disg,0.868827,False,True,False,False,False


In [None]:
# Preview the per-partition (train / val / test) dataset statistics.
data_statistics.head(5)

Unnamed: 0,run_id,dataset_name,concept,classifier_name,partition,is_llm_train_balanced,num_samples,label_counts,label_proportions,avg_text_length,avg_word_count
0,1,go_emotions,sentiment analysis,roberta-base-go_emotions,train,True,43410,"{'neutral': 12823, 'anger': 1547, 'fear': 510,...","{'neutral': 0.3, 'anger': 0.04, 'fear': 0.01, ...",68.4,12.8
1,1,go_emotions,sentiment analysis,roberta-base-go_emotions,val,True,5426,"{'neutral': 1592, 'approval': 355, 'sadness': ...","{'neutral': 0.29, 'approval': 0.07, 'sadness':...",68.2,12.8
2,1,go_emotions,sentiment analysis,roberta-base-go_emotions,test,True,1000,"{'sadness': 26, 'admiration': 82, 'excitement'...","{'sadness': 0.03, 'admiration': 0.08, 'excitem...",67.4,12.6


In [None]:
# Persist every result table as a CSV file in the intermediate/ directory
for csv_path, stats_frame in (
    ('intermediate/model_statistics_gpt.csv', model_statistics),
    ('intermediate/prediction_statistics_gpt.csv', prediction_statistics),
    ('intermediate/data_statistics_gpt.csv', data_statistics),
):
    stats_frame.to_csv(csv_path)