# Experiment

In [1]:
# Move the kernel's working directory into `intermediate/` and then up to its parent,
# the `llm-workshop` repository root (see the two paths echoed in the cell output).
# NOTE(review): the absolute path only exists on one Windows machine. Presumably the
# directory change is needed so that the project-local imports and relative paths such
# as "advanced/dict.txt" resolve — confirm, then replace with a portable path.
%cd C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
%cd ..

C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop


In [2]:
from core.base_model import BaseModel
from basic.huggingface_model import HuggingFaceModel
from basic.llm_model import LLMModel
from intermediate.llm_size_experiment import LLMSizeExperiment
from tmp.custom_model import EmotionBERT, EmotionClassifier 
# from advanced.custom_model import EmotionBERT, EmotionClassifier

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.utils.data import DataLoader
import torch
from typing import Union
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

## Load data

In [3]:
# Load data set
# Loads the "simplified" configuration of the "go_emotions" dataset. Its "train",
# "validation" and "test" splits are converted to pandas DataFrames further down.
go_emotions_simplified_data = load_dataset("go_emotions", "simplified")

In [4]:
# Read the emotion names, one per line. The position of a name in the resulting
# list is the integer label id that prepare_df uses to look it up.
with open("advanced/dict.txt", "r") as labels_file:
    emotions = [raw_line.strip() for raw_line in labels_file]

def prepare_df(df, emotions):
    """Reduce a dataset split to a single-label ``text`` / ``sentiment`` frame.

    Only the first entry of each row's ``labels`` sequence is used; any further
    label ids on the same row are ignored.

    Unlike the previous implementation, the input frame is left untouched: no
    helper columns (``label``, ``sentiment``) are written back into ``df``.

    Args:
        df: DataFrame with a ``text`` column and a ``labels`` column whose
            entries are non-empty, indexable sequences of integer label ids.
        emotions: Sequence of emotion names indexed by label id.

    Returns:
        A new DataFrame with the columns ``text`` and ``sentiment`` and the
        same index as ``df``.

    Raises:
        IndexError: If a ``labels`` entry is empty or its first id is outside
            the range of ``emotions``.
    """
    # Map the first label id of every row to its emotion name.
    sentiment = df["labels"].apply(lambda label_ids: emotions[label_ids[0]])
    # df[["text"]] is a new frame, so assign() never writes into the caller's df.
    return df[["text"]].assign(sentiment=sentiment)

In [5]:
# Build one text/sentiment DataFrame per dataset split
train_df, val_df, test_df = (
    prepare_df(go_emotions_simplified_data[split_name].to_pandas(), emotions)
    for split_name in ("train", "validation", "test")
)

# Report the size of every split and preview the training frame
print(f"len(train_df) {len(train_df)}")
print(f"len(val_df) {len(val_df)}")
print(f"len(test_df) {len(test_df)}")
print(train_df.head())

len(train_df) 43410
len(val_df) 5426
len(test_df) 5427
                                                text  sentiment
0  My favourite food is anything I didn't have to...    neutral
1  Now if he does off himself, everyone will thin...    neutral
2                     WHY THE FUCK IS BAYLESS ISOING      anger
3                        To make her feel threatened       fear
4                             Dirty Southern Wankers  annoyance


In [6]:
# Pull the inputs (x: text) and targets (y: sentiment name) of every split out
# of its DataFrame as plain Python lists of strings
x_train, y_train = (train_df[column].tolist() for column in ('text', 'sentiment'))
x_val, y_val = (val_df[column].tolist() for column in ('text', 'sentiment'))
x_test, y_test = (test_df[column].tolist() for column in ('text', 'sentiment'))

## Custom model setup

In [7]:
# # Prepare subset for training
# x_train_subset = x_train[:1000]
# y_train_subset = y_train[:1000]
# x_val_subset = x_val[:500]
# y_val_subset = y_val[:500]
# device = 'cpu'
# emotions_subset = np.unique(y_train)

# print('for y_train_subset: ', np.unique(y_train_subset), ' emotion count: ', len(np.unique(y_train_subset)))
# print('for y_val_subset: ', np.unique(y_val_subset), ' emotion count: ', len(np.unique(y_val_subset)))

In [8]:
# # Train the model
# custom_classifier = EmotionClassifier(device=device)
# custom_classifier.train(x_train_subset, y_train_subset, x_val_subset, y_val_subset)

## Experiment parameters

In [9]:
# Used only for training classifiers
# x_train = x_train[:1000]
# y_train = y_train[:1000]
# x_val = x_val[:500]
# y_val = y_val[:500]

# Sample budget shared by classifier testing, LLM training and LLM testing:
# only the first `llm_train_samples` test examples are kept.
llm_train_samples = 50
x_test, y_test = x_test[:llm_train_samples], y_test[:llm_train_samples]
max_samples_for_llm_train = 2 * llm_train_samples
max_samples_for_concept = llm_train_samples

# Show which emotions (and how many distinct ones) each split contains
print(f'for y_train:  {np.unique(y_train)}  emotion count:  {len(np.unique(y_train))}')
print(f'for y_val:  {np.unique(y_val)}  emotion count:  {len(np.unique(y_val))}')
print(f'for y_test:  {np.unique(y_test)}  emotion count:  {len(np.unique(y_test))}')

# Experiment metadata passed through to experiment.run
dataset_name = 'go_emotions'
concept = "sentiment analysis"
# Empty by default; example of a non-empty value: {"admiration": "joy", "amusement":"sadness"}
# NOTE(review): presumably maps a classifier label to the label that replaces it — confirm in LLMSizeExperiment.
switched_classifier_prediction_labels = {}
force_balanced_llm_train = True

# Alternative model line-ups (kept for reference):
# llm_model_names = ['EleutherAI/gpt-neo-125M', 'EleutherAI/gpt-neo-1.3B', 'EleutherAI/gpt-j-6B', 'EleutherAI/gpt-neox-20b']
# llm_model_names = ['EleutherAI/pythia-70m', 'EleutherAI/pythia-160m', 'EleutherAI/pythia-410m', 'EleutherAI/pythia-1.4b',
#                    'EleutherAI/pythia-2.8b', 'EleutherAI/pythia-6.9b', 'EleutherAI/pythia-12b']
# llm_model_names = ['EleutherAI/gpt-neo-125M', 'EleutherAI/gpt-neo-1.3B', 'EleutherAI/gpt-j-6B']
# llm_model_names = ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b']
# llm_model_names = ['EleutherAI/gpt-neo-125M', 'facebook/opt-350m']

# Default line-up: the Pythia checkpoints from 70m up to 6.9b, in the listed order
llm_model_names = [
    f'EleutherAI/pythia-{size}'
    for size in ('70m', '160m', '410m', '1.4b', '2.8b', '6.9b')
]

# Prompt pieces for the concept-guessing step: header, one content line per example, tail.
# NOTE(review): "In the task is:" looks like a typo (perhaps "The task is:"). It is a
# runtime prompt string, so it is left unchanged here — confirm before editing.
prompt_header_llm_concept = "In few words I will guess, what the task is.\n"
prompt_content_llm_concept = "{x_test} -> {y_test}\n"
prompt_tail_llm_concept = "In the task is:"

# Prompt pieces for the LLM training step: header, one content line per example, empty tail.
# NOTE(review): "classificator" is probably meant to be "classifier"; left unchanged for the same reason.
prompt_header_llm_train = (
    "I am a classificator. I will find sentiment and answer "
    "in exactly 1 word from the words listed below.\n"
)
prompt_content_llm_train = "{x_train} -> {y_train}\n"
prompt_tail_llm_train = ""

# Prompt for the simulation step
prompt_llm_simulation = "{x_test} -> "

for y_train:  ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 'surprise']  emotion count:  28
for y_val:  ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 'surprise']  emotion count:  28
for y_test:  ['admiration' 'amusement' 'annoyance' 'approval' 'caring' 'confusion'
 'curiosity' 'desire' 'disapproval' 'excitement' 'fear' 'gratitude' 'joy'
 'neutral' 'optimism' 'remorse' 'sadness']  emotion count:  17


## Run experiment
1. The classifier is trained on the train and validation data (only if `train_classifier` is set to true). The classifier then predicts on the test data.
2. For each LLM, the following steps are performed:
    1. The LLM predicts the concept based on the first `max_samples_for_concept` samples of x_test and the classifier's predictions for them.
    2. The LLM is trained on the first `max_samples_for_llm_train` samples of x_test and the classifier's predictions for them.
    3. The LLM simulates the classifier by predicting the sentiment.

In [10]:
# Experiment object whose run(...) method is called in the classifier sections below
experiment = LLMSizeExperiment()

In [11]:
# Empty accumulators; the frames returned by experiment.run are concatenated onto them later
model_statistics, prediction_statistics, data_statistics = (
    pd.DataFrame() for _ in range(3)
)

### EmotionClassifier

In [12]:
# # Train model in the experiment
# custom_classifier = EmotionClassifier(device='cpu')

# train_classifier = True
# classifier_train_arguments = {"epochs": 2, "batch_size": 2}

In [13]:
# # Load trained model
# device='cpu'
# custom_classifier = EmotionClassifier(device=device)
# custom_classifier.model = EmotionBERT(n_classes=len(emotions)).to(device)
# custom_classifier._setup_labels(sorted(emotions))
# custom_classifier.model.load_state_dict(torch.load('tmp/best_model_state.pth'))

# train_classifier = False

In [14]:
# model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
#     x_train=x_train,
#     y_train=y_train,
#     x_test=x_test,
#     y_test=y_test,
#     x_val=x_val,
#     y_val=y_val,
#     max_samples_for_llm_train=max_samples_for_llm_train,
#     dataset_name=dataset_name,
#     concept=concept,
#     max_samples_for_concept=max_samples_for_concept,
#     force_balanced_llm_train=force_balanced_llm_train,
#     switched_classifier_prediction_labels=switched_classifier_prediction_labels,
    
#     # Classifier parameters
#     classifier_name="custom_classifier",
#     classifier=custom_classifier,
#     train_classifier=False,
#     classifier_train_arguments = {},
    
#     llm_model_names=llm_model_names,
    
#     prompt_header_llm_concept    = prompt_header_llm_concept,
#     prompt_content_llm_concept   = prompt_content_llm_concept,
#     prompt_tail_llm_concept      = prompt_tail_llm_concept,
    
#     prompt_header_llm_train      = prompt_header_llm_train,
#     prompt_content_llm_train     = prompt_content_llm_train,
#     prompt_tail_llm_train        = prompt_tail_llm_train,
    
#     prompt_llm_simulation        = prompt_llm_simulation
# )

In [15]:
# model_statistics = pd.concat([model_statistics, model_statistics_tmp], ignore_index=True)
# prediction_statistics = pd.concat([prediction_statistics, prediction_statistics_tmp], ignore_index=True)
# data_statistics = pd.concat([data_statistics, data_statistics_tmp], ignore_index=True)

### roberta-base-go_emotions

In [16]:
# Wrap the "SamLowe/roberta-base-go_emotions" checkpoint with GPU use switched off;
# this object is passed as `classifier` to experiment.run in the cells below.
HGF = HuggingFaceModel(model_name="SamLowe/roberta-base-go_emotions", use_gpu=False)

In [26]:
# Run the experiment with a single LLM; this replaces the default model list
# from the parameters cell.
# NOTE(review): the *_tmp results are overwritten by every later experiment.run
# cell and are only concatenated into the cumulative frames once, after the last one.
llm_model_names = ['EleutherAI/gpt-j-6B']

(
    model_statistics_tmp,
    prediction_statistics_tmp,
    data_statistics_tmp,
) = experiment.run(
    # Dataset and task description
    dataset_name=dataset_name,
    concept=concept,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    x_test=x_test,
    y_test=y_test,
    # Sampling limits and label handling
    max_samples_for_concept=max_samples_for_concept,
    max_samples_for_llm_train=max_samples_for_llm_train,
    force_balanced_llm_train=force_balanced_llm_train,
    switched_classifier_prediction_labels=switched_classifier_prediction_labels,
    # Classifier parameters (train_classifier=False: no classifier training is requested)
    classifier_name="roberta-base-go_emotions",
    classifier=HGF,
    train_classifier=False,
    classifier_train_arguments={},
    # LLMs to evaluate
    llm_model_names=llm_model_names,
    # Prompt templates
    prompt_header_llm_concept=prompt_header_llm_concept,
    prompt_content_llm_concept=prompt_content_llm_concept,
    prompt_tail_llm_concept=prompt_tail_llm_concept,
    prompt_header_llm_train=prompt_header_llm_train,
    prompt_content_llm_train=prompt_content_llm_train,
    prompt_tail_llm_train=prompt_tail_llm_train,
    prompt_llm_simulation=prompt_llm_simulation,
)

Classifier is predicting sentiments...




Classifying task done.

Loading model: EleutherAI/gpt-j-6B


Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

Model loaded successfully on cpu
Running experiment for LLM: EleutherAI/gpt-j-6B -----------------
LLM guessing context...guessed:  "You have a choice
Training LLM based on classifier's inputs and outputs...
Training completed with 50 examples
LLM simulating classifier...
sentence:  1 / 50  | true label: sadness  | classifier label: remorse  | LLM prediction: “  | LLM label: No label
sentence:  2 / 50  | true label: admiration  | classifier label: admiration  | LLM prediction: admiration  | LLM label: admiration
sentence:  3 / 50  | true label: excitement  | classifier label: optimism  | LLM prediction: if  | LLM label: No label


KeyboardInterrupt: 

In [23]:
# Run the experiment with a single LLM; this replaces the previously assigned model list.
# NOTE(review): the saved output below this cell reports loading
# 'EleutherAI/gpt-neo-125M', not the 1.3B model requested here — the output looks
# stale and should be regenerated.
llm_model_names = ['EleutherAI/gpt-neo-1.3B']

(
    model_statistics_tmp,
    prediction_statistics_tmp,
    data_statistics_tmp,
) = experiment.run(
    # Dataset and task description
    dataset_name=dataset_name,
    concept=concept,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    x_test=x_test,
    y_test=y_test,
    # Sampling limits and label handling
    max_samples_for_concept=max_samples_for_concept,
    max_samples_for_llm_train=max_samples_for_llm_train,
    force_balanced_llm_train=force_balanced_llm_train,
    switched_classifier_prediction_labels=switched_classifier_prediction_labels,
    # Classifier parameters (train_classifier=False: no classifier training is requested)
    classifier_name="roberta-base-go_emotions",
    classifier=HGF,
    train_classifier=False,
    classifier_train_arguments={},
    # LLMs to evaluate
    llm_model_names=llm_model_names,
    # Prompt templates
    prompt_header_llm_concept=prompt_header_llm_concept,
    prompt_content_llm_concept=prompt_content_llm_concept,
    prompt_tail_llm_concept=prompt_tail_llm_concept,
    prompt_header_llm_train=prompt_header_llm_train,
    prompt_content_llm_train=prompt_content_llm_train,
    prompt_tail_llm_train=prompt_tail_llm_train,
    prompt_llm_simulation=prompt_llm_simulation,
)

Classifier is predicting sentiments...




Classifying task done.

Loading model: EleutherAI/gpt-neo-125M
Model loaded successfully on cpu
Running experiment for LLM: EleutherAI/gpt-neo-125M -----------------
LLM guessing context...guessed:  In few words I will guess, what the task is.
I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett! -> remorse
It's wonderful because it's awful. At not with. -> admiration
Kings fan here, good luck to you guys! Will be an interesting game to watch!  -> optimism
I didn't know that, thank you for teaching me something today! -> gratitude
They got bored from haunting earth for thousands of years and ultimately moved on to the afterlife. -> neutral
Thank you for asking questions and recognizing that there may be things that you don’t know or understand about police tactics. Seriously. Thank you. -> gratitude
You’re welcome -> gratitude
100%! Congrats on your job too! -> admiration
I’m sorry to hear that friend :(. It’s for the best most likely if sh


KeyboardInterrupt



In [24]:
# Run the experiment for the four smallest Pythia checkpoints; this replaces the
# previously assigned model list.
llm_model_names = [
    'EleutherAI/pythia-70m',
    'EleutherAI/pythia-160m',
    'EleutherAI/pythia-410m',
    'EleutherAI/pythia-1.4b',
]

(
    model_statistics_tmp,
    prediction_statistics_tmp,
    data_statistics_tmp,
) = experiment.run(
    # Dataset and task description
    dataset_name=dataset_name,
    concept=concept,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    x_test=x_test,
    y_test=y_test,
    # Sampling limits and label handling
    max_samples_for_concept=max_samples_for_concept,
    max_samples_for_llm_train=max_samples_for_llm_train,
    force_balanced_llm_train=force_balanced_llm_train,
    switched_classifier_prediction_labels=switched_classifier_prediction_labels,
    # Classifier parameters (train_classifier=False: no classifier training is requested)
    classifier_name="roberta-base-go_emotions",
    classifier=HGF,
    train_classifier=False,
    classifier_train_arguments={},
    # LLMs to evaluate
    llm_model_names=llm_model_names,
    # Prompt templates
    prompt_header_llm_concept=prompt_header_llm_concept,
    prompt_content_llm_concept=prompt_content_llm_concept,
    prompt_tail_llm_concept=prompt_tail_llm_concept,
    prompt_header_llm_train=prompt_header_llm_train,
    prompt_content_llm_train=prompt_content_llm_train,
    prompt_tail_llm_train=prompt_tail_llm_train,
    prompt_llm_simulation=prompt_llm_simulation,
)

Classifier is predicting sentiments...




Classifying task done.

Loading model: EleutherAI/pythia-70m
Model loaded successfully on cpu
Running experiment for LLM: EleutherAI/pythia-70m -----------------
LLM guessing context...guessed:  1) If
Training LLM based on classifier's inputs and outputs...
Training completed with 50 examples
LLM simulating classifier...
sentence:  1 / 50  | true label: sadness  | classifier label: remorse  | LLM prediction: do you  | LLM label: No label
sentence:  2 / 50  | true label: admiration  | classifier label: admiration  | LLM prediction: at the  | LLM label: No label
sentence:  3 / 50  | true label: excitement  | classifier label: optimism  | LLM prediction: ive noticed that  | LLM label: No label
sentence:  4 / 50  | true label: gratitude  | classifier label: gratitude  | LLM prediction: so now  | LLM label: No label
sentence:  5 / 50  | true label: neutral  | classifier label: neutral  | LLM prediction: ive
been  | LLM label: No label
sentence:  6 / 50  | true label: gratitude  | classifier

In [None]:
# Append the most recent run's frames to the cumulative statistics.
# NOTE(review): re-running this cell appends the same *_tmp frames again, and only
# the *_tmp values of the last executed experiment.run cell are picked up here.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

## View results

In [None]:
# Preview the first five rows (head's default row count)
model_statistics.head()

In [None]:
# Preview the first five rows (head's default row count)
prediction_statistics.head()

In [None]:
# Preview the first five rows (head's default row count)
data_statistics.head()

In [None]:
# model_statistics.to_csv('intermediate/model_statistics.csv')
# prediction_statistics.to_csv('intermediate/prediction_statistics.csv')
# data_statistics.to_csv('intermediate/data_statistics.csv')