# Experiment example usage

In [1]:
# Change the working directory to the repository root (the parent of
# `intermediate`); the recorded output shows the kernel ending up in
# `llm-workshop`. The imports below and the relative path "tmp/dict.txt"
# are presumably resolved from there.
# NOTE(review): the first path is absolute and machine-specific, so this cell
# only works on the original author's machine — replace with a relative or
# configurable path before sharing.
%cd C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
%cd ..

C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop


In [2]:
from core.base_model import BaseModel
from basic.huggingface_model import HuggingFaceModel
from basic.llm_model import LLMModel
from intermediate.llm_size_experiment import LLMSizeExperiment
from tmp.custom_model import EmotionBERT, EmotionClassifier

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.utils.data import DataLoader
from typing import Union
from datasets import Dataset, load_dataset
import pandas as pd
import random

# Display configuration for printed DataFrames.
# max_columns=None removes the limit on how many columns are shown.
# NOTE(review): width=None is understood to let pandas auto-detect the output
# width — confirm that this behaves as intended inside a notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

## Temporary, for testing

In [3]:
class LLM(BaseModel):
    """Temporary stand-in LLM used to smoke-test the experiment pipeline.

    ``__init__`` loads a Hugging Face causal language model and builds a
    ``text-generation`` pipeline around it, but ``predict`` and
    ``predict_concept`` currently return fixed strings instead of calling that
    pipeline (the intended calls are kept as comments in each method).
    """

    def __init__(self, model_name: str, device: Union[int, str] = 'cpu', **pipeline_kwargs):
        """Load the tokenizer and model and build the generation pipeline.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            device: ``'cpu'`` (default), an integer device index, or another
                device string. Integers and non-``'cpu'`` strings are forwarded
                to the pipeline unchanged.
            **pipeline_kwargs: Extra keyword arguments forwarded to
                ``pipeline``. A ``max_length`` entry is forwarded under the
                name ``max_new_tokens`` instead.
        """
        # Callers specify the generation budget as `max_length`; hand it to the
        # pipeline as `max_new_tokens`.
        if 'max_length' in pipeline_kwargs:
            pipeline_kwargs['max_new_tokens'] = pipeline_kwargs.pop('max_length')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model     = AutoModelForCausalLM.from_pretrained(model_name)
        # Start on CPU; the target device is given to the pipeline below.
        self.model.to('cpu')
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self._pipeline_device(device),
            **pipeline_kwargs
        )

    @staticmethod
    def _pipeline_device(device: Union[int, str]) -> Union[int, str]:
        """Translate the constructor's ``device`` into the pipeline's ``device``.

        ``'cpu'`` becomes ``-1`` as before. Every other value is passed through
        untouched, so an integer such as ``-1`` or ``1`` and a string such as
        ``'cuda:1'`` are no longer all collapsed onto GPU ``0``.
        """
        if isinstance(device, str) and device == 'cpu':
            return -1
        return device

    def train(self, train_loader: DataLoader, x_train, y_train, prompt) -> None:
        """No-op: this stub performs no training and ignores all arguments."""
        pass

    def predict(self, input_text: str) -> str:
        """Return the fixed label ``'joy'`` regardless of ``input_text``.

        Stub for testing; the intended pipeline-based implementation is:
        """
        # outputs = self.generator(input_text, num_return_sequences=1)
        # return outputs[0]["generated_text"][len(input_text):].strip()
        return 'joy'

    def predict_concept(self, input_text: str) -> str:
        """Return a fixed concept string regardless of ``input_text``.

        Stub for testing; the intended pipeline-based implementation is:
        """
        # outputs = self.generator(input_text, num_return_sequences=1)
        # return outputs[0]["generated_text"][len(input_text):].strip()
        return "Sentimantal sentiment"

## Load data

In [4]:
# Load the "simplified" configuration of the go_emotions dataset; its
# "train", "validation" and "test" splits are converted to DataFrames below.
go_emotions_simplified_data = load_dataset("go_emotions", "simplified")

In [5]:
# Label vocabulary: one emotion name per line of tmp/dict.txt, with
# surrounding whitespace removed. The position of a name in this list is the
# integer label id that prepare_df looks up.
with open("tmp/dict.txt", "r") as f:
    emotions = [raw_line.strip() for raw_line in f]

def prepare_df(df, emotions):
    """Build a two-column (text, sentiment) frame from a raw dataset split.

    Only the first entry of each row's ``labels`` sequence is used, so any
    additional labels on a row are dropped.

    Args:
        df: DataFrame with a ``text`` column and a ``labels`` column whose
            values are indexable sequences of integer label ids.
        emotions: Sequence mapping a label id (its position) to a label name.

    Returns:
        A new DataFrame with columns ``text`` and ``sentiment``. The input
        frame is left unmodified (no helper columns are added to it).
    """
    first_label_id = df["labels"].apply(lambda label_ids: label_ids[0])
    sentiment = first_label_id.apply(lambda idx: emotions[idx])
    # Build the result from a fresh selection so the caller's frame is not
    # mutated as a side effect.
    return df[["text"]].assign(sentiment=sentiment)

In [6]:
# Convert each dataset split to pandas and reduce it to (text, sentiment).
train_df, val_df, test_df = (
    prepare_df(go_emotions_simplified_data[split_name].to_pandas(), emotions)
    for split_name in ("train", "validation", "test")
)
print(train_df.head())

                                                text  sentiment
0  My favourite food is anything I didn't have to...    neutral
1  Now if he does off himself, everyone will thin...    neutral
2                     WHY THE FUCK IS BAYLESS ISOING      anger
3                        To make her feel threatened       fear
4                             Dirty Southern Wankers  annoyance


In [7]:
# Flatten each split into parallel lists of input texts and sentiment labels.
x_train = train_df['text'].tolist()
y_train = train_df['sentiment'].tolist()
x_val = val_df['text'].tolist()
y_val = val_df['sentiment'].tolist()
x_test = test_df['text'].tolist()
y_test = test_df['sentiment'].tolist()
print(x_train[:5])
print(y_train[:5])

# Experiment metadata passed to experiment.run in the cells below.
dataset_name = 'go_emotions'
concept = "sentiment analysis"
concept_keywords = ["sentiment", "emotion"]

# llm_models={} TODO

["My favourite food is anything I didn't have to cook myself.", 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead', 'WHY THE FUCK IS BAYLESS ISOING', 'To make her feel threatened', 'Dirty Southern Wankers']
['neutral', 'neutral', 'anger', 'fear', 'annoyance']


In [9]:
# Temporary for testing: shrink the data to a handful of samples so the
# experiment finishes quickly. Validation and test reuse the training subset,
# so the reported metrics are NOT measured on held-out data.
N_DEBUG_SAMPLES = 10
x_train = x_train[:N_DEBUG_SAMPLES]
y_train = y_train[:N_DEBUG_SAMPLES]
x_val = x_train
y_val = y_train
x_test = x_train
y_test = y_train
dataset_name = 'go_emotions'
concept = "sentiment analysis"
concept_keywords = ["sentiment", "emotion"]

# LLMs under comparison, keyed by the display name used in the result tables.
llm_models = {
    "gpt2-124M": LLM("gpt2", max_length=5, temperature=1.0),
    "gpt2-345M": LLM("gpt2-medium", max_length=5, temperature=1.0),
}
# Alternative configuration (currently disabled):
# llm_models={
#         "Phi-3-mini-4k-instruct": LLMModel(model_name="microsoft/Phi-3-mini-4k-instruct")
#     }

Device set to use cpu
Device set to use cpu


In [8]:
# Testing LLM: manual smoke test of the project's LLMModel wrapper.
# NOTE(review): in the recorded run this cell fails with
# "PackageNotFoundError: No package metadata was found for bitsandbytes",
# and its execution count (8) precedes the cell above (9). Nothing visible
# below uses `a`, so the cell looks safe to remove or guard — confirm.
a = LLMModel(model_name="microsoft/Phi-3-mini-4k-instruct")
a.predict('What a depressing day!')

PackageNotFoundError: No package metadata was found for bitsandbytes

## Run experiment

In [10]:
# Experiment object whose run() is called once per classifier below.
experiment = LLMSizeExperiment()

In [11]:
# Empty accumulators; the results of each experiment run are concatenated
# onto these in later cells.
model_statistics, prediction_statistics, data_statistics = (
    pd.DataFrame() for _ in range(3)
)

### EmotionClassifier

In [12]:
# Project-defined classifier placed on the CPU; it is passed to
# experiment.run with train_classifier=True in the next cell.
custom_classifier = EmotionClassifier(device='cpu')

In [13]:
# Data splits and task description handed to the experiment.
run_data = dict(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    x_val=x_val,
    y_val=y_val,
    dataset_name=dataset_name,
    concept=concept,
    concept_keywords=concept_keywords,
)

# Prompt templates; the {x_*}/{y_*} placeholders are presumably filled in by
# the experiment — verify against LLMSizeExperiment.run.
run_prompts = {
    "prompt_header_llm_concept": "In 2 words guess, what task is the model doing:\n",
    "prompt_content_llm_concept": "{x_test} -> {y_test}\n",
    "prompt_tail_llm_concept": "What is this task?",
    "prompt_header_llm_train": "You are a classificator\n",
    "prompt_content_llm_train": "{x_train} -> {y_train}\n",
    "prompt_tail_llm_train": "Learn based on this.",
    "prompt_llm_simulation": "{x_test}",
}

# Run with the custom classifier, which is trained as part of the run.
model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    **run_data,
    classifier_name="custom_model",
    classifier=custom_classifier,
    train_classifier=True,
    classifier_train_arguments={"epochs": 2, "batch_size": 2},
    llm_models=llm_models,
    **run_prompts,
)

Labels setup: 8 unique labels found: ['admiration', 'anger', 'annoyance', 'desire', 'fear', 'gratitude', 'neutral', 'surprise']
Epoch 1/2g batch 5/5
  Train Loss: 2.2753
  Val Accuracy: 0.4000
  Val F1 Macro: 0.1875
  Val ROC AUC (Macro OVR): 0.8929
  New best model saved with F1 Macro: 0.1875
Epoch 2/2g batch 5/5
  Train Loss: 1.7603
  Val Accuracy: 0.8000
  Val F1 Macro: 0.7188
  Val ROC AUC (Macro OVR): 1.0000
  New best model saved with F1 Macro: 0.7188
Training finished. Loading best model state.
Running experiment for LLM: gpt2-124M
Running experiment for LLM: gpt2-345M




In [15]:
# Append the latest run's three result tables to the accumulators.
# The (accumulated, latest) pairs are captured before any name is rebound.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

### roberta-base-go_emotions

In [16]:
# Project wrapper around the "SamLowe/roberta-base-go_emotions" checkpoint;
# it is passed to experiment.run with train_classifier=False in the next cell.
HGF = HuggingFaceModel(model_name="SamLowe/roberta-base-go_emotions")

In [17]:
# Data splits and task description handed to the experiment.
run_data = dict(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    x_val=x_val,
    y_val=y_val,
    dataset_name=dataset_name,
    concept=concept,
    concept_keywords=concept_keywords,
)

# Prompt templates; the {x_*}/{y_*} placeholders are presumably filled in by
# the experiment — verify against LLMSizeExperiment.run.
run_prompts = {
    "prompt_header_llm_concept": "In 2 words guess, what task is the model doing:\n",
    "prompt_content_llm_concept": "{x_test} -> {y_test}\n",
    "prompt_tail_llm_concept": "What is this task?",
    "prompt_header_llm_train": "You are a classificator\n",
    "prompt_content_llm_train": "{x_train} -> {y_train}\n",
    "prompt_tail_llm_train": "Learn based on this.",
    "prompt_llm_simulation": "{x_test}",
}

# Run with the pretrained classifier; no training and no training arguments.
model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    **run_data,
    classifier_name="roberta-base-go_emotions",
    classifier=HGF,
    train_classifier=False,
    classifier_train_arguments={},
    llm_models=llm_models,
    **run_prompts,
)

Device set to use cpu


Running experiment for LLM: gpt2-124M
Running experiment for LLM: gpt2-345M




In [18]:
# Append the latest run's three result tables to the accumulators.
# The (accumulated, latest) pairs are captured before any name is rebound.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

## View results

In [19]:
# Preview the accumulated metrics table; the recorded output has one row per
# (classifier, LLM) pair.
model_statistics.head()

Unnamed: 0,run_id,dataset_name,classifier,llm,classifier_accuracy,classifier_precision,classifier_recall,classifier_f1,classifier_balanced_accuracy,classifier_cohen_kappa,classifier_mcc,llm_concept_accuracy,llm_simulation_accuracy,llm_simulation_precision,llm_simulation_recall,llm_simulation_f1,llm_simulation_balanced_accuracy,llm_simulation_cohen_kappa,llm_simulation_mcc,llm_direct_prediction_accuracy,llm_direct_precision,llm_direct_recall,llm_direct_f1,llm_direct_balanced_accuracy,llm_direct_cohen_kappa,llm_direct_mcc,prompt_header_llm_concept,prompt_content_llm_concept,prompt_tail_llm_concept,prompt_header_llm_train,prompt_content_llm_train,prompt_tail_llm_train,prompt_llm_simulation,llm_predicted_concept
0,1,go_emotions,custom_model,gpt2-124M,0.8,0.7,0.75,0.71875,0.75,0.75,0.782461,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Sentimantal sentiment
1,1,go_emotions,custom_model,gpt2-345M,0.8,0.7,0.75,0.71875,0.75,0.75,0.782461,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Sentimantal sentiment
2,2,go_emotions,roberta-base-go_emotions,gpt2-124M,0.6,0.444444,0.518519,0.470899,0.583333,0.518072,0.538173,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Sentimantal sentiment
3,2,go_emotions,roberta-base-go_emotions,gpt2-345M,0.6,0.444444,0.518519,0.470899,0.583333,0.518072,0.538173,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Sentimantal sentiment


In [20]:
# Preview the per-example predictions: true label, classifier prediction with
# its confidence, and the label predicted by the LLM simulation.
prediction_statistics.head()

Unnamed: 0,run_id,dataset_name,classifier_name,llm_name,x_test,y_test,classifier_predicted_label,classifier_predicted_label_confidence,llm_simulation_predicted_label
0,1,go_emotions,custom_model,gpt2-124M,My favourite food is anything I didn't have to...,neutral,neutral,0.253798,joy
1,1,go_emotions,custom_model,gpt2-124M,"Now if he does off himself, everyone will thin...",neutral,neutral,0.338002,joy
2,1,go_emotions,custom_model,gpt2-124M,WHY THE FUCK IS BAYLESS ISOING,anger,neutral,0.18888,joy
3,1,go_emotions,custom_model,gpt2-124M,To make her feel threatened,fear,neutral,0.177935,joy
4,1,go_emotions,custom_model,gpt2-124M,Dirty Southern Wankers,annoyance,annoyance,0.182116,joy


In [21]:
# Preview the dataset summary; the recorded output has one row per run and
# partition (train / val / test).
data_statistics.head()

Unnamed: 0,run_id,dataset_name,classifier_name,partition,num_samples,label_counts,label_proportions,avg_text_length,avg_word_count
0,1,go_emotions,custom_model,train,10,"{'neutral': 3, 'anger': 1, 'fear': 1, 'annoyan...","{'neutral': 0.3, 'anger': 0.1, 'fear': 0.1, 'a...",66.1,12.5
1,1,go_emotions,custom_model,val,10,"{'neutral': 3, 'anger': 1, 'fear': 1, 'annoyan...","{'neutral': 0.3, 'anger': 0.1, 'fear': 0.1, 'a...",66.1,12.5
2,1,go_emotions,custom_model,test,10,"{'neutral': 3, 'anger': 1, 'fear': 1, 'annoyan...","{'neutral': 0.3, 'anger': 0.1, 'fear': 0.1, 'a...",66.1,12.5
3,2,go_emotions,roberta-base-go_emotions,train,10,"{'neutral': 3, 'anger': 1, 'fear': 1, 'annoyan...","{'neutral': 0.3, 'anger': 0.1, 'fear': 0.1, 'a...",66.1,12.5
4,2,go_emotions,roberta-base-go_emotions,val,10,"{'neutral': 3, 'anger': 1, 'fear': 1, 'annoyan...","{'neutral': 0.3, 'anger': 0.1, 'fear': 0.1, 'a...",66.1,12.5
