# Experiment testing

In [1]:
# NOTE(review): machine-specific absolute path - this cell only runs on the author's machine.
# The two %cd calls end in the repository root (llm-workshop), presumably so that the
# project imports in the next cell resolve - confirm, and prefer a relative or configurable path.
%cd C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
%cd ..

C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop


In [2]:
from llm_size_experiment import LLMSizeExperiment
from core.base_model import BaseModel
from tmp.custom_model import EmotionBERT, EmotionClassifier

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.utils.data import DataLoader
from typing import Union
import pandas as pd
import random

# Display configuration: both limits are set to None so that the wide result
# tables at the end of the notebook are not cut down to a column/width limit.
pd.options.display.max_columns = None
pd.options.display.width = None

## Temporary model wrappers, for testing

In [3]:
class HuggingFaceModel(BaseModel):
    """Adapter that exposes a Hugging Face text-classification pipeline as a BaseModel.

    The checkpoint is used as-is for inference; ``train`` is intentionally a no-op.
    """

    def __init__(
        self,
        model_name: str = "distilbert-base-uncased",
        device: Union[int, str] = 0,
        batch_size: int = 32,
    ):
        """Load ``model_name`` once and wrap it in a text-classification pipeline.

        Previously ``model_name`` was loaded and then discarded in favour of a
        hard-coded "SamLowe/roberta-base-go_emotions" checkpoint; the requested
        checkpoint is now the one actually used, and it is loaded only once.

        Args:
            model_name: Hub id or local path of a sequence-classification checkpoint.
                NOTE(review): the default is a base checkpoint; pass a fine-tuned
                classifier (e.g. "SamLowe/roberta-base-go_emotions") for meaningful labels.
            device: Device handed to ``pipeline``. Defaults to 0, the value that
                used to be hard-coded.
            batch_size: Pipeline batch size. Defaults to 32, the value that used
                to be hard-coded.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # top_k=None asks the pipeline for the scores of all labels, not just one.
        self.model = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            device=device,
            batch_size=batch_size,
            top_k=None,
        )

    def predict(self, input_text: str) -> dict:
        """Classify one text and return a single-entry ``{label: score}`` dict.

        The entry is the first result the pipeline reports for ``input_text``
        (presumably the highest-scoring label, given ``top_k=None`` - verify
        against the installed transformers version).
        """
        first_result = self.model(input_text)[0][0]
        return {first_result['label']: first_result['score']}

    def train(self, train_loader: DataLoader) -> None:
        """No-op: the pretrained checkpoint is not fine-tuned by this wrapper."""
        pass

class LLM(BaseModel):
    """Causal language model wrapped in a text-generation pipeline.

    NOTE: ``predict`` and ``predict_concept`` are test stubs that return fixed
    strings; the real generation calls are kept as comments inside them.
    """

    def __init__(self, model_name: str, device: Union[int, str] = 'cpu', **pipeline_kwargs):
        """Load tokenizer and model for ``model_name`` and build the generator pipeline.

        Args:
            model_name: Hub id or local path of a causal-LM checkpoint.
            device: ``'cpu'`` (default) or any device value accepted by ``pipeline``
                (e.g. ``0``, ``-1``, ``'cuda:1'``). The value is now passed through;
                before, everything except ``'cpu'`` was silently mapped to GPU 0,
                including ``-1`` and explicit device strings.
            **pipeline_kwargs: Extra keyword arguments for ``pipeline``. A
                ``max_length`` entry is renamed to ``max_new_tokens``.
        """
        # The rename replaces any max_new_tokens that was passed alongside max_length.
        if 'max_length' in pipeline_kwargs:
            pipeline_kwargs['max_new_tokens'] = pipeline_kwargs.pop('max_length')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        # 'cpu' keeps its historical encoding (-1); any other value is forwarded as given.
        # Device placement is left to the pipeline, so no explicit .to() call is made here.
        pipeline_device = -1 if device == 'cpu' else device
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=pipeline_device,
            **pipeline_kwargs
        )

    def train(self, train_loader: DataLoader, x_train, y_train, prompt) -> None:
        """No-op placeholder; nothing is trained or stored."""
        pass

    def predict(self, input_text: str) -> str:
        """Stub: always returns ``'joy'`` regardless of ``input_text``."""
        # Real implementation, disabled while testing:
        #     outputs = self.generator(input_text, num_return_sequences=1)
        #     return outputs[0]["generated_text"][len(input_text):].strip()
        return 'joy'

    def predict_concept(self, input_text: str) -> str:
        """Stub: always returns the same fixed concept string."""
        # Real implementation, disabled while testing:
        #     outputs = self.generator(input_text, num_return_sequences=1)
        #     return outputs[0]["generated_text"][len(input_text):].strip()
        return "Sentimantal sentiment"

## Load data

In [4]:
# Tiny hand-written emotion dataset used to smoke-test the experiment pipeline.
# NOTE: the texts are defined inline here; `dataset_name` below is only the label
# that is passed on to experiment.run.
x_train = [
    "I am so happy today!",
    "This is really sad news.",
    "I feel quite angry about this.",
    "Feeling joyful and excited.",
    "What a depressing situation.",
    "He was furious with the outcome.",
]
y_train = ["joy", "sadness", "anger", "joy", "sadness", "anger"]

x_val = ["This is great!", "I am very upset."]
y_val = ["joy", "anger"]

x_test = ["This is not great!", "I am very upbeat."]
y_test = ["sadness", "joy"]

dataset_name = 'go_emotions'
concept = "sentiment analysis"
concept_keywords = ["sentiment", "emotion"]

# Every text needs exactly one label; fail here rather than deep inside the experiment.
assert len(x_train) == len(y_train), "x_train / y_train length mismatch"
assert len(x_val) == len(y_val), "x_val / y_val length mismatch"
assert len(x_test) == len(y_test), "x_test / y_test length mismatch"

## Run experiment

In [5]:
# Experiment runner imported from llm_size_experiment; created once and reused by both runs below.
experiment = LLMSizeExperiment()

In [6]:
# Three independent, empty accumulators; the statistics returned by each
# experiment run are concatenated onto them in later cells.
model_statistics, prediction_statistics, data_statistics = (
    pd.DataFrame() for _ in range(3)
)

### EmotionClassifier

In [7]:
# Project classifier from tmp.custom_model, constructed with device='cpu';
# the run below is called with train_classifier=True and passes its training arguments.
classifier = EmotionClassifier(device='cpu')

In [8]:
# Prompt templates, forwarded unchanged to experiment.run as keyword arguments.
prompt_templates = {
    "prompt_header_llm_concept": "In 2 words guess, what task is the model doing:\n",
    "prompt_content_llm_concept": "{x_test} -> {y_test}\n",
    "prompt_tail_llm_concept": "What is this task?",
    "prompt_header_llm_train": "You are a classificator\n",
    "prompt_content_llm_train": "{x_train} -> {y_train}\n",
    "prompt_tail_llm_train": "Learn based on this.",
    "prompt_llm_simulation": "{x_test}",
}

# LLMs to compare, keyed by the name that shows up in the result tables.
llm_candidates = {
    "gpt2-124M": LLM("gpt2", max_length=5, temperature=1.0),
    "gpt2-345M": LLM("gpt2-medium", max_length=5, temperature=1.0),
}

# Run for the EmotionClassifier, with classifier training enabled.
model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    # data splits and task description
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    x_val=x_val,
    y_val=y_val,
    dataset_name=dataset_name,
    concept=concept,
    concept_keywords=concept_keywords,
    # classifier under test
    classifier_name="EmotionClassifier",
    classifier=classifier,
    train_classifier=True,
    classifier_train_arguments={"epochs": 2, "batch_size": 2},
    # LLMs and prompts
    llm_models=llm_candidates,
    **prompt_templates,
)

Device set to use cpu
Device set to use cpu


Labels setup: 3 unique labels found: ['anger', 'joy', 'sadness']
Epoch 1/2
  Train Loss: 1.1972
  Val Accuracy: 0.5000
  Val F1 Macro: 0.3333
  Val ROC AUC (Macro OVR): 0.0000
  New best model saved with F1 Macro: 0.3333
Epoch 2/2
  Train Loss: 1.0288
  Val Accuracy: 0.5000
  Val F1 Macro: 0.3333
  Val ROC AUC (Macro OVR): 0.0000
  New best model saved with F1 Macro: 0.3333
Training finished. Loading best model state.
Running experiment for LLM: gpt2-124M
Running experiment for LLM: gpt2-345M


In [9]:
# Append the statistics of the latest run to the three accumulators.
# NOTE: re-running this cell appends the same rows a second time.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

### roberta-base-go_emotions

In [10]:
# Rebinds `classifier` to the HuggingFaceModel wrapper defined earlier in this notebook;
# the following run passes train_classifier=False for it.
classifier=HuggingFaceModel(model_name="SamLowe/roberta-base-go_emotions")




Device set to use cpu
Device set to use cpu


In [11]:
# Prompt templates, forwarded unchanged to experiment.run as keyword arguments.
prompt_templates = {
    "prompt_header_llm_concept": "In 2 words guess, what task is the model doing:\n",
    "prompt_content_llm_concept": "{x_test} -> {y_test}\n",
    "prompt_tail_llm_concept": "What is this task?",
    "prompt_header_llm_train": "You are a classificator\n",
    "prompt_content_llm_train": "{x_train} -> {y_train}\n",
    "prompt_tail_llm_train": "Learn based on this.",
    "prompt_llm_simulation": "{x_test}",
}

# LLMs to compare, keyed by the name that shows up in the result tables.
llm_candidates = {
    "gpt2-124M": LLM("gpt2", max_length=5, temperature=1.0),
    "gpt2-345M": LLM("gpt2-medium", max_length=5, temperature=1.0),
}

# Run for the roberta-base-go_emotions classifier, with classifier training disabled.
model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    # data splits and task description
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    x_val=x_val,
    y_val=y_val,
    dataset_name=dataset_name,
    concept=concept,
    concept_keywords=concept_keywords,
    # classifier under test
    classifier_name="roberta-base-go_emotions",
    classifier=classifier,
    train_classifier=False,
    classifier_train_arguments={},
    # LLMs and prompts
    llm_models=llm_candidates,
    **prompt_templates,
)

Device set to use cpu
Device set to use cpu


Running experiment for LLM: gpt2-124M
Running experiment for LLM: gpt2-345M


In [12]:
# Append the statistics of the latest run to the three accumulators.
# NOTE: re-running this cell appends the same rows a second time.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

## View results

In [13]:
# One row per (classifier, LLM) pair: accuracy columns plus the prompt templates used.
model_statistics

Unnamed: 0,run_id,dataset_name,classifier,llm,classifier_accuracy,llm_concept_accuracy,llm_simulation_accuracy,llm_direct_prediction_accuracy,prompt_header_llm_concept,prompt_content_llm_concept,prompt_tail_llm_concept,prompt_header_llm_train,prompt_content_llm_train,prompt_tail_llm_train,prompt_llm_simulation,llm_predicted_concept
0,1,go_emotions,EmotionClassifier,gpt2-124M,0.5,0.0,1.0,0.5,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Predicted concept
1,1,go_emotions,EmotionClassifier,gpt2-345M,0.5,0.0,1.0,0.5,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Predicted concept
2,2,go_emotions,roberta-base-go_emotions,gpt2-124M,0.5,0.0,0.5,0.5,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Predicted concept
3,2,go_emotions,roberta-base-go_emotions,gpt2-345M,0.5,0.0,0.5,0.5,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Predicted concept


In [14]:
# One row per test text for each (classifier, LLM) pair: true label, classifier prediction, LLM simulation.
prediction_statistics

Unnamed: 0,run_id,dataset_name,classifier_name,llm_name,x_test,y_test,classifier_predicted_label,classifier_predicted_label_confidence,llm_simulation_predicted_label
0,1,go_emotions,EmotionClassifier,gpt2-124M,This is not great!,sadness,joy,0.401358,joy
1,1,go_emotions,EmotionClassifier,gpt2-124M,I am very upbeat.,joy,joy,0.444713,joy
2,1,go_emotions,EmotionClassifier,gpt2-345M,This is not great!,sadness,joy,0.401358,joy
3,1,go_emotions,EmotionClassifier,gpt2-345M,I am very upbeat.,joy,joy,0.444713,joy
4,2,go_emotions,roberta-base-go_emotions,gpt2-124M,This is not great!,sadness,disapproval,0.734601,joy
5,2,go_emotions,roberta-base-go_emotions,gpt2-124M,I am very upbeat.,joy,joy,0.542846,joy
6,2,go_emotions,roberta-base-go_emotions,gpt2-345M,This is not great!,sadness,disapproval,0.734601,joy
7,2,go_emotions,roberta-base-go_emotions,gpt2-345M,I am very upbeat.,joy,joy,0.542846,joy


In [15]:
# One row per data partition (train / val / test) for each run: sample counts and label distribution.
data_statistics

Unnamed: 0,run_id,dataset_name,classifier_name,partition,num_samples,label_counts,label_proportions,avg_text_length,avg_word_count
0,1,go_emotions,EmotionClassifier,train,6,"{'joy': 2, 'sadness': 2, 'anger': 2}","{'joy': 0.33, 'sadness': 0.33, 'anger': 0.33}",26.8,5.0
1,1,go_emotions,EmotionClassifier,val,2,"{'joy': 1, 'anger': 1}","{'joy': 0.5, 'anger': 0.5}",15.0,3.5
2,1,go_emotions,EmotionClassifier,test,2,"{'sadness': 1, 'joy': 1}","{'sadness': 0.5, 'joy': 0.5}",17.5,4.0
3,2,go_emotions,roberta-base-go_emotions,train,6,"{'joy': 2, 'sadness': 2, 'anger': 2}","{'joy': 0.33, 'sadness': 0.33, 'anger': 0.33}",26.8,5.0
4,2,go_emotions,roberta-base-go_emotions,val,2,"{'joy': 1, 'anger': 1}","{'joy': 0.5, 'anger': 0.5}",15.0,3.5
5,2,go_emotions,roberta-base-go_emotions,test,2,"{'sadness': 1, 'joy': 1}","{'sadness': 0.5, 'joy': 0.5}",17.5,4.0
