# Experiment testing

In [1]:
# Enter the `intermediate` folder and step back up, leaving the working
# directory at the repository root (see the two paths echoed below).
# NOTE(review): this absolute path is machine-specific; the notebook will not
# run elsewhere until it is replaced with a relative or configurable path.
%cd C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
%cd ..

C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop\intermediate
C:\Users\Sebastian\Desktop\GitHubRepositories\llm-workshop


In [2]:
from llm_size_experiment import LLMSizeExperiment
from core.base_model import BaseModel

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from torch.utils.data import DataLoader
from typing import Union
import pandas as pd
import random

# Lift pandas' column-count and width limits so wide result tables are shown in full.
for option_name in ("display.max_columns", "display.width"):
    pd.set_option(option_name, None)

## Temporary model wrappers (for testing only)

In [3]:
class HuggingFaceModel(BaseModel):
    """Text classifier backed by a Hugging Face ``text-classification`` pipeline."""

    def __init__(self, model_name: str = "distilbert-base-uncased"):
        """Build the classification pipeline for the checkpoint ``model_name``."""
        self.model = pipeline("text-classification", model=model_name)

    def predict(self, input_text: str) -> str:
        """Classify ``input_text`` and return the label of the first result.

        The raw pipeline output is printed for debugging. Inference runs once
        and the result is reused for both the print and the return value.
        """
        print('predict HF')
        result = self.model(input_text)
        print(result)
        return result[0]["label"]

    def train(self, train_loader: DataLoader) -> None:
        """No-op: this wrapper does not train the underlying pipeline."""
        pass

class LLM(BaseModel):
    """Causal language model wrapped in a Hugging Face ``text-generation`` pipeline.

    This is a temporary test double: ``predict`` and ``predict_concept`` run a
    real generation and print it, but return a randomly chosen placeholder
    instead of the generated text.
    """

    def __init__(self, model_name: str, device: Union[int, str] = 'cpu', **pipeline_kwargs):
        """Load tokenizer and model for ``model_name`` and build the generator.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            device: ``'cpu'`` (default), or any device value the pipeline
                accepts (for example an integer GPU index or ``'cuda:1'``).
            **pipeline_kwargs: Forwarded to ``pipeline``. A ``max_length``
                entry is renamed to ``max_new_tokens``.
        """
        if 'max_length' in pipeline_kwargs:
            pipeline_kwargs['max_new_tokens'] = pipeline_kwargs.pop('max_length')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            # Forward the requested device instead of collapsing every
            # non-'cpu' value to GPU 0; -1 is the pipeline's CPU convention.
            # No explicit model.to(...) is needed: the device is handed to
            # the pipeline here.
            device=-1 if device == 'cpu' else device,
            **pipeline_kwargs
        )

    def _generate_and_log(self, title: str, input_text: str):
        """Generate one sequence for ``input_text``, print it under ``title`` and return it."""
        outputs = self.generator(input_text, num_return_sequences=1)
        print(title)
        print('Prompt:', input_text)
        print('Output:', outputs)
        return outputs

    def train(self, train_loader: DataLoader, x_train, y_train, prompt) -> None:
        """Placeholder: only prints a message, no training is performed."""
        print('Training...')

    def predict(self, input_text: str) -> str:
        """Run generation for ``input_text`` and return a random sentiment label.

        TODO: replace the random stub with the generated continuation, i.e.
        ``outputs[0]["generated_text"][len(input_text):].strip()``.
        """
        self._generate_and_log('Predicting sentiment:', input_text)
        return random.choice(['POSITIVE', 'NEGATIVE'])

    def predict_concept(self, input_text: str) -> str:
        """Run generation for ``input_text`` and return a random placeholder concept.

        TODO: replace the random stub with the generated continuation, i.e.
        ``outputs[0]["generated_text"][len(input_text):].strip()``.
        """
        self._generate_and_log('Predicting concept:', input_text)
        return random.choice(['Sentimeng guessing', 'Somethin different'])

## Load data

In [4]:
# Eight hand-written reviews, each kept next to its sentiment label.
_labelled_reviews = [
    ("I absolutely loved this movie, it was fantastic!", "POSITIVE"),
    ("Worst experience ever. I hated every second.", "NEGATIVE"),
    ("The plot was not intriguing and the characters were flat.", "NEGATIVE"),
    ("Fantastic performance by the lead actor!", "POSITIVE"),
    ("I wouldn't recommend this to anyone.", "NEGATIVE"),
    ("It was not an okay film, boring.", "NEGATIVE"),
    ("What a masterpiece! Truly a work of art.", "POSITIVE"),
    ("It bored me to tears, very dull.", "NEGATIVE"),
]

# Column-oriented view used by the rest of the notebook.
data = {
    "text": [review for review, _ in _labelled_reviews],
    "label": [sentiment for _, sentiment in _labelled_reviews],
}

## Run experiment

In [5]:
# The first `shots` examples form the few-shot training split; the rest are the test split.
shots = 4
texts, labels = data["text"], data["label"]
x_train, x_test = texts[:shots], texts[shots:]
y_train, y_test = labels[:shots], labels[shots:]

In [6]:
# Three independent empty frames that accumulate the results of every run.
model_statistics, prediction_statistics, data_statistics = (
    pd.DataFrame() for _ in range(3)
)

In [7]:
# Experiment runner shared by both `experiment.run(...)` cells below; each call
# returns three result tables (model, prediction and data statistics).
# NOTE(review): the recorded tables show `run_id` going from 1 to 2 across the
# two calls, so the instance appears to keep a run counter -- confirm.
experiment = LLMSizeExperiment()

In [8]:
# Run 1: three GPT-2 checkpoints of increasing size against the SST-2 classifier.
sst2_classifier = HuggingFaceModel("distilbert-base-uncased-finetuned-sst-2-english")

# Display name -> checkpoint id; insertion order fixes the order in which the
# models are loaded and handed to the experiment.
gpt2_checkpoints = {
    "gpt2-124M": "gpt2",
    "gpt2-345M": "gpt2-medium",
    "gpt2-774M": "gpt2-large",
}
gpt2_models = {
    display_name: LLM(checkpoint, max_length=5, temperature=1.0)
    for display_name, checkpoint in gpt2_checkpoints.items()
}

# Prompt templates, passed through to the experiment as keyword arguments.
prompt_templates = {
    "prompt_header_llm_concept": "In 2 words guess, what task is the model doing:\n",
    "prompt_content_llm_concept": "{x_test} -> {y_test}\n",
    "prompt_tail_llm_concept": "What is this task?",
    "prompt_header_llm_train": "You are a classificator\n",
    "prompt_content_llm_train": "{x_train} -> {y_train}\n",
    "prompt_tail_llm_train": "Learn based on this.",
    "prompt_llm_simulation": "{x_test}",
}

model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    dataset_name='test_dataset',
    concept="sentiment analysis",
    concept_keywords=["sentiment", "emotion"],
    classifier_name="distilbert-sst2",
    classifier=sst2_classifier,
    llm_models=gpt2_models,
    **prompt_templates,
)




Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


predict HF
[{'label': 'POSITIVE', 'score': 0.6439165472984314}]
predict HF
[{'label': 'NEGATIVE', 'score': 0.99978107213974}]
predict HF
[{'label': 'POSITIVE', 'score': 0.9998670816421509}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


predict HF
[{'label': 'NEGATIVE', 'score': 0.9997370839118958}]
Running experiment for LLM: gpt2-124M


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting concept:
Prompt: In 2 words guess, what task is the model doing:
I wouldn't recommend this to anyone. -> NEGATIVE
It was not an okay film, boring. -> NEGATIVE
What a masterpiece! Truly a work of art. -> POSITIVE
It bored me to tears, very dull. -> NEGATIVE
What is this task?
Output: [{'generated_text': "In 2 words guess, what task is the model doing:\nI wouldn't recommend this to anyone. -> NEGATIVE\nIt was not an okay film, boring. -> NEGATIVE\nWhat a masterpiece! Truly a work of art. -> POSITIVE\nIt bored me to tears, very dull. -> NEGATIVE\nWhat is this task? It's really boring!"}]
Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: I wouldn't recommend this to anyone.
Output: [{'generated_text': "I wouldn't recommend this to anyone. All I'm saying is"}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: It was not an okay film, boring.
Output: [{'generated_text': 'It was not an okay film, boring. A lot of folks thought'}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: What a masterpiece! Truly a work of art.
Output: [{'generated_text': 'What a masterpiece! Truly a work of art.\n\nThank you,'}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: It bored me to tears, very dull.
Output: [{'generated_text': 'It bored me to tears, very dull.\n\n"It\'s'}]
Running experiment for LLM: gpt2-345M


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting concept:
Prompt: In 2 words guess, what task is the model doing:
I wouldn't recommend this to anyone. -> NEGATIVE
It was not an okay film, boring. -> NEGATIVE
What a masterpiece! Truly a work of art. -> POSITIVE
It bored me to tears, very dull. -> NEGATIVE
What is this task?
Output: [{'generated_text': "In 2 words guess, what task is the model doing:\nI wouldn't recommend this to anyone. -> NEGATIVE\nIt was not an okay film, boring. -> NEGATIVE\nWhat a masterpiece! Truly a work of art. -> POSITIVE\nIt bored me to tears, very dull. -> NEGATIVE\nWhat is this task? - the most boring thing"}]
Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: I wouldn't recommend this to anyone.
Output: [{'generated_text': "I wouldn't recommend this to anyone. There's a lot of"}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: It was not an okay film, boring.
Output: [{'generated_text': 'It was not an okay film, boring. I hated the film so'}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: What a masterpiece! Truly a work of art.
Output: [{'generated_text': 'What a masterpiece! Truly a work of art. Truly, truly a masterpiece'}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: It bored me to tears, very dull.
Output: [{'generated_text': "It bored me to tears, very dull. I couldn't understand any"}]
Running experiment for LLM: gpt2-774M


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting concept:
Prompt: In 2 words guess, what task is the model doing:
I wouldn't recommend this to anyone. -> NEGATIVE
It was not an okay film, boring. -> NEGATIVE
What a masterpiece! Truly a work of art. -> POSITIVE
It bored me to tears, very dull. -> NEGATIVE
What is this task?
Output: [{'generated_text': "In 2 words guess, what task is the model doing:\nI wouldn't recommend this to anyone. -> NEGATIVE\nIt was not an okay film, boring. -> NEGATIVE\nWhat a masterpiece! Truly a work of art. -> POSITIVE\nIt bored me to tears, very dull. -> NEGATIVE\nWhat is this task? -> NEGATIVE\n"}]
Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: I wouldn't recommend this to anyone.
Output: [{'generated_text': "I wouldn't recommend this to anyone. I had one of these"}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: It was not an okay film, boring.
Output: [{'generated_text': "It was not an okay film, boring. This is like, '"}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: What a masterpiece! Truly a work of art.
Output: [{'generated_text': 'What a masterpiece! Truly a work of art. Can anyone tell me what'}]
Predicting sentiment:
Prompt: It bored me to tears, very dull.
Output: [{'generated_text': 'It bored me to tears, very dull.\n\nHe was a'}]


In [9]:
# Append the latest run's tables to the accumulated ones and renumber the rows.
# The right-hand pairs are built before any name is rebound, so each concat
# sees the pre-update accumulator.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([accumulated, latest], ignore_index=True)
    for accumulated, latest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

In [10]:
# Run 2: GPT-Neo 125M and GPT-2 medium against the same SST-2 classifier checkpoint.
sst2_classifier_second = HuggingFaceModel("distilbert-base-uncased-finetuned-sst-2-english")

# Display name -> checkpoint id; insertion order fixes the order in which the
# models are loaded and handed to the experiment.
second_run_checkpoints = {
    "gpt-neo-125M": "EleutherAI/gpt-neo-125M",
    "gpt2-345M": "gpt2-medium",
}
second_run_models = {
    display_name: LLM(checkpoint, max_length=5, temperature=1.0)
    for display_name, checkpoint in second_run_checkpoints.items()
}

# Prompt templates, passed through to the experiment as keyword arguments.
second_run_prompts = {
    "prompt_header_llm_concept": "In 2 words guess, what task is the model doing:\n",
    "prompt_content_llm_concept": "{x_test} -> {y_test}\n",
    "prompt_tail_llm_concept": "What is this task?",
    "prompt_header_llm_train": "You are a classificator\n",
    "prompt_content_llm_train": "{x_train} -> {y_train}\n",
    "prompt_tail_llm_train": "Learn based on this.",
    "prompt_llm_simulation": "{x_test}",
}

model_statistics_tmp, prediction_statistics_tmp, data_statistics_tmp = experiment.run(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    dataset_name='test_dataset',
    concept="sentiment analysis",
    concept_keywords=["sentiment", "emotion"],
    classifier_name="distilbert-sst2_second",
    classifier=sst2_classifier_second,
    llm_models=second_run_models,
    **second_run_prompts,
)

Device set to use cpu


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu


predict HF
[{'label': 'POSITIVE', 'score': 0.6439165472984314}]
predict HF
[{'label': 'NEGATIVE', 'score': 0.99978107213974}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


predict HF
[{'label': 'POSITIVE', 'score': 0.9998670816421509}]
predict HF
[{'label': 'NEGATIVE', 'score': 0.9997370839118958}]
Running experiment for LLM: gpt-neo-125M


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting concept:
Prompt: In 2 words guess, what task is the model doing:
I wouldn't recommend this to anyone. -> NEGATIVE
It was not an okay film, boring. -> NEGATIVE
What a masterpiece! Truly a work of art. -> POSITIVE
It bored me to tears, very dull. -> NEGATIVE
What is this task?
Output: [{'generated_text': "In 2 words guess, what task is the model doing:\nI wouldn't recommend this to anyone. -> NEGATIVE\nIt was not an okay film, boring. -> NEGATIVE\nWhat a masterpiece! Truly a work of art. -> POSITIVE\nIt bored me to tears, very dull. -> NEGATIVE\nWhat is this task?\n\nA:\n"}]
Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: I wouldn't recommend this to anyone.
Output: [{'generated_text': "I wouldn't recommend this to anyone.\n\nA:\n"}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: It was not an okay film, boring.
Output: [{'generated_text': 'It was not an okay film, boring. It was a good film'}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: What a masterpiece! Truly a work of art.
Output: [{'generated_text': 'What a masterpiece! Truly a work of art.\n\nI’'}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: It bored me to tears, very dull.
Output: [{'generated_text': 'It bored me to tears, very dull. I was so happy to'}]
Running experiment for LLM: gpt2-345M


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting concept:
Prompt: In 2 words guess, what task is the model doing:
I wouldn't recommend this to anyone. -> NEGATIVE
It was not an okay film, boring. -> NEGATIVE
What a masterpiece! Truly a work of art. -> POSITIVE
It bored me to tears, very dull. -> NEGATIVE
What is this task?
Output: [{'generated_text': "In 2 words guess, what task is the model doing:\nI wouldn't recommend this to anyone. -> NEGATIVE\nIt was not an okay film, boring. -> NEGATIVE\nWhat a masterpiece! Truly a work of art. -> POSITIVE\nIt bored me to tears, very dull. -> NEGATIVE\nWhat is this task? -> NEGATIVE\n"}]
Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: I wouldn't recommend this to anyone.
Output: [{'generated_text': "I wouldn't recommend this to anyone. I think it would be"}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: It was not an okay film, boring.
Output: [{'generated_text': 'It was not an okay film, boring. We thought that the next'}]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicting sentiment:
Prompt: What a masterpiece! Truly a work of art.
Output: [{'generated_text': 'What a masterpiece! Truly a work of art.'}]
Predicting sentiment:
Prompt: It bored me to tears, very dull.
Output: [{'generated_text': 'It bored me to tears, very dull. The whole game is kind'}]


In [11]:
# Fold the second run's tables into the accumulated results, renumbering rows.
# All (accumulated, latest) pairs are captured before the names are rebound.
model_statistics, prediction_statistics, data_statistics = (
    pd.concat([so_far, newest], ignore_index=True)
    for so_far, newest in (
        (model_statistics, model_statistics_tmp),
        (prediction_statistics, prediction_statistics_tmp),
        (data_statistics, data_statistics_tmp),
    )
)

## View results

In [12]:
# One row per (run, LLM): classifier/LLM accuracies, the prompt templates used
# and the concept the LLM "predicted" (a random stub in this notebook).
model_statistics

Unnamed: 0,run_id,dataset_name,classifier,llm,classifier_accuracy,llm_concept_accuracy,llm_simulation_accuracy,prompt_header_llm_concept,prompt_content_llm_concept,prompt_tail_llm_concept,prompt_header_llm_train,prompt_content_llm_train,prompt_tail_llm_train,prompt_llm_simulation,llm_predicted_concept
0,1,test_dataset,distilbert-sst2,gpt2-124M,0.75,0.0,0.25,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Sentimeng guessing
1,1,test_dataset,distilbert-sst2,gpt2-345M,0.75,0.0,0.5,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Somethin different
2,1,test_dataset,distilbert-sst2,gpt2-774M,0.75,0.0,0.75,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Sentimeng guessing
3,2,test_dataset,distilbert-sst2_second,gpt-neo-125M,0.75,0.0,0.25,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Sentimeng guessing
4,2,test_dataset,distilbert-sst2_second,gpt2-345M,0.75,0.0,0.0,"In 2 words guess, what task is the model doing:\n",{x_test} -> {y_test}\n,What is this task?,You are a classificator\n,{x_train} -> {y_train}\n,Learn based on this.,{x_test},Sentimeng guessing


In [13]:
# One row per test example and LLM: true label next to the classifier's and
# the LLM's predicted label.
prediction_statistics

Unnamed: 0,run_id,dataset_name,classifier_name,llm_name,x_test,y_test,classifier_predicted_label,llm_simulation_predicted_label
0,1,test_dataset,distilbert-sst2,gpt2-124M,I wouldn't recommend this to anyone.,NEGATIVE,POSITIVE,NEGATIVE
1,1,test_dataset,distilbert-sst2,gpt2-124M,"It was not an okay film, boring.",NEGATIVE,NEGATIVE,POSITIVE
2,1,test_dataset,distilbert-sst2,gpt2-124M,What a masterpiece! Truly a work of art.,POSITIVE,POSITIVE,NEGATIVE
3,1,test_dataset,distilbert-sst2,gpt2-124M,"It bored me to tears, very dull.",NEGATIVE,NEGATIVE,NEGATIVE
4,1,test_dataset,distilbert-sst2,gpt2-345M,I wouldn't recommend this to anyone.,NEGATIVE,POSITIVE,POSITIVE
5,1,test_dataset,distilbert-sst2,gpt2-345M,"It was not an okay film, boring.",NEGATIVE,NEGATIVE,NEGATIVE
6,1,test_dataset,distilbert-sst2,gpt2-345M,What a masterpiece! Truly a work of art.,POSITIVE,POSITIVE,NEGATIVE
7,1,test_dataset,distilbert-sst2,gpt2-345M,"It bored me to tears, very dull.",NEGATIVE,NEGATIVE,POSITIVE
8,1,test_dataset,distilbert-sst2,gpt2-774M,I wouldn't recommend this to anyone.,NEGATIVE,POSITIVE,NEGATIVE
9,1,test_dataset,distilbert-sst2,gpt2-774M,"It was not an okay film, boring.",NEGATIVE,NEGATIVE,NEGATIVE


In [14]:
# Per-partition dataset summary (sample count, label distribution, text length).
# NOTE(review): every row in the recorded output is labelled `train`, yet the
# label counts alternate between the train split (2/2) and the test split
# (3/1) -- check how LLMSizeExperiment fills the `partition` column.
data_statistics

Unnamed: 0,run_id,dataset_name,classifier_name,llm_name,partition,num_samples,label_counts,label_proportions,avg_text_length,avg_word_count
0,1,test_dataset,distilbert-sst2,gpt2-124M,train,4,"{'POSITIVE': 2, 'NEGATIVE': 2}","{'POSITIVE': 0.5, 'NEGATIVE': 0.5}",47.2,7.8
1,1,test_dataset,distilbert-sst2,gpt2-124M,train,4,"{'NEGATIVE': 3, 'POSITIVE': 1}","{'NEGATIVE': 0.75, 'POSITIVE': 0.25}",47.2,7.8
2,1,test_dataset,distilbert-sst2,gpt2-345M,train,4,"{'POSITIVE': 2, 'NEGATIVE': 2}","{'POSITIVE': 0.5, 'NEGATIVE': 0.5}",47.2,7.8
3,1,test_dataset,distilbert-sst2,gpt2-345M,train,4,"{'NEGATIVE': 3, 'POSITIVE': 1}","{'NEGATIVE': 0.75, 'POSITIVE': 0.25}",47.2,7.8
4,1,test_dataset,distilbert-sst2,gpt2-774M,train,4,"{'POSITIVE': 2, 'NEGATIVE': 2}","{'POSITIVE': 0.5, 'NEGATIVE': 0.5}",47.2,7.8
5,1,test_dataset,distilbert-sst2,gpt2-774M,train,4,"{'NEGATIVE': 3, 'POSITIVE': 1}","{'NEGATIVE': 0.75, 'POSITIVE': 0.25}",47.2,7.8
6,2,test_dataset,distilbert-sst2_second,gpt-neo-125M,train,4,"{'POSITIVE': 2, 'NEGATIVE': 2}","{'POSITIVE': 0.5, 'NEGATIVE': 0.5}",47.2,7.8
7,2,test_dataset,distilbert-sst2_second,gpt-neo-125M,train,4,"{'NEGATIVE': 3, 'POSITIVE': 1}","{'NEGATIVE': 0.75, 'POSITIVE': 0.25}",47.2,7.8
8,2,test_dataset,distilbert-sst2_second,gpt2-345M,train,4,"{'POSITIVE': 2, 'NEGATIVE': 2}","{'POSITIVE': 0.5, 'NEGATIVE': 0.5}",47.2,7.8
9,2,test_dataset,distilbert-sst2_second,gpt2-345M,train,4,"{'NEGATIVE': 3, 'POSITIVE': 1}","{'NEGATIVE': 0.75, 'POSITIVE': 0.25}",47.2,7.8
