In [1]:
from datetime import datetime
import json
from os import listdir
from os.path import exists

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)

  from tqdm.autonotebook import tqdm, trange


## Load model

In [2]:
# Hugging Face Hub id of the causal language model used for all generations below.
model_id = "microsoft/Phi-3-mini-4k-instruct"

# Quantization configuration: load weights in 4-bit NF4 with double quantization,
# and use bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [3]:
# Load the model with the quantization settings in `bnb_config`.
# device_map="auto" delegates device placement to the library.
# NOTE(review): trust_remote_code=True presumably lets code shipped in the model
# repository run locally -- confirm this is still required for `model_id`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Load the tokenizer that matches `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Show which device the model ended up on (last expression = cell output).
model.device

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.01s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


device(type='cuda', index=0)

## Create text to be continued by the model

In [6]:
# Chat layout for Phi-3: a system turn, a user turn holding {prompt}, and an
# assistant turn pre-filled with {start} so the model continues that text.
phi3_template = """
<|system|>
You are a helpful assistant.<|end|>
<|user|>
{prompt}<|end|>
<|assistant|>
{start}
"""

# User instruction. It is written over several lines for readability; the
# newlines are stripped afterwards, and each continuation line carries its own
# leading space so the sentences stay separated.
prompt = """
Write a short blog post about a recipe and the inspiration behind it.
 Do not include a title.
 Only reveal the dish after the story.
 Start with short story and then move to the recipe.
 To re-iterate, do not include a title.
""".replace("\n", "")


# Opening of the assistant's answer. Source newlines are removed first, then
# every "<br>" marker is turned into a real newline (paragraph break).
start0 = """
Once upon a time, in the heart of Napa Valley, I stumbled upon an old, weathered cookbook in a quaint little antique shop.
 The pages were yellowed with age, and the handwritten notes on the margins told stories of family gatherings and cherished
 memories. One recipe, in particular, caught my eye\u2014a recipe for a dish that seemed to be a family heirloom, passed
 down through generations. It was a recipe for a hearty, comforting meal that promised to warm the soul just as it did the
 hearts of those who shared it in the past.
<br><br>
Inspired by the rich history and sentimental value of the recipe, I decided to recreate it in my own kitchen, infusing it with
 my own touch while honoring its origins. The dish was a
""".replace("\n", "").replace("<br>", "\n")

scenario0 = "old_cookbook"

# Eight scenario keys ("old_cookbook_0" ... "old_cookbook_7") that all share
# the same opening text.
starts = dict.fromkeys((f"{scenario0}_{i}" for i in range(8)), start0)

# Fully formatted model input for every scenario key.
texts = {
    name: phi3_template.format(prompt=prompt, start=opening)
    for name, opening in starts.items()
}

print(texts["old_cookbook_0"])


<|system|>
You are a helpful assistant.<|end|>
<|user|>
Write a short blog post about a recipe and the inspiration behind it. Do not include a title. Only reveal the dish after the story. Start with short story and then move to the recipe. To re-iterate, do not include a title.<|end|>
<|assistant|>
Once upon a time, in the heart of Napa Valley, I stumbled upon an old, weathered cookbook in a quaint little antique shop. The pages were yellowed with age, and the handwritten notes on the margins told stories of family gatherings and cherished memories. One recipe, in particular, caught my eye—a recipe for a dish that seemed to be a family heirloom, passed down through generations. It was a recipe for a hearty, comforting meal that promised to warm the soul just as it did the hearts of those who shared it in the past.

Inspired by the rich history and sentimental value of the recipe, I decided to recreate it in my own kitchen, infusing it with my own touch while honoring its origins. The 

## Generate many samples and save in jsonl

In [7]:
# Generation settings and output location. The timestamp in the file name
# keeps every run of this cell in its own JSONL file.
max_new_tokens = 200
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
filename = f"../results/{current_time}_one_scenario_repeated.jsonl"
n_generations_default = 20

# Open the results file once for the whole run. Append mode creates the file
# when it does not exist, so no separate "create if missing" step is needed.
with open(filename, "a", encoding="utf-8") as file:
    for scenario_name, text in texts.items():
        start = starts[scenario_name]
        for temperature in [0.0, 0.1, 0.2]:
            # Arguments shared by greedy and sampled generation.
            generation_args = {
                "max_new_tokens": max_new_tokens,
                "return_full_text": False,
            }
            if temperature == 0.0:
                # Temperature 0 is run as greedy decoding; the result is
                # deterministic, so a single generation is enough.
                generation_args["do_sample"] = False
                n_generations = 1
            else:
                generation_args["temperature"] = temperature
                generation_args["do_sample"] = True
                n_generations = n_generations_default

            for _ in range(n_generations):
                output = generator(text, **generation_args)

                # One JSON record per generated sample.
                data = {
                    "model": model_id,
                    "temperature": temperature,
                    "max_new_tokens": max_new_tokens,
                    "scenario": scenario_name,
                    "prompt": prompt,
                    "start": start,
                    "output": output[0]["generated_text"],
                }

                file.write(json.dumps(data) + "\n")
                # Flush after every record so samples already generated are
                # on disk even if a later generation fails.
                file.flush()

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


## Load and analyze results from jsonl files

### Load results

In [8]:
# List the entries of ../results, one name per line, to pick a file to analyse.
# Nothing is loaded here; the order is whatever `listdir` returns.
files = listdir("../results")

for file in files:
    print(file)

2024-07-25_10-50-45_one_scenario_repeated.jsonl
2024-07-25_08-45-48_laughing_kids_different_countries.jsonl
2024-07-23_different_stories.jsonl
2024-07-13_18-04-11_agnes_story.jsonl
2024-07-13_08-31_story_creation.jsonl


In [9]:
# Read one JSON-lines results file (one record per line) into a DataFrame.
# NOTE(review): the path is hard-coded rather than taken from `filename` in the
# generation cell -- update it when analysing a different run.
filepath = "../results/2024-07-25_10-50-45_one_scenario_repeated.jsonl"
results_df = pd.read_json(filepath, lines=True)
results_df.head()

Unnamed: 0,model,temperature,max_new_tokens,scenario,prompt,start,output
0,microsoft/Phi-3-mini-4k-instruct,0.0,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...
1,microsoft/Phi-3-mini-4k-instruct,0.1,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...
2,microsoft/Phi-3-mini-4k-instruct,0.1,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...
3,microsoft/Phi-3-mini-4k-instruct,0.1,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...
4,microsoft/Phi-3-mini-4k-instruct,0.1,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...


In [10]:
# Distinct scenario names present in the results; display the first one as a
# quick sanity check.
scenarios = results_df["scenario"].unique()
scenarios[0]

'old_cookbook_0'

### Apply sentence embedding

In [11]:
# Encode every generated output with the "all-MiniLM-L6-v2" sentence-embedding
# model and print the shape of the resulting array.
# NOTE(review): this rebinds `model`, which earlier held the causal LM passed to
# `pipeline(...)`. Re-running the generation cell after this one would hand the
# SentenceTransformer to the pipeline -- consider a distinct name.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(results_df["output"].to_list())
print(embeddings.shape)

(328, 384)


In [12]:
# Attach each output's embedding to its row as a plain Python list, so the
# vectors follow the rows through later filtering. This mutates `results_df`
# in place.
results_df["embeddings"] = embeddings.tolist()
results_df.head()

Unnamed: 0,model,temperature,max_new_tokens,scenario,prompt,start,output,embeddings
0,microsoft/Phi-3-mini-4k-instruct,0.0,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...,"[-0.008801046758890152, -0.009456978179514408,..."
1,microsoft/Phi-3-mini-4k-instruct,0.1,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...,"[0.011522555723786354, -0.025245748460292816, ..."
2,microsoft/Phi-3-mini-4k-instruct,0.1,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...,"[0.004139136057347059, -0.01414896547794342, 0..."
3,microsoft/Phi-3-mini-4k-instruct,0.1,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...,"[0.008915367536246777, -0.015004687011241913, ..."
4,microsoft/Phi-3-mini-4k-instruct,0.1,200,old_cookbook_0,Write a short blog post about a recipe and the...,"Once upon a time, in the heart of Napa Valley,...",\n**Rustic Chicken Pot Pie**\n\nIngredients:\n...,"[0.00576872518286109, -0.015508364886045456, 0..."


In [13]:
def get_embeddings(df):
    """Stack the vectors stored in ``df["embeddings"]`` into one NumPy array.

    Args:
        df: DataFrame with an "embeddings" column holding one sequence of
            numbers per row.

    Returns:
        A new ``numpy.ndarray`` built from the list of per-row vectors; for
        equal-length vectors this is a 2-D array with one row per DataFrame row.
    """
    rows = df["embeddings"].tolist()
    return np.asarray(rows)


# Rebuild the embedding array from the DataFrame column and print its shape as
# a check against the shape printed right after encoding.
embeddings = get_embeddings(results_df)
print(embeddings.shape)

(328, 384)


### Cosine similarity with temperature 0 output

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np


def plot_cdf_cosine_similarity(data: pd.DataFrame, title: str):
    """Plot empirical CDFs of cosine similarity to the temperature-0 output.

    The embedding of the first row with temperature 0 is the reference. Every
    row's embedding is compared with it, and one CDF line is drawn for each
    remaining temperature value.

    Args:
        data: Rows with a "temperature" column and an "embeddings" column.
            The caller's frame is not modified; a copy is used internally.
        title: Figure title.

    Raises:
        IndexError: If ``data`` has no row with temperature 0.
    """
    frame = data.copy()

    # Reference vector as a (1, dim) array, the 2-D shape cosine_similarity expects.
    is_reference = frame["temperature"] == 0
    reference = np.array(frame.loc[is_reference, "embeddings"].values[0])
    reference = reference.reshape(1, -1)
    frame["cosine_sim"] = cosine_similarity(get_embeddings(frame), reference)

    # Sorted unique temperatures; dropping the first entry removes the
    # temperature-0 reference itself.
    sampled_temperatures = np.sort(frame["temperature"].unique())[1:]

    fig = make_subplots(rows=1, cols=1)

    # One empirical-CDF line per sampled temperature.
    for temp in sampled_temperatures:
        at_temp = frame["temperature"] == temp
        similarities = np.sort(frame.loc[at_temp, "cosine_sim"])

        # The i-th smallest value gets cumulative probability i / n.
        n_points = len(similarities)
        cumulative = np.arange(1, n_points + 1) / n_points

        trace = go.Scatter(x=similarities, y=cumulative, mode="lines", name=str(temp))
        fig.add_trace(trace, row=1, col=1)

    # Axis labels, legend title and a fixed [0, 1] similarity range.
    fig.update_layout(
        title=title,
        xaxis_title="Cosine Similarity",
        yaxis_title="Cumulative Probability",
        legend_title="Temperature",
        xaxis={"range": [0, 1]},
    )

    fig.show()

In [17]:
# Draw one CDF figure per scenario, using only that scenario's rows.
for scenario in scenarios:
    sub_data = results_df.loc[results_df["scenario"] == scenario]
    plot_cdf_cosine_similarity(
        data=sub_data,
        title=f"CDF of Cosine Similarity with Temperature 0 output for {scenario}",
    )

### Rule-based stats / evaluations

In [18]:
# Case-insensitive keyword checks on every generated output (all scenarios in
# `results_df`), followed by the share of outputs matching each keyword per
# (temperature, scenario) group.
flag_patterns = {
    "contains_rustic_chicken_pot_pie": "rustic chicken pot pie",
    "contains_chicken": "chicken",
    "contains_beef": "beef",
}

# Work on a copy so `results_df` keeps its original columns.
df_subset = results_df.copy()
for flag_column, pattern in flag_patterns.items():
    df_subset[flag_column] = df_subset["output"].str.contains(pattern, case=False)

# The mean of a boolean column is the fraction of rows where it is True.
df_subset.groupby(["temperature", "scenario"])[list(flag_patterns)].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,contains_rustic_chicken_pot_pie,contains_chicken,contains_beef
temperature,scenario,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,old_cookbook_0,1.0,1.0,0.0
0.0,old_cookbook_1,1.0,1.0,0.0
0.0,old_cookbook_2,1.0,1.0,0.0
0.0,old_cookbook_3,1.0,1.0,0.0
0.0,old_cookbook_4,1.0,1.0,0.0
0.0,old_cookbook_5,1.0,1.0,0.0
0.0,old_cookbook_6,1.0,1.0,0.0
0.0,old_cookbook_7,1.0,1.0,0.0
0.1,old_cookbook_0,0.95,0.95,0.05
0.1,old_cookbook_1,1.0,1.0,0.0
