In [2]:
from datetime import datetime
import json
import os
from os import listdir
from os.path import exists

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)

  from tqdm.autonotebook import tqdm, trange


## Utils

In [3]:
def create_jsonl_file(data: list[dict], file: str) -> None:
    """Write ``data`` to ``file`` in JSON Lines format.

    Each item is serialized with ``json.dumps`` and written on its own line.
    The file is opened in write mode, so existing content is replaced.

    Args:
        data: Records to serialize, one per output line.
        file: Path of the file to write.
    """
    with open(file, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in data)


def load_jsonl_file(file: str) -> list[dict]:
    """Read a JSON Lines file and return its records in file order.

    Blank and whitespace-only lines are skipped, so a trailing newline or an
    accidental empty line does not abort the load.

    Args:
        file: Path of the JSON Lines file to read.

    Returns:
        One parsed JSON value per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON Lines files are UTF-8 by convention; do not rely on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]

## Load model

In [4]:
# Hugging Face Hub id of the checkpoint used for every generation below.
model_id = "microsoft/Phi-3-mini-4k-instruct"

# Quantization configuration: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
quantization_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_kwargs)

In [5]:
# Load the tokenizer that matches the checkpoint.
# NOTE(review): trust_remote_code=True lets code shipped in the model repo run
# locally -- confirm this is still required for the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Load the model with the quantization settings defined above and let
# device_map="auto" decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Display the device the model ended up on (last expression -> cell output).
model.device

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.41s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


device(type='cuda', index=0)

## Create text to be continued by the model

In [6]:
# Phi-3 chat layout with two slots: the user prompt and a pre-written opening
# of the assistant turn that the model is asked to continue.
# NOTE(review): the template begins with a newline and places a newline after
# {start}, so the text sent to the model ends in "\n" -- confirm this is intended.
phi3_template = (
    "\n"
    "<|system|>\n"
    "You are a helpful assistant.<|end|>\n"
    "<|user|>\n"
    "{prompt}<|end|>\n"
    "<|assistant|>\n"
    "{start}\n"
)

# User instruction, assembled as a single line.
prompt = "".join(
    [
        "Write a short blog post about a recipe and the inspiration behind it.",
        " Do not include a title.",
        " Only reveal the dish after the story.",
        " Start with short story and then move to the recipe.",
        " To re-iterate, do not include a title.",
    ]
)

# Pre-written beginning of the assistant's answer. Paragraphs are separated by
# a blank line; the text stops right before the dish name.
story_paragraphs = [
    (
        '" Once upon a time, in a quaint little village nestled between rolling hills and verdant fields,'
        " there lived an elderly woman named Agnes. Agnes was known for her warm smile and her legendary"
        " Sunday dinners that brought the entire neighborhood together. Her recipes were family heirlooms,"
        " passed down through generations, with each family adding their own touch to the final dish."
    ),
    (
        "One crisp autumn evening, Agnes was reminiscing about her childhood, and how her grandmother used"
        " to gather everyone around the dinner table, sharing stories and laughter. These were the moments"
        " that shaped her, the memories that she passed on to her own children and grandchildren."
    ),
    (
        "Inspired by her grandmother's legacy, Agnes decided to create a new dish that would encapsulate"
        " the essence of those cherished gatherings. She wanted something that was comforting and nourishing,"
        " a dish that could be prepared with love and shared with others. After days of experimentation, she"
        " finally created a recipe that she believed truly captured the spirit of her family's Sunday dinners."
    ),
    "**Agnes's",
]
start = "\n\n".join(story_paragraphs)

# Set start to an empty string instead to let the model write the whole answer.

text = phi3_template.format(start=start, prompt=prompt)

print(text)


<|system|>
You are a helpful assistant.<|end|>
<|user|>
Write a short blog post about a recipe and the inspiration behind it. Do not include a title. Only reveal the dish after the story. Start with short story and then move to the recipe. To re-iterate, do not include a title.<|end|>
<|assistant|>
" Once upon a time, in a quaint little village nestled between rolling hills and verdant fields, there lived an elderly woman named Agnes. Agnes was known for her warm smile and her legendary Sunday dinners that brought the entire neighborhood together. Her recipes were family heirlooms, passed down through generations, with each family adding their own touch to the final dish.

One crisp autumn evening, Agnes was reminiscing about her childhood, and how her grandmother used to gather everyone around the dinner table, sharing stories and laughter. These were the moments that shaped her, the memories that she passed on to her own children and grandchildren.

Inspired by her grandmother's leg

## Test by generating once

In [None]:
# One-off smoke test: build the text-generation pipeline and sample a single
# continuation of `text` at a low temperature.
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

temperature = 0.2

generation_args = dict(
    max_new_tokens=200,
    return_full_text=False,  # return only the continuation, not the prompt
    temperature=temperature,
    do_sample=True,
)

output = generator(text, **generation_args)

In [None]:
# Show the continuation produced by the single test generation.
generated_text = output[0]["generated_text"]
print(generated_text)

## Generate many samples and save in jsonl

In [8]:
max_new_tokens = 200
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Use the timestamped name to start a fresh results file; the fixed name below
# keeps appending to the existing run instead.
# filename = f"../results/{current_time}_agnes_story.jsonl"
filename = "../results/2024-07-13_18-04-11_agnes_story.jsonl"

# Append mode creates the file on first write but not its parent directory,
# so make sure the results directory exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Arguments shared by greedy and sampled decoding.
base_generation_args = {
    "max_new_tokens": max_new_tokens,
    "return_full_text": False,
}

for temperature in [2.0]:
    if temperature == 0:
        # Greedy decoding: a single generation is requested.
        generation_args = {**base_generation_args, "do_sample": False}
        n_generations = 1
    else:
        generation_args = {
            **base_generation_args,
            "temperature": temperature,
            "do_sample": True,
        }
        n_generations = 20
    for _ in range(n_generations):
        output = generator(text, **generation_args)

        data = {
            "model": model_id,
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
            "prompt": prompt,
            "start": start,
            "output": output[0]["generated_text"],
        }

        # Append after every sample so finished generations survive an
        # interrupted run.
        with open(filename, "a", encoding="utf-8") as file:
            file.write(json.dumps(data) + "\n")

## Load and analyze results from jsonl files

### Load results

In [9]:
# load results from json files
files = listdir("../results")

for file in files:
    print(file)

2024-07-13_18-04-11_agnes_story.jsonl
2024-07-13_08-31_story_creation.jsonl


In [10]:
# Read one JSON Lines results file: each line becomes one DataFrame row.
filepath = "../results/2024-07-13_18-04-11_agnes_story.jsonl"
results_df = pd.read_json(path_or_buf=filepath, lines=True)

# Preview the first rows.
results_df.head(5)

Unnamed: 0,model,temperature,max_new_tokens,prompt,start,output
0,microsoft/Phi-3-mini-4k-instruct,0.8,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",\nHeritage Roast Chicken with Root Vegetables*...
1,microsoft/Phi-3-mini-4k-instruct,0.4,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",\nComforting Chicken and Vegetable Stew**\n\nI...
2,microsoft/Phi-3-mini-4k-instruct,0.2,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",\nAgnes's Sunday Dinner Delight**\n\nIngredien...
3,microsoft/Phi-3-mini-4k-instruct,1.0,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",Golden Mashed Sweet Potatoes**
4,microsoft/Phi-3-mini-4k-instruct,0.2,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",\nAgnes's Sunday Dinner: Roasted Chicken with ...


### Apply sentence embedding

In [11]:
# Use a dedicated name for the sentence encoder: `model` already refers to the
# causal LM used by the generation cells, and rebinding it here would break
# those cells if they are re-run after this one.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every generated output; one embedding row per row of results_df.
embeddings = embedding_model.encode(results_df["output"].to_list())
print(embeddings.shape)

(236, 384)


In [12]:
# Store each row's embedding next to its text, converted to plain Python lists.
embedding_lists = embeddings.tolist()
results_df["embeddings"] = embedding_lists

results_df.head()

Unnamed: 0,model,temperature,max_new_tokens,prompt,start,output,embeddings
0,microsoft/Phi-3-mini-4k-instruct,0.8,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",\nHeritage Roast Chicken with Root Vegetables*...,"[-0.0034415950067341328, -0.023009391501545906..."
1,microsoft/Phi-3-mini-4k-instruct,0.4,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",\nComforting Chicken and Vegetable Stew**\n\nI...,"[-0.026234900578856468, -0.006930043920874596,..."
2,microsoft/Phi-3-mini-4k-instruct,0.2,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",\nAgnes's Sunday Dinner Delight**\n\nIngredien...,"[0.0003854652459267527, 0.025976361706852913, ..."
3,microsoft/Phi-3-mini-4k-instruct,1.0,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",Golden Mashed Sweet Potatoes**,"[0.017272798344492912, -0.016275012865662575, ..."
4,microsoft/Phi-3-mini-4k-instruct,0.2,200,Write a short blog post about a recipe and the...,""" Once upon a time, in a quaint little village...",\nAgnes's Sunday Dinner: Roasted Chicken with ...,"[-0.018565285950899124, 0.038250308483839035, ..."


### Carry out dimension reduction and visualize

In [18]:
# carry out pca on embeddings using sklearn
reducer = PCA(n_components=2)
embeddings_reduced = reducer.fit_transform(embeddings)
print(embeddings_reduced.shape)

# visualize the embeddings using plotly
import plotly.express as px

results_df["dim1"] = embeddings_reduced[:, 1]
results_df["dim2"] = embeddings_reduced[:, 0]

results_df["truncated_output"] = results_df["output"].str[:50]

fig = px.scatter(
    results_df,
    x="dim1",
    y="dim2",
    hover_data=["temperature", "truncated_output"],
    title="Dim reduction of Agnes Story Embeddings",
    color="temperature",
)

fig.update_traces(
    hovertemplate="<br>".join(
        ["Temperature: %{customdata[0]}", "Output: %{customdata[1]}"]
    )
)

fig.show()

(236, 2)


### Cosine similarity with temperature 0 output

In [22]:
# compute the cosine similarity of embedding for zero temperature with others
from sklearn.metrics.pairwise import cosine_similarity
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np

zero_embedding = results_df.loc[results_df["temperature"] == 0, "embeddings"].values[0]
zero_embedding = np.array(zero_embedding).reshape(1, -1)
results_df["cosine_sim"] = cosine_similarity(embeddings, zero_embedding)

# Get unique temps
# use [1:] to exclude 0 temperature
temperatures = np.sort(results_df["temperature"].unique())[1:]

# Create subplot
fig = make_subplots(rows=1, cols=1)

# Add a CDF trace for each category
for temp in temperatures:
    # Filter data for the current category
    subset = results_df.loc[results_df["temperature"] == temp, "cosine_sim"]

    # Sort the data and calculate CDF
    sorted_data = np.sort(subset)
    cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)

    # Add trace
    fig.add_trace(
        go.Scatter(x=sorted_data, y=cdf, mode="lines", name=str(temp)), row=1, col=1
    )

# Update layout
fig.update_layout(
    title="CDF of Cosine Similarity with 0 Temperature",
    xaxis_title="Cosine Similarity",
    yaxis_title="Cumulative Probability",
    legend_title="Temperature",
)

# Show the plot
fig.show()

### Rule-based stats / evaluations

In [23]:
# create column checking if 'chicken' is in the output
results_df["contains_chicken"] = results_df["output"].str.contains(
    "chicken", case=False
)

results_df.groupby("temperature").agg({"contains_chicken": "mean"})

Unnamed: 0_level_0,contains_chicken
temperature,Unnamed: 1_level_1
0.0,0.0
0.1,0.65
0.2,0.7
0.3,0.7
0.4,0.8
0.5,0.75
0.6,0.631579
0.7,0.5
0.8,0.5625
0.9,0.4
