In [None]:
from datetime import datetime
import json
from os import listdir

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)

## Utils

In [None]:
def create_jsonl_file(data: list[dict], file: str) -> None:
    """Serialize ``data`` to ``file`` in JSON Lines format.

    Every dictionary is encoded with ``json.dumps`` and written on its own
    newline-terminated line. The file is opened in write mode, so existing
    content at ``file`` is replaced.

    Args:
        data: Records to write, one per output line.
        file: Path of the file to create.
    """
    with open(file, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in data)


def load_jsonl_file(file: str) -> list[dict]:
    """Read a JSON Lines file into a list of parsed records.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping (for example a trailing blank line) are skipped rather
    than handed to ``json.loads``, which would raise on an empty string.

    Args:
        file: Path of the JSON Lines file to read.

    Returns:
        One parsed object per non-blank line, in file order. An empty file
        yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not rely on the platform's
    # default encoding when reading it back.
    with open(file, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in map(str.strip, f) if line]

## Load model

In [None]:
model_id = "microsoft/Phi-3-mini-4k-instruct"

# Tokenizer for the same checkpoint as the model below.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# bitsandbytes settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Quantized causal LM; device placement is delegated via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Display the loaded model's `device` attribute (the model was loaded with
# device_map="auto", so this shows where it ended up).
model.device

In [None]:
# Chat-style prompt skeleton with two placeholders: `{prompt}` goes in the
# user turn and `{start}` is placed right after the `<|assistant|>` tag.
# NOTE(review): the literal begins and ends with a newline, so the formatted
# text starts with "\n" and has a "\n" after the `{start}` text — confirm
# this is intended when `start` is meant to be continued mid-sentence.
phi3_template = """
<|system|>
You are a helpful assistant.<|end|>
<|user|>
{prompt}<|end|>
<|assistant|>
{start}
"""

# User instruction. It is written over several lines for readability; the
# `.replace("\n", "")` call removes every newline so it becomes one line
# (the continuation lines begin with a space to keep the sentences apart).
prompt = """
Write a short blog post about a recipe and the inspiration behind it.
 Do not include a title.
 Only reveal the dish after the story.
 Start with short story and then move to the recipe.
 To re-iterate, do not include a title.
""".replace(
    "\n", ""
)

# Text substituted for `{start}`. All literal newlines are stripped first and
# then each `<br>` marker becomes a real newline, so the only line breaks left
# are the blank lines produced by `<br><br>`.
start = """
" Once upon a time, in a quaint little village nestled between rolling hills and verdant fields,
 there lived an elderly woman named Agnes. Agnes was known for her warm smile and her legendary
 Sunday dinners that brought the entire neighborhood together. Her recipes were family heirlooms,
 passed down through generations, with each family adding their own touch to the final dish.
<br><br>
One crisp autumn evening, Agnes was reminiscing about her childhood, and how her grandmother used
 to gather everyone around the dinner table, sharing stories and laughter. These were the moments
 that shaped her, the memories that she passed on to her own children and grandchildren.
<br><br>
Inspired by her grandmother's legacy, Agnes decided to create a new dish that would encapsulate
 the essence of those cherished gatherings. She wanted something that was comforting and nourishing,
 a dish that could be prepared with love and shared with others. After days of experimentation, she
 finally created a recipe that she believed truly captured the spirit of her family's Sunday dinners.
<br><br>
**Agnes's
""".replace(
    "\n", ""
).replace(
    "<br>", "\n"
)

# Alternative: uncomment to leave the `{start}` slot empty.
# start = ""

# Fill both placeholders to build the full prompt string.
text = phi3_template.format(
    prompt=prompt,
    start=start,
)

# Print the assembled prompt so it can be inspected before generating.
print(text)

In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Sampling settings for one trial generation at a low temperature.
temperature = 0.2
generation_args = dict(
    max_new_tokens=200,
    return_full_text=False,
    temperature=temperature,
    do_sample=True,
)

# Run the pipeline once on the assembled prompt.
output = generator(text, **generation_args)

In [None]:
# Show the "generated_text" field of the first (index 0) pipeline result.
print(output[0]["generated_text"])

In [None]:
# Sweep the sampling temperature and record every generation of this run in a
# single JSON Lines file (one record per line). That is the format the results
# loading cell reads with `pd.read_json(..., lines=True)`.
max_new_tokens = 200
temperatures = [0.2, 0.4, 0.6, 0.8, 1.0]
samples_per_temperature = 20
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# One timestamp per run, not per generation: a per-generation file name with
# one-second resolution can collide and silently overwrite an earlier result.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
path_to_export = f"../results/{current_time}_agnes_story.jsonl"

with open(path_to_export, "w") as file:
    for temperature in temperatures:
        generation_args = {
            "max_new_tokens": max_new_tokens,
            "return_full_text": False,
            "temperature": temperature,
            "do_sample": True,
        }
        for _ in range(samples_per_temperature):
            output = generator(text, **generation_args)

            # Store the output together with the choices that produced it.
            data = {
                "model": model_id,
                "temperature": temperature,
                "max_new_tokens": max_new_tokens,
                "prompt": prompt,
                "start": start,
                "output": output[0]["generated_text"],
            }

            # Write and flush each record immediately so that an interrupted
            # run keeps every generation completed so far.
            file.write(json.dumps(data) + "\n")
            file.flush()

## Load results from JSON files

In [None]:
# List the contents of the results directory. `listdir` returns entries in
# arbitrary order, so sort them: the exported files are prefixed with a
# YYYY-MM-DD_HH-MM-SS timestamp, which makes sorted order chronological and
# the listing identical from run to run.
files = sorted(listdir("../results"))

for file in files:
    print(file)

In [None]:
# Read one run's export; `lines=True` parses each line of the file as a
# separate JSON record, giving one DataFrame row per record.
filepath = "../results/2024-07-13_18-04-11_agnes_story.jsonl"
results_df = pd.read_json(path_or_buf=filepath, lines=True)
results_df.head()

In [None]:
# For every stored generation, print its temperature followed by a preview:
# the output with newlines removed, cut to the first 60 characters.
for sample_temperature, sample_output in zip(
    results_df["temperature"], results_df["output"]
):
    print(f"Temperature: {sample_temperature}")
    print(sample_output.replace("\n", "")[:60])

In [None]:
# Give the sentence-embedding model its own name. Rebinding `model` here would
# overwrite the language model that the generation cells pass to
# `pipeline(...)`, so re-running those cells after this one would pick up the
# wrong object.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every stored output text and report the shape of the result.
embeddings = embedding_model.encode(results_df["output"].to_list())
print(embeddings.shape)

# # Calculate the embedding similarities
# similarities = embedding_model.similarity(embeddings, embeddings)
# print(similarities)

In [None]:
# Keep each text's embedding next to its row, stored as a plain Python list.
results_df = results_df.assign(embeddings=embeddings.tolist())
results_df.head()

In [None]:
# Reduce the sentence embeddings to two principal components with
# scikit-learn's PCA, so each generation can be drawn as a point in 2-D.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings)
# Expected to print (number of embedded texts, 2) — TODO confirm.
print(pca_result.shape)

In [None]:
# Interactive plotly scatter of the 2-D PCA projection, one point per stored
# generation, colored by the sampling temperature.
import plotly.express as px

# Add the two PCA coordinates as columns so plotly can reference them by name.
results_df = results_df.assign(
    pca1=pca_result[:, 0],
    pca2=pca_result[:, 1],
)

fig = px.scatter(
    data_frame=results_df,
    x="pca1",
    y="pca2",
    color="temperature",
    title="PCA of Agnes Story Embeddings",
    hover_data=dict(pca1=False, pca2=False),
)

fig.show()