In [38]:
import os
from getpass import getpass

import openai

# Never commit API keys to a notebook. The key is taken from the OPENAI_API_KEY
# environment variable; if that is unset (or empty) the user is prompted for it
# without echoing the input.
# NOTE(review): the literal key that used to be assigned here has been exposed
# in the notebook's history and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
from typing import Any, Optional, List, Union, Dict
from babydragon.memory.frames.memory_frame import MemoryFrame
from babydragon.memory.threads.base_thread import BaseThread
from babydragon.bd_types import infer_embeddable_type
from babydragon.models.embedders.ada2 import OpenAiEmbedder
from babydragon.models.embedders.cohere import CohereEmbedder
from babydragon.models.generators.PolarsGenerator import PolarsGenerator
import json
import polars as pl
from datasets import load_dataset

class ChatFrame(BaseThread):
    """Chat thread with tokenisation, embedding, generation and search helpers.

    Every method operates on ``self.memory_thread``, a polars DataFrame, and
    most of them rebind that attribute to a new frame with extra columns.
    NOTE(review): ``memory_thread`` and ``tokenizer`` are not assigned in this
    class; presumably ``BaseThread.__init__`` creates them - confirm.
    """

    def __init__(self, name: str = "chat_frame",
                 context_columns: List[str] = [],
                 embeddable_columns: List[str] = ['content'],
                 embedding_columns: List[str] = [],
                 text_embedder: Optional[Union[OpenAiEmbedder, CohereEmbedder]] = OpenAiEmbedder,
                 markdown: str = "text/markdown",
                 max_memory: int | None = None,
                 tokenizer: Any | None = None,
                 save_path: str = 'threads') -> None:
        """Initialise the underlying thread and remember the column configuration.

        Args:
            name: Forwarded to ``BaseThread.__init__``.
            context_columns: Stored on the instance (copied).
            embeddable_columns: Stored on the instance (copied).
            embedding_columns: Stored on the instance (copied).
            text_embedder: Stored as given. Note that the default is the
                ``OpenAiEmbedder`` class itself, not an instance.
            markdown: Stored as given.
            max_memory: Forwarded to ``BaseThread.__init__``.
            tokenizer: Forwarded to ``BaseThread.__init__``.
            save_path: Forwarded to ``BaseThread.__init__``.
        """
        BaseThread.__init__(self, name, max_memory, tokenizer, save_path)
        # Copy the lists: the defaults in the signature are single shared
        # objects, so storing them directly would let one instance's mutation
        # leak into every other instance created with the defaults.
        self.context_columns = list(context_columns)
        self.embeddable_columns = list(embeddable_columns)
        self.embedding_columns = list(embedding_columns)
        self.text_embedder = text_embedder
        self.markdown = markdown

    # Dot Product Query
    def search_column_with_dot_product(self, query: List[float], embeddable_column_name: str, top_k: int) -> pl.DataFrame:
        """Return the ``top_k`` rows with the largest dot product against ``query``.

        Args:
            query: The query vector. It is wrapped in ``pl.Series(query)``, so it
                must be a sequence of numbers: a plain string would be taken as
                the series *name* and produce an empty series.
            embeddable_column_name: Source column name; the embeddings are read
                from the column ``'embedding|' + embeddable_column_name``.
            top_k: Maximum number of rows to return.

        Returns:
            ``memory_thread`` plus a ``dot_product`` column, sorted by that
            column in descending order and cut to the first ``top_k`` rows.
            ``self.memory_thread`` itself is not modified.
        """
        embedding_column_name = 'embedding|' + embeddable_column_name

        query_as_series = pl.Series(query)
        # assumes each cell of the embedding column is a list of floats with the
        # same length as `query` - TODO confirm
        dot_product = (
            self.memory_thread[embedding_column_name]
            .list.eval(pl.element().explode().dot(query_as_series), parallel=True)
            .list.first()
            .alias("dot_product")
        )
        dot_product_frame = self.memory_thread.with_columns(dot_product)
        # Sort by dot product and select top_k rows
        return dot_product_frame.sort('dot_product', descending=True).slice(0, top_k)

    # Tokenization
    def tokenize_column(self, column_name: str):
        """Tokenise ``column_name`` and add token and token-count columns.

        Adds ``tokens|<column_name>`` (the output of
        ``self.tokenizer.encode_batch``) and ``tokens_len|<column_name>`` (the
        length of each encoded entry) to ``self.memory_thread``.
        """
        new_values = self.tokenizer.encode_batch(self.memory_thread[column_name].to_list())
        new_series = pl.Series(f'tokens|{column_name}', new_values)
        new_series_len = pl.Series(f'tokens_len|{column_name}', [len(x) for x in new_values])
        self.memory_thread = self.memory_thread.with_columns([new_series, new_series_len])

    def prepare_column_for_embeddings(self, column_name, model_name: str = "text-embedding-ada-002"):
        """Build the two-column request frame used by ``embed_column``.

        Args:
            column_name: Column of ``memory_thread`` holding the text to embed.
            model_name: Value written to every row of the ``model`` column.

        Returns:
            A DataFrame with a constant ``model`` column and an ``input`` column
            that is a copy of ``column_name``.
        """
        df = self.memory_thread.select(column_name).with_columns(pl.lit(model_name).alias("model"))
        input_df = df.with_columns(df[column_name].alias('input')).drop(column_name)

        return input_df

    @staticmethod
    def _load_generator_output(out_path: str) -> pl.DataFrame:
        """Read an ndjson result file into a DataFrame, last record first.

        Blank lines are skipped so a stray empty line does not abort the load
        with a JSON decode error.
        """
        with open(out_path, encoding="utf-8") as f:
            records = [json.loads(line) for line in f if line.strip()]
        # The file is consumed in reverse order.
        # NOTE(review): presumably the generator writes its results last-in
        # first - confirm against PolarsGenerator.
        records.reverse()
        return pl.DataFrame(records)

    def embed_column(self, column, generator_log_name="chat_embedding", model_name: str = "text-embedding-ada-002"):
        """Embed ``column`` through ``PolarsGenerator`` and merge the results.

        Runs a generator named ``f"{generator_log_name}_{model_name}"``, reads
        ``./batch_generator/<that name>.ndjson`` and adds every column of the
        result to ``self.memory_thread``.

        NOTE(review): results are attached by row position; if the generator
        omits failed requests from the file the rows would no longer line up
        with ``memory_thread`` - confirm.
        """
        input_df = self.prepare_column_for_embeddings(column, model_name)
        run_name = f"{generator_log_name}_{model_name}"
        embedder = PolarsGenerator(input_df = input_df, name = run_name)
        embedder.execute()
        out_path = f"./batch_generator/{run_name}.ndjson"
        output = self._load_generator_output(out_path)
        self.memory_thread = self.memory_thread.with_columns(output)

    def convert_column_to_messages(self, column_name, model_name = "gpt-3.5-turbo-16k", system_prompt = "Youre a Helpful Summarizer!"):
        """Add ``model`` and ``messages`` request columns to ``memory_thread``.

        Each ``messages`` entry is a two-element list: a system message holding
        ``system_prompt`` and a user message holding the row's ``column_name``
        value formatted as a string.
        """
        df = self.memory_thread.select(column_name).with_columns(pl.lit(model_name).alias("model"))

        def create_content(value):
            return ([{"role": "system", "content":system_prompt},
                        {"role": "user", "content": f"{value}"}])

        input_df = df.with_columns(df[column_name].apply(create_content, return_dtype=pl.List).alias('messages')).drop(column_name)
        self.memory_thread = self.memory_thread.with_columns(input_df)

    def generate_column(self, column_name, generator_log_name="chat_summary",  model_name = "gpt-3.5-turbo-16k", system_prompt = "Youre a Helpful Summarizer!"):
        """Run a chat completion per row of ``column_name`` and merge the results.

        Builds the request columns with ``convert_column_to_messages``, runs a
        ``PolarsGenerator`` over the whole ``memory_thread``, reads
        ``./batch_generator/{generator_log_name}_output.ndjson`` and adds every
        column of the result to ``self.memory_thread``.

        NOTE(review): as in ``embed_column``, results are attached by row
        position - confirm the generator keeps one output row per input row.
        """
        #TODO: Generate column with OpenAI functionAPI
        self.convert_column_to_messages(column_name = column_name, model_name = model_name, system_prompt = system_prompt)
        generator = PolarsGenerator( input_df = self.memory_thread, name = generator_log_name)
        generator.execute()
        out_path = f"./batch_generator/{generator_log_name}_output.ndjson"
        output = self._load_generator_output(out_path)
        self.memory_thread = self.memory_thread.with_columns(output)




# Notebook-wide ChatFrame instance (all constructor defaults); the cells below
# add messages to it, tokenize, embed and save/load it.
chat_frame = ChatFrame()


In [39]:
# Fetch the OpenAssistant dataset from the Hugging Face hub and keep only its
# 'train' split.
dataset_url = "OpenAssistant/oasst1"
dataset_splits = load_dataset(dataset_url)
dataset = dataset_splits['train']


DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/OpenAssistant/oasst1 HTTP/1.1" 200 3056
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): s3.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/OpenAssistant/oasst1/OpenAssistant/oasst1.py HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/OpenAssistant/oasst1 HTTP/1.1" 200 3056
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/OpenAssistant/oasst1/resolve/fdf72ae0827c1cda404aff25b6603abec9e3399b/README.md HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/OpenAssistant/oasst1/revision/fdf7

In [3]:
# Pull every column of the dataset into a plain {column name: values} mapping.
column_names = dataset.column_names
data = {}
for name in column_names:
    data[name] = dataset[name]

In [4]:
# Build a polars DataFrame from the {column name: values} mapping `data`.
df_polars = pl.DataFrame(data)


In [5]:
# Preview the first rows of the frame before any filtering.
df_polars.head()

message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
str,str,str,str,str,str,str,i64,bool,bool,i64,bool,f32,struct[7],str,str,struct[2],struct[3]
"""6ab24d72-0181-…",,"""c3fe8c76-fc30-…","""2023-02-05T14:…","""Can you write …","""prompter""","""en""",3,True,False,,False,,"{0.000443,0.000033,0.000235,0.000142,0.000395,0.000041,0.000027}","""6ab24d72-0181-…","""ready_for_expo…","{[""+1"", ""_skip_reply"", ""_skip_ranking""],[10, 1, 4]}","{[""spam"", ""lang_mismatch"", … ""violence""],[0.0, 0.0, … 0.0],[3, 3, … 3]}"
"""c8e83833-ecbc-…","""6ab24d72-0181-…","""2c96e467-66f0-…","""2023-02-06T13:…","""""Monopsony"" re…","""assistant""","""en""",3,True,False,0.0,False,,"{0.000264,0.000027,0.000136,0.000093,0.000167,0.000038,0.000025}","""6ab24d72-0181-…","""ready_for_expo…","{[""+1"", ""_skip_labeling""],[3, 1]}","{[""spam"", ""fails_task"", … ""violence""],[0.0, 0.0, … 0.0],[3, 2, … 2]}"
"""6708c47f-05c9-…","""c8e83833-ecbc-…","""2c96e467-66f0-…","""2023-02-06T18:…","""Now explain it…","""prompter""","""en""",3,True,False,,False,,"{0.036485,0.000055,0.000376,0.000242,0.013612,0.001708,0.000102}","""6ab24d72-0181-…","""ready_for_expo…","{null,null}","{[""spam"", ""lang_mismatch"", … ""violence""],[0.0, 0.0, … 0.0],[3, 3, … 2]}"
"""343ee2d4-87ae-…","""6ab24d72-0181-…","""49ddcb0d-6588-…","""2023-02-06T13:…","""Monopsony is a…","""assistant""","""en""",3,True,False,1.0,False,,"{0.000887,0.00008,0.000347,0.000446,0.000511,0.000088,0.000057}","""6ab24d72-0181-…","""ready_for_expo…","{[""+1"", ""_skip_reply"", ""_skip_labeling""],[2, 1, 2]}","{[""spam"", ""fails_task"", … ""violence""],[0.0, 0.0, … 0.166667],[3, 3, … 3]}"
"""18145bf4-37fd-…","""343ee2d4-87ae-…","""e10e99a0-38ac-…","""2023-02-06T18:…","""How can one fi…","""prompter""","""en""",3,True,False,,False,,"{0.000936,0.000067,0.000416,0.00018,0.000344,0.00024,0.000064}","""6ab24d72-0181-…","""ready_for_expo…","{[""+1""],[1]}","{[""spam"", ""lang_mismatch"", … ""violence""],[0.0, 0.0, … 0.0],[3, 3, … 2]}"


In [6]:
# Keep only the rows whose 'lang' value is 'en'; rows in any other language
# are dropped. `df_polars` is rebound to the filtered frame.
df_polars = df_polars.filter(pl.col('lang') == 'en')

In [7]:
# Preview the first rows again, now after the language filter.
df_polars.head()

message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
str,str,str,str,str,str,str,i64,bool,bool,i64,bool,f32,struct[7],str,str,struct[2],struct[3]
"""6ab24d72-0181-…",,"""c3fe8c76-fc30-…","""2023-02-05T14:…","""Can you write …","""prompter""","""en""",3,True,False,,False,,"{0.000443,0.000033,0.000235,0.000142,0.000395,0.000041,0.000027}","""6ab24d72-0181-…","""ready_for_expo…","{[""+1"", ""_skip_reply"", ""_skip_ranking""],[10, 1, 4]}","{[""spam"", ""lang_mismatch"", … ""violence""],[0.0, 0.0, … 0.0],[3, 3, … 3]}"
"""c8e83833-ecbc-…","""6ab24d72-0181-…","""2c96e467-66f0-…","""2023-02-06T13:…","""""Monopsony"" re…","""assistant""","""en""",3,True,False,0.0,False,,"{0.000264,0.000027,0.000136,0.000093,0.000167,0.000038,0.000025}","""6ab24d72-0181-…","""ready_for_expo…","{[""+1"", ""_skip_labeling""],[3, 1]}","{[""spam"", ""fails_task"", … ""violence""],[0.0, 0.0, … 0.0],[3, 2, … 2]}"
"""6708c47f-05c9-…","""c8e83833-ecbc-…","""2c96e467-66f0-…","""2023-02-06T18:…","""Now explain it…","""prompter""","""en""",3,True,False,,False,,"{0.036485,0.000055,0.000376,0.000242,0.013612,0.001708,0.000102}","""6ab24d72-0181-…","""ready_for_expo…","{null,null}","{[""spam"", ""lang_mismatch"", … ""violence""],[0.0, 0.0, … 0.0],[3, 3, … 2]}"
"""343ee2d4-87ae-…","""6ab24d72-0181-…","""49ddcb0d-6588-…","""2023-02-06T13:…","""Monopsony is a…","""assistant""","""en""",3,True,False,1.0,False,,"{0.000887,0.00008,0.000347,0.000446,0.000511,0.000088,0.000057}","""6ab24d72-0181-…","""ready_for_expo…","{[""+1"", ""_skip_reply"", ""_skip_labeling""],[2, 1, 2]}","{[""spam"", ""fails_task"", … ""violence""],[0.0, 0.0, … 0.166667],[3, 3, … 3]}"
"""18145bf4-37fd-…","""343ee2d4-87ae-…","""e10e99a0-38ac-…","""2023-02-06T18:…","""How can one fi…","""prompter""","""en""",3,True,False,,False,,"{0.000936,0.000067,0.000416,0.00018,0.000344,0.00024,0.000064}","""6ab24d72-0181-…","""ready_for_expo…","{[""+1""],[1]}","{[""spam"", ""lang_mismatch"", … ""violence""],[0.0, 0.0, … 0.0],[3, 3, … 2]}"


In [8]:
import polars as pl
import pandas as pd  # Used for timestamp conversion

# Input: `df_polars`, still carrying the dataset's original column names.
# NOTE: the cell rebinds `df_polars`, so running it a second time fails because
# the source columns ("text", "created_date", ...) are gone after the first run.

# Step 1: keep only the mapped columns, under their new names.
column_mapping = {
    "message_id": "message_id",
    "parent_id": "parent_id",
    "role": "role",
    "text": "content",
    "created_date": "timestamp",
    "message_tree_id": "conversation_id",
}
renamed_columns = []
for old, new in column_mapping.items():
    renamed_columns.append(pl.col(old).alias(new))
df_polars = df_polars.select(renamed_columns)


def _to_epoch_seconds(x):
    """Parse a timestamp value with pandas and return POSIX seconds as a float."""
    return float(pd.Timestamp(x).timestamp())


def _count_words(x):
    """Number of whitespace-separated pieces in the text."""
    return len(x.split())


# Step 2: convert the timestamp column to float seconds and add `tokens_count`,
# which is a whitespace word count rather than a tokenizer count.
df_polars = df_polars.with_columns([
    pl.col("timestamp").apply(_to_epoch_seconds, return_dtype=pl.Float64),
    pl.col("content").apply(_count_words, return_dtype=pl.UInt16).alias("tokens_count"),
])



In [9]:
# Preview the renamed and transformed frame.
df_polars.head()

message_id,parent_id,role,content,timestamp,conversation_id,tokens_count
str,str,str,str,f64,str,u16
"""6ab24d72-0181-…",,"""prompter""","""Can you write …",1675600000.0,"""6ab24d72-0181-…",30
"""c8e83833-ecbc-…","""6ab24d72-0181-…","""assistant""","""""Monopsony"" re…",1675700000.0,"""6ab24d72-0181-…",223
"""6708c47f-05c9-…","""c8e83833-ecbc-…","""prompter""","""Now explain it…",1675700000.0,"""6ab24d72-0181-…",6
"""343ee2d4-87ae-…","""6ab24d72-0181-…","""assistant""","""Monopsony is a…",1675700000.0,"""6ab24d72-0181-…",159
"""18145bf4-37fd-…","""343ee2d4-87ae-…","""prompter""","""How can one fi…",1675700000.0,"""6ab24d72-0181-…",11


In [19]:
# Group the messages by conversation. Despite the name, `conv_df` is a GroupBy
# object that is iterated as (key, sub-frame) pairs further down.
# maintain_order=True makes the groups come out in first-appearance order, so
# "the first N conversations" is the same set on every run.
conv_df = df_polars.groupby("conversation_id", maintain_order=True)

In [35]:
# Number of conversations copied into `chat_frame` (the original cutoff was `i > 5`).
MAX_CONVERSATIONS = 6

for i, (conv_id, conv) in enumerate(conv_df):
    if i >= MAX_CONVERSATIONS:
        break

    # Parent id of the most recently handled message. It is reset for every
    # conversation so it is always bound and never carries a value over from
    # the previous conversation.
    curr_parent_id = None

    for msg_dict in conv.to_dicts():
        # A message without a parent is the root of the conversation.
        if msg_dict['parent_id'] is None:
            curr_parent_id = msg_dict['message_id']

        # Skip a message that has the same parent as the previous one, so only
        # the first of several consecutive replies to a message is kept.
        if curr_parent_id == msg_dict['parent_id']:
            continue
        curr_parent_id = msg_dict['parent_id']

        message = {"role": str(msg_dict['role']), "content": str(msg_dict['content'])}
        chat_frame.add_dict_to_thread(message)















In [15]:
# Adds the 'tokens|content' and 'tokens_len|content' columns to chat_frame.memory_thread.
chat_frame.tokenize_column("content")

In [28]:
# Display the thread's backing DataFrame (the whole frame, not a .head() preview).
chat_frame.memory_thread

conversation_id,message_id,parent_id,role,content,timestamp,tokens_count
str,str,str,str,str,f64,u16
"""8b9bafff-b498-…","""ba28d280-5512-…",,"""prompter""","""I have a funct…",1.6939e9,382
"""8b9bafff-b498-…","""7f94a560-5da3-…","""ba28d280-5512-…","""assistant""","""Here is your c…",1.6939e9,415
"""8b9bafff-b498-…","""d5858b03-7a17-…","""7f94a560-5da3-…","""prompter""","""my sister look…",1.6939e9,30
"""8b9bafff-b498-…","""cdcac8f7-c3c5-…","""d5858b03-7a17-…","""assistant""","""This code is t…",1.6939e9,163
"""8b9bafff-b498-…","""0fbd253d-1255-…","""cdcac8f7-c3c5-…","""assistant""","""I have a funct…",1.6939e9,382
"""8b9bafff-b498-…","""3ab44690-ff9c-…","""0fbd253d-1255-…","""prompter""","""You have not f…",1.6939e9,68
"""8b9bafff-b498-…","""f37ba6a4-f2c9-…","""3ab44690-ff9c-…","""assistant""","""Here you go: `…",1.6939e9,373
"""8b9bafff-b498-…","""6f09109c-cbf5-…","""f37ba6a4-f2c9-…","""assistant""","""I have a funct…",1.6939e9,657
"""8b9bafff-b498-…","""8bf03a75-74d1-…","""6f09109c-cbf5-…","""prompter""","""Can you answer…",1.6939e9,31
"""8b9bafff-b498-…","""8c2b7244-14d3-…","""8bf03a75-74d1-…","""prompter""","""Can you create…",1.6939e9,35


In [None]:
# Load the thread from the parquet file written by the save cells of this notebook.
# NOTE(review): `load` is not defined in ChatFrame; presumably it comes from
# BaseThread and replaces memory_thread - confirm. The file only exists after a
# save cell has run at least once.
chat_frame.load(path="./threads/OA.parquet")

In [41]:
## Workaround: nest_asyncio.apply() is called so that the asyncio-based
## functions used below can run inside a Jupyter notebook.

import nest_asyncio
nest_asyncio.apply()
chat_frame.embed_column('content')

DEBUG:root:Logging initialized at level 10
DEBUG:root:Initialization complete.
DEBUG:root:Entering main loop
INFO:root:Next request is 0 of 464
INFO:root:Calling Api for 0...
INFO:root:Next request is 1 of 464
INFO:root:Calling Api for 1...
INFO:root:Next request is 2 of 464
INFO:root:Calling Api for 2...
INFO:root:Next request is 3 of 464
INFO:root:Calling Api for 3...
INFO:root:Next request is 4 of 464
INFO:root:Calling Api for 4...
INFO:root:Next request is 5 of 464
INFO:root:Calling Api for 5...
INFO:root:Next request is 6 of 464
INFO:root:Calling Api for 6...
INFO:root:Next request is 7 of 464
INFO:root:Calling Api for 7...
INFO:root:Next request is 8 of 464
INFO:root:Calling Api for 8...
INFO:root:Next request is 9 of 464
INFO:root:Calling Api for 9...
INFO:root:Next request is 10 of 464
INFO:root:Calling Api for 10...
INFO:root:Next request is 11 of 464
INFO:root:Calling Api for 11...
INFO:root:Next request is 12 of 464
INFO:root:Calling Api for 12...
INFO:root:Next request is 1

shape: (1, 6)
┌────────────────┬────────────────┬────────────────┬───────────────┬───────────────┬───────────────┐
│ name           ┆ num_rate_limit ┆ num_overloaded ┆ num_tasks_sta ┆ num_api_error ┆ num_other_err │
│ ---            ┆ _errors        ┆ _errors        ┆ rted          ┆ s             ┆ ors           │
│ str            ┆ ---            ┆ ---            ┆ ---           ┆ ---           ┆ ---           │
│                ┆ i64            ┆ i64            ┆ i64           ┆ i64           ┆ i64           │
╞════════════════╪════════════════╪════════════════╪═══════════════╪═══════════════╪═══════════════╡
│ chat_embedding ┆ 0              ┆ 0              ┆ 464           ┆ 0             ┆ 2             │
│ _text-embeddin ┆                ┆                ┆               ┆               ┆               │
│ g-ad…          ┆                ┆                ┆               ┆               ┆               │
└────────────────┴────────────────┴────────────────┴───────────────┴─────────

In [43]:
# Save the thread to parquet after the embedding step.
# NOTE(review): `save` is not defined in ChatFrame; presumably inherited from BaseThread - confirm.
chat_frame.save(path="./threads/OA.parquet")

In [32]:
import polars as pl
import os
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import matplotlib.dates as mdates
from babydragon.models.generators.PolarsGenerator import PolarsGenerator


# Never commit API keys to a notebook. OPENAI_API_KEY is expected to be set in
# the environment already; only if it is missing or empty is the user prompted
# for it (input is not echoed).
# NOTE(review): the literal key that used to be assigned here has been exposed
# in the notebook's history and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

DEBUG:matplotlib:matplotlib data path: /Users/danielhug/neuraldragon/frames_arc/BabyDragon/notebooks/venv/lib/python3.10/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/Users/danielhug/.matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is darwin
DEBUG:matplotlib:CACHEDIR=/Users/danielhug/.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /Users/danielhug/.matplotlib/fontlist-v330.json


In [81]:
# Display the thread's backing DataFrame before the generation step.
chat_frame.memory_thread

conversation_id,message_id,parent_id,role,content,timestamp,tokens_count,tokens|content,tokens_len|content,embedding|content
str,str,str,str,str,f64,u16,list[i64],i64,list[f64]
"""23591151-afd7-…","""ff2ad8c3-8771-…",,"""prompter""","""Who is MrBeast…",1.6939e9,13,"[15546, 374, … 30]",6,"[-0.005128, -0.023558, … -0.030193]"
"""23591151-afd7-…","""68d34c6a-6e53-…","""ff2ad8c3-8771-…","""assistant""","""Jimmy Donaldso…",1.6939e9,112,"[86755, 9641, … 15]",105,"[-0.032206, -0.020325, … -0.010317]"
"""23591151-afd7-…","""addb2a31-c789-…","""68d34c6a-6e53-…","""prompter""","""Thank you""",1.6939e9,9,"[13359, 499]",2,"[-0.005717, -0.004763, … 0.004741]"
"""23591151-afd7-…","""bdeb7ba3-87be-…","""addb2a31-c789-…","""assistant""","""Glad to help. …",1.6939e9,21,"[38, 18599, … 30]",14,"[0.007404, 0.005515, … -0.006968]"
"""23591151-afd7-…","""6b0ba41b-9454-…","""bdeb7ba3-87be-…","""assistant""","""MrBeast, real …",1.6939e9,98,"[12555, 3513, … 13]",91,"[-0.034663, -0.016623, … -0.027447]"
"""23591151-afd7-…","""50388e0a-1cb6-…","""6b0ba41b-9454-…","""prompter""","""Is MrBeast acc…",1.6939e9,23,"[3957, 4491, … 30]",16,"[-0.016809, -0.009124, … -0.029952]"
"""23591151-afd7-…","""c280ed13-2a79-…","""50388e0a-1cb6-…","""assistant""","""MrBeast might …",1.6939e9,87,"[12555, 3513, … 13]",80,"[-0.012145, -0.009681, … -0.02196]"
"""23591151-afd7-…","""fe7cba4b-6691-…","""c280ed13-2a79-…","""prompter""","""What sort of c…",1.6939e9,18,"[3923, 3460, … 30]",11,"[0.000872, -0.012229, … -0.025525]"
"""23591151-afd7-…","""56f03a37-6cf7-…","""fe7cba4b-6691-…","""prompter""","""Who was the be…",1.6939e9,26,"[15546, 574, … 30]",19,"[-0.010938, -0.006049, … -0.025327]"
"""23591151-afd7-…","""2e93f9e3-7779-…","""56f03a37-6cf7-…","""assistant""","""The title of ""…",1.6939e9,406,"[791, 2316, … 13]",399,"[-0.006996, -0.0061, … -0.009303]"


In [82]:
## Workaround: nest_asyncio.apply() is called so that the asyncio-based
## functions used below can run inside a Jupyter notebook.
import nest_asyncio
nest_asyncio.apply()

# One generation request per row of 'content', using the given system prompt;
# the result columns are merged back into chat_frame.memory_thread.
chat_frame.generate_column(column_name='content', generator_log_name="chat_summary_OA", system_prompt='Please summarize what is going on in this part of the conversation text')

DEBUG:root:Logging initialized at level 10
DEBUG:root:Initialization complete.
DEBUG:root:Entering main loop
INFO:root:Next request is 0 of 464
INFO:root:Calling Api for 0...
INFO:root:Next request is 1 of 464
INFO:root:Calling Api for 1...
INFO:root:Next request is 2 of 464
INFO:root:Calling Api for 2...
INFO:root:Next request is 3 of 464
INFO:root:Calling Api for 3...
INFO:root:Next request is 4 of 464
INFO:root:Calling Api for 4...
INFO:root:Next request is 5 of 464
INFO:root:Calling Api for 5...
INFO:root:Next request is 6 of 464
INFO:root:Calling Api for 6...
INFO:root:Next request is 7 of 464
INFO:root:Calling Api for 7...
INFO:root:Next request is 8 of 464
INFO:root:Calling Api for 8...
INFO:root:Next request is 9 of 464
INFO:root:Calling Api for 9...
INFO:root:Next request is 10 of 464
INFO:root:Calling Api for 10...
INFO:root:Next request is 11 of 464
INFO:root:Calling Api for 11...
INFO:root:Next request is 12 of 464
INFO:root:Calling Api for 12...
INFO:root:Next request is 1

shape: (1, 6)
┌────────────────┬────────────────┬────────────────┬───────────────┬───────────────┬───────────────┐
│ name           ┆ num_rate_limit ┆ num_overloaded ┆ num_tasks_sta ┆ num_api_error ┆ num_other_err │
│ ---            ┆ _errors        ┆ _errors        ┆ rted          ┆ s             ┆ ors           │
│ str            ┆ ---            ┆ ---            ┆ ---           ┆ ---           ┆ ---           │
│                ┆ i64            ┆ i64            ┆ i64           ┆ i64           ┆ i64           │
╞════════════════╪════════════════╪════════════════╪═══════════════╪═══════════════╪═══════════════╡
│ chat_summary_O ┆ 56             ┆ 0              ┆ 644           ┆ 0             ┆ 0             │
│ A              ┆                ┆                ┆               ┆               ┆               │
└────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┘


In [3]:
# Preview the first rows after generate_column has merged its result columns.
chat_frame.memory_thread.head()

conversation_id,message_id,parent_id,role,content,timestamp,tokens_count,tokens|content,tokens_len|content,embedding|content,model,messages,id,start_time,output,prompt_tokens,completion_tokens,total_tokens,end_time,remaining_token_capacity
str,str,str,str,str,f64,u16,list[i64],i64,list[f64],str,list[struct[2]],i64,i64,str,i64,i64,i64,i64,i64
"""23591151-afd7-…","""ff2ad8c3-8771-…",,"""prompter""","""Who is MrBeast…",1693900000.0,13,"[15546, 374, … 30]",6,"[-0.005128, -0.023558, … -0.030193]","""gpt-3.5-turbo-…","[{""system"",""Please summarize what is going on in this part of the conversation text""}, {""user"",""Who is MrBeast?""}]",460,1693854092,"""In this part o…",344,212,556,1693854093,4499
"""23591151-afd7-…","""68d34c6a-6e53-…","""ff2ad8c3-8771-…","""assistant""","""Jimmy Donaldso…",1693900000.0,112,"[86755, 9641, … 15]",105,"[-0.032206, -0.020325, … -0.010317]","""gpt-3.5-turbo-…","[{""system"",""Please summarize what is going on in this part of the conversation text""}, {""user"",""Jimmy Donaldson, better known as MrBeast, is an American YouTuber. He is credited with pioneering a genre of YouTube videos that centers on expensive stunts. As of January 2023, his YouTube channel has reached 130 million subscribers, making it the fourth-most-subscribed on the platform, and the highest as a non-corporate identity. Wikipedia page: https://en.wikipedia.org/wiki/MrBeast His youtube channel: https://www.youtube.com/user/mrbeast6000""}]",454,1693854093,"""The observer p…",39,201,240,1693854093,524
"""23591151-afd7-…","""addb2a31-c789-…","""68d34c6a-6e53-…","""prompter""","""Thank you""",1693900000.0,9,"[13359, 499]",2,"[-0.005717, -0.004763, … 0.004741]","""gpt-3.5-turbo-…","[{""system"",""Please summarize what is going on in this part of the conversation text""}, {""user"",""Thank you""}]",448,1693854091,"""In this part o…",297,252,549,1693854091,7741
"""23591151-afd7-…","""bdeb7ba3-87be-…","""addb2a31-c789-…","""assistant""","""Glad to help. …",1693900000.0,21,"[38, 18599, … 30]",14,"[0.007404, 0.005515, … -0.006968]","""gpt-3.5-turbo-…","[{""system"",""Please summarize what is going on in this part of the conversation text""}, {""user"",""Glad to help. Is there anything else you want to know?""}]",462,1693854092,"""In this part o…",154,132,286,1693854093,1065
"""23591151-afd7-…","""6b0ba41b-9454-…","""bdeb7ba3-87be-…","""assistant""","""MrBeast, real …",1693900000.0,98,"[12555, 3513, … 13]",91,"[-0.034663, -0.016623, … -0.027447]","""gpt-3.5-turbo-…","[{""system"",""Please summarize what is going on in this part of the conversation text""}, {""user"",""MrBeast, real name Jimmy Donaldson is a popular youtuber from Kansas. His popularity primarily arose from him giving away money to either random strangers, random twitch streamers or his friends, and later started hosting competitions like for example ""Last To Take Hand Off Lamborghini, Keeps It"". As of writing this (5th of february 2023) he counts 132 million subscribers and is 4th most subscribed channel on youtube.""}]",455,1693854093,"""The participan…",101,87,188,1693854094,1217


In [86]:
# Save the thread again, now including the generated columns.
# NOTE(review): this writes to the same path as the earlier save cell, so that file is overwritten.
chat_frame.save(path="./threads/OA.parquet")

In [40]:
# Reload the saved thread from parquet.
# NOTE(review): `load` is not defined in ChatFrame; presumably inherited from BaseThread - confirm.
chat_frame.load(path="./threads/OA.parquet")

In [None]:
from babydragon.chat.base_chat import BaseChat, Prompter
from typing import Union, Generator, Optional, List, Dict, Tuple
import tiktoken
import polars as pl
from babydragon.utils.chatml import (get_mark_from_response,
                                  get_str_from_response, mark_question,
                                  mark_system)

class CustomChat(BaseChat):
    """Chat that records every user message and model reply in a ``ChatFrame``.

    NOTE(review): ``self.tokenizer`` and ``self.model`` are read here but not
    assigned in this class; presumably ``BaseChat.__init__`` sets them - confirm.
    """

    def __init__(self, model: Union[str, None] = None, max_output_tokens: int = 200, memory_thread_name: str = "memory", max_memory: Optional[int] = None):
        """Create the chat and the ``ChatFrame`` that stores its history.

        Args:
            model: Forwarded to ``BaseChat.__init__``.
            max_output_tokens: Forwarded to ``BaseChat.__init__``.
            memory_thread_name: Name given to the backing ``ChatFrame``.
            max_memory: ``max_memory`` given to the backing ``ChatFrame``.
        """
        BaseChat.__init__(self, model=model, max_output_tokens=max_output_tokens)
        self.prompter = Prompter()
        # Give the chat's tokenizer to the frame itself: ChatFrame reads
        # `self.tokenizer`, so attaching it to the DataFrame (as before) had no
        # effect on tokenisation.
        self.chat_frame = ChatFrame(name=memory_thread_name, max_memory=max_memory, tokenizer=self.tokenizer)
        # Convenience reference to the frame's current DataFrame. ChatFrame
        # rebinds its `memory_thread`, so this is refreshed after each add and
        # get_conversation_history() always reads from the frame directly.
        self.memory_thread = self.chat_frame.memory_thread

    def identity_prompter(self, message: str) -> Tuple[List[Dict], str]:
        """Record ``message`` as a user message and return it marked as a question.

        Returns:
            ``([mark_question(message)], mark_question(message))``.
        """
        self.add_message_to_thread(role="user", content=message)
        return [mark_question(message)], mark_question(message)

    def add_message_to_thread(self, role: str, content: str):
        """Append a ``{"role", "content"}`` message to the backing frame."""
        message_dict = {"role": role, "content": content}
        self.chat_frame.add_dict_to_thread(message_dict)
        self.memory_thread = self.chat_frame.memory_thread

    def chat_response(self, prompt: List[dict], max_tokens: Union[int, None] = None, stream: bool = False) -> Union[Generator, Tuple[Dict, bool]]:
        """Get a response from the base chat and record it on success.

        The call is delegated explicitly to ``BaseChat.chat_response``; calling
        ``self.chat_response`` here would re-enter this override and recurse
        until the interpreter's recursion limit.

        NOTE(review): assumes ``BaseChat.chat_response`` returns a
        ``(response, success)`` pair; a generator returned for ``stream=True``
        would not unpack this way - confirm.
        """
        response, success = BaseChat.chat_response(self, prompt, max_tokens, stream)
        if success:
            content = get_str_from_response(response, self.model)
            # The reply is stored under role "system", which is the role
            # get_last_system_message() looks up.
            self.add_message_to_thread(role="system", content=content)
        return response, success

    def get_conversation_history(self) -> pl.DataFrame:
        """Return the frame's current DataFrame."""
        return self.chat_frame.memory_thread

    def get_last_user_message(self) -> pl.DataFrame:
        """Return ``chat_frame.last_message(role="user")``."""
        return self.chat_frame.last_message(role="user")

    def get_last_system_message(self) -> pl.DataFrame:
        """Return ``chat_frame.last_message(role="system")``."""
        return self.chat_frame.last_message(role="system")
    

In [None]:
import umap
import hdbscan
# UMAP is stochastic; a fixed random_state makes the projection, and therefore
# the clusters computed from it, repeatable across kernel restarts.
dim_reduction_model = umap.UMAP(n_neighbors=10, random_state=42)
cluster_model = hdbscan.HDBSCAN()

In [None]:
# Cluster the 'embedding|code' column with the models configured above.
# NOTE(review): `mfp` is not defined in any visible cell of this notebook, so
# this cell fails on a fresh kernel - confirm where it is meant to come from.
mfp.cluster_embeddings(column_name="embedding|code", dim_reduction_model=dim_reduction_model, cluster_model=cluster_model)

In [None]:
import openai
from bertopic import BERTopic  # was used below without being imported (NameError)
from bertopic.representation import OpenAI
import numpy as np
# NOTE(review): `mfp` is not defined in any visible cell of this notebook -
# confirm where it is meant to come from.
#convert code column to list
code_list = mfp.df['code'].to_list()
embeddings = mfp.df['embedding|code'].to_list()
# Topic labels are produced by the OpenAI chat model; the topic model is fitted
# on the precomputed embeddings passed alongside the documents.
representation_model = OpenAI(model="gpt-3.5-turbo-16k", chat=True, nr_docs=10, diversity=0.5)
topic_model = BERTopic(representation_model=representation_model)
topics, probs = topic_model.fit_transform(documents=code_list, embeddings=np.array(embeddings))