In [1]:
import requests
import json
import random
import openai
import tiktoken
import ast
import polars as pl
import os
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import matplotlib.dates as mdates
from babydragon.models.generators.PolarsGenerator import PolarsGenerator


# The OpenAI key is a secret and must never be stored in the notebook itself:
# export OPENAI_API_KEY before starting the kernel. When it is missing we ask
# for it interactively so that it never ends up in the saved file.
# Any key that has ever been written into a saved notebook should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")


# Source frame; its 'code' column is the text that gets summarized.
data = pl.read_parquet('babydragon_frame.parquet')


In [None]:
data

In [3]:
def prepare_input_df(df, messages_col, system_prompt, model="gpt-3.5-turbo-16k-0613"):
    """Build the request frame (``model``, ``messages``) from one column of ``df``.

    Args:
        df: Polars DataFrame holding the text to send.
        messages_col: Name of the column whose values become the user message.
        system_prompt: Text used as the system message of every row.
        model: Value written to the ``model`` column of every row. Defaults to
            the model name that was previously hard-coded in this function.

    Returns:
        A DataFrame with exactly two columns, ``model`` and ``messages`` (in
        that order). Each ``messages`` cell is a two-element list of
        ``{"role", "content"}`` dicts: the system prompt followed by the
        stringified value of ``messages_col``.
    """

    def create_content(value):
        # f-string formatting turns non-string values into text.
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"{value}"},
        ]

    # Build the chat payload from the caller's frame *before* any column is
    # added, so a source column that happens to be called "model" is read
    # intact rather than after being overwritten by the literal below.
    messages = df[messages_col].apply(create_content, return_dtype=pl.List).alias("messages")

    # Select the result columns explicitly instead of dropping `messages_col`:
    # dropping would also delete the new column when the source column is
    # itself named "messages".
    return (
        df.select(messages_col)
        .with_columns([pl.lit(model).alias("model"), messages])
        .select(["model", "messages"])
    )

In [4]:
# Build one chat request per row of `data`, using its 'code' column as the
# user message and a fixed summarization instruction as the system message.
input_df = prepare_input_df(df=data,
                            messages_col='code', 
                            system_prompt="You are a helpful Summarizer. Please summarize the meaning of the code I am gonna show you.")

In [None]:
input_df

In [None]:
## Workaround: nest_asyncio lets asyncio-based code run inside a Jupyter
## notebook. It has to be applied before the generator is executed.
import nest_asyncio
nest_asyncio.apply()




# NOTE(review): presumably `name` determines the 'batch_generator/babydragon_code_*'
# files read in later cells - confirm against PolarsGenerator.
generator = PolarsGenerator( input_df = input_df, name = 'babydragon_code')

generator.execute()

In [36]:
# Load the newline-delimited JSON results from disk.
# NOTE(review): presumably written by generator.execute() - confirm.
output = pl.read_ndjson('batch_generator/babydragon_code_output.ndjson')

In [None]:
output

In [38]:
def define_time_axis(start, end, span):
    """Return evenly spaced UTC datetimes covering ``start`` .. ``end``.

    Args:
        start: POSIX timestamp of the first tick.
        end: POSIX timestamp the axis has to reach.
        span: Distance between consecutive ticks, in whole seconds.

    Returns:
        A polars Series named ``"Time"`` whose first element is ``start`` and
        whose ticks continue in ``span``-second steps until ``end`` is covered.
    """
    first_tick = datetime.utcfromtimestamp(start)
    last_instant = datetime.utcfromtimestamp(end)

    # Whole seconds between the two instants; one extra `span` is added to the
    # range bound so the final tick is not cut off before `end`.
    duration_s = int((last_instant - first_tick).total_seconds())

    ticks = []
    for offset in range(0, duration_s + span, span):
        ticks.append(first_tick + timedelta(seconds=offset))

    return pl.Series("Time", ticks)

# 10-second ticks from the earliest start_time to the latest end_time in `output`.
time_axis = define_time_axis(start=output['start_time'].min(),end=output['end_time'].max(),span=10)

In [None]:
time_axis

In [40]:
# Second name for the same frame as `output` (no copy is made); `df` is
# rebound to the binned, sorted frame in a later cell.
df = output

In [41]:
def get_interval(start_time, end_time, time_intervals):
    """Pick the time-axis tick a request is assigned to.

    Args:
        start_time: POSIX timestamp at which the request started.
        end_time: POSIX timestamp at which the request ended.
        time_intervals: Series of tick datetimes to choose from.

    Returns:
        A one-element polars Series holding the latest tick that is not later
        than both ``start_time`` and ``end_time`` (presumably null when no
        tick qualifies - verify).
    """
    started_at = datetime.utcfromtimestamp(start_time)
    ended_at = datetime.utcfromtimestamp(end_time)

    # NOTE(review): both conditions are upper bounds, so for start <= end the
    # second one never changes the result. If "ticks between start and end"
    # was intended, the first comparison is inverted - confirm before changing.
    not_after_start = time_intervals <= started_at
    not_after_end = time_intervals <= ended_at

    candidates = pl.DataFrame(
        {"Time": time_intervals, "Mask": not_after_start & not_after_end}
    ).filter(pl.col("Mask"))

    return pl.Series([candidates["Time"].max()])
    
    


In [42]:
# Show up to 100 rows when a frame is displayed.
pl.Config.set_tbl_rows(100)

# Attach a 'bin' column: the time-axis tick chosen by get_interval for each row.
# NOTE(review): row[1] and row[6] are positional; presumably they are the
# start_time and end_time columns of `output` - confirm, since any change in
# column order would silently break this.
df_bins = (
    output
    .with_columns(output.apply(lambda row: get_interval(row[1], row[6], time_axis)))
    .rename({'apply': 'bin'})
)

In [None]:
df_bins

In [44]:
# get_interval returns a one-element Series per row, so unwrap each 'bin' cell
# into a plain datetime (None when the cell is empty) and order rows by bin.
df = (
    df_bins
    .with_columns(
        pl.col("bin")
        .apply(lambda cell: x0 if (x0 := None) else (cell[0] if len(cell) > 0 else None), return_dtype=pl.Datetime)
        .alias("bin")
    )
    .sort('bin')
) if False else (
    df_bins
    .with_columns(
        pl.col("bin")
        .apply(lambda cell: cell[0] if len(cell) > 0 else None, return_dtype=pl.Datetime)
        .alias("bin")
    )
    .sort('bin')
)

In [None]:
df

In [46]:
# Sum of total_tokens per time bin, ordered by bin.
df_grouped = (
    df
    .groupby("bin")
    .agg(pl.col("total_tokens").sum())
    .sort("bin")
)

In [None]:
df_grouped

In [50]:
# x values for the plot: the time bins, as a NumPy array.
x = df_grouped['bin'].to_numpy()

In [49]:
# y values for the plot: summed tokens per bin, as a NumPy array.
y = df_grouped['total_tokens'].to_numpy()

In [None]:
# Line chart of the per-bin token totals: `x` holds the bins, `y` the sums.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y)

# Axis labels and title, set on the axes object in one call.
ax.set(xlabel='Time', ylabel='Total Tokens', title='Total Tokens Over Time')

# Show only the clock time (hours:minutes:seconds) on the x axis.
xformatter = mdates.DateFormatter('%H:%M:%S')
ax.xaxis.set_major_formatter(xformatter)

plt.xticks(rotation=45)  # tilt the tick labels so they do not overlap
ax.grid()
plt.show()


In [32]:
# Load the newline-delimited JSON log file that sits next to the output file.
logs = pl.read_ndjson('batch_generator/babydragon_code_log.ndjson')

In [None]:
logs

In [None]:
# Preview only (the result is not assigned): `output` with 'id' cast to UInt32.
output.with_columns(pl.col('id').cast(pl.UInt32))

In [69]:
# Attach each generated summary to its source row. `data` gets a row-index
# column named 'id'; the 'id' column of `output` is cast to UInt32 before the
# join so both key columns have the same dtype.
# NOTE(review): this assumes the ids in `output` are row positions of `data` - confirm.
merged_data = data.with_row_count('id').join(
    output
    .with_columns(pl.col('id').cast(pl.UInt32))
    .select('output', 'id')
    .sort('id'),
    on="id",
)

In [78]:
merged_data

id,code,libcst tree,filename,output
u32,str,str,str,str
0,""" class Embedda…","""ClassDef(  …","""/Users/danielh…","""The code defin…"
1,""" def infer_emb…","""FunctionDef(  …","""/Users/danielh…","""This code defi…"
2,""" def numeric_e…","""FunctionDef(  …","""/Users/danielh…","""The code defin…"
3,""" class Embedd…","""ClassDef(  …","""/Users/danielh…","""This code is d…"
4,"""def __init__( …","""FunctionDef(  …","""/Users/danielh…","""This code is d…"
5,""" def _execute_…","""FunctionDef(  …","""/Users/danielh…","""This code is d…"
6,""" def parallel_…","""FunctionDef(  …","""/Users/danielh…","""The code defin…"
7,""" class TopicT…","""ClassDef(  …","""/Users/danielh…","""The code provi…"
8,"""def __init__( …","""FunctionDef(  …","""/Users/danielh…","""This code is f…"
9,""" def _setup_m…","""FunctionDef(  …","""/Users/danielh…","""The code is de…"
