In [1]:
import polars as pl
from typing import List
from datetime import datetime
import cynde.functional as cf
import os

In [2]:
import openai
# Read the OpenAI API key from the environment (never hard-code it) and build
# the client that is later passed to the embedding call.
api_key = os.getenv("OPENAI_API_KEY")
client = openai.Client(api_key=api_key)


In [3]:
# Get the current working directory with os.getcwd()

In [4]:
# The notebook is expected to run from /cynde/experiments, so the parent of the
# working directory is the /cynde project root and the cache lives in /cynde/cache.
script_dir = os.getcwd()
parent_dir = os.path.dirname(script_dir)
cache_dir = os.path.join(parent_dir, "cache")

# Create the cache directory on first use; a no-op when it already exists.
os.makedirs(cache_dir, exist_ok=True)

# Request/result files inside the cache directory. The names with a trailing
# underscore are a second, independent pair of files.
(
    requests_filepath,
    results_filepath,
    requests_filepath_,
    results_filepath_,
) = (
    os.path.join(cache_dir, filename)
    for filename in (
        "chat_payloads.jsonl",
        "openai_results.jsonl",
        "chat_payloads_.jsonl",
        "openai_results_.jsonl",
    )
)

In [5]:
# Small example frame: one row per customer with an id, free-text feedback,
# a list of ratings and the time the feedback was given.
df = pl.DataFrame(
    dict(
        customer_id=[101, 102, 103],
        feedback=[
            "Loved the new product line!",
            "The service was disappointing this time.",
            "Great experience with customer support.",
        ],
        ratings=[[4, 5, 5], [2, 3, 2], [5, 4, 5]],
        timestamp=[
            datetime(2023, 1, day, hour, minute)
            for day, hour, minute in ((1, 14, 30), (2, 9, 15), (3, 18, 45))
        ],
    )
)
print(df)

shape: (3, 4)
┌─────────────┬───────────────────────────────────┬───────────┬─────────────────────┐
│ customer_id ┆ feedback                          ┆ ratings   ┆ timestamp           │
│ ---         ┆ ---                               ┆ ---       ┆ ---                 │
│ i64         ┆ str                               ┆ list[i64] ┆ datetime[μs]        │
╞═════════════╪═══════════════════════════════════╪═══════════╪═════════════════════╡
│ 101         ┆ Loved the new product line!       ┆ [4, 5, 5] ┆ 2023-01-01 14:30:00 │
│ 102         ┆ The service was disappointing th… ┆ [2, 3, 2] ┆ 2023-01-02 09:15:00 │
│ 103         ┆ Great experience with customer s… ┆ [5, 4, 5] ┆ 2023-01-03 18:45:00 │
└─────────────┴───────────────────────────────────┴───────────┴─────────────────────┘


In [6]:
# Embed the free-text "feedback" column with the OpenAI client; the result
# carries one extra embedding column next to the original ones.
embedded_df = cf.embed_columns(
    df,
    ["feedback"],
    client=client,
)
print(embedded_df)

Creating embeddings for column feedback
Processing 3 chunks of text in a single batch
Embedding Processing took 0.5426173210144043 seconds
shape: (3, 5)
┌─────────────┬─────────────────────────┬───────────┬─────────────────────┬────────────────────────┐
│ customer_id ┆ feedback                ┆ ratings   ┆ timestamp           ┆ feedback_text-embeddin │
│ ---         ┆ ---                     ┆ ---       ┆ ---                 ┆ g-3-small_…            │
│ i64         ┆ str                     ┆ list[i64] ┆ datetime[μs]        ┆ ---                    │
│             ┆                         ┆           ┆                     ┆ list[f64]              │
╞═════════════╪═════════════════════════╪═══════════╪═════════════════════╪════════════════════════╡
│ 101         ┆ Loved the new product   ┆ [4, 5, 5] ┆ 2023-01-01 14:30:00 ┆ [0.029205, -0.036287,  │
│             ┆ line!                   ┆           ┆                     ┆ … 0.000765…            │
│ 102         ┆ The service was        

In [7]:
import polars as pl

# Prompt template: one "{}" placeholder per expression in the list passed to
# cf.prompt below (six of each).
fstring = "Customer ID: {} provided feedback at {} with ratings {} an average rating of {} with a global mean of {}: '{}'"

# Dynamic prompt generation with in-select computations. The expressions are
# presumably substituted into the placeholders in order -- confirm in cf.prompt.
df_prompted = cf.prompt(
    embedded_df,
    fstring,
    [
        pl.col("customer_id"),
        # hour extracted from the timestamp
        pl.col("timestamp").dt.hour(),
        # list columns must be turned into strings: cast each element, join with "-"
        pl.col("ratings").list.eval(pl.element().cast(pl.Utf8)).list.join("-"),
        # per-row mean of the ratings list (list -> float)
        pl.col("ratings").list.mean(),
        # mean of the per-row means: a single value broadcast to every row
        pl.col("ratings").list.mean().mean(),
        pl.col("feedback"),
    ],
    "customer_prompt",
)
print(df_prompted)

# The table view truncates long strings, so print every prompt in full as well.
for prompt in df_prompted["customer_prompt"]:
    print(prompt)

shape: (3, 6)
┌─────────────┬───────────────────┬───────────┬──────────────┬──────────────────┬──────────────────┐
│ customer_id ┆ feedback          ┆ ratings   ┆ timestamp    ┆ feedback_text-em ┆ customer_prompt  │
│ ---         ┆ ---               ┆ ---       ┆ ---          ┆ bedding-3-small_ ┆ ---              │
│ i64         ┆ str               ┆ list[i64] ┆ datetime[μs] ┆ …                ┆ str              │
│             ┆                   ┆           ┆              ┆ ---              ┆                  │
│             ┆                   ┆           ┆              ┆ list[f64]        ┆                  │
╞═════════════╪═══════════════════╪═══════════╪══════════════╪══════════════════╪══════════════════╡
│ 101         ┆ Loved the new     ┆ [4, 5, 5] ┆ 2023-01-01   ┆ [0.029205,       ┆ Customer ID: 101 │
│             ┆ product line!     ┆           ┆ 14:30:00     ┆ -0.036287, …     ┆ provided feedba… │
│             ┆                   ┆           ┆              ┆ 0.000765…     

In [8]:
from cynde.functional.generate import generate_chat_completion_payloads, generate_chat_payloads_from_column

In [9]:
# System message used for every chat payload: asks for a True/False sentiment verdict.
system_prompt = (
    "Evaluate the following customer feedback "
    "return a True or False based on the sentiment:"
)

In [10]:
# Build one chat payload per row of the "customer_prompt" column, pairing each
# prompt with the system prompt. The payloads are presumably also written to
# requests_filepath, since the request runner reads that file -- confirm in cynde.
payload_df = generate_chat_payloads_from_column(
    requests_filepath,
    df_prompted,
    "customer_prompt",
    system_prompt,
)
print(payload_df)

shape: (3, 2)
┌───────────────────────────────────┬───────────────────────────────────┐
│ customer_prompt                   ┆ str_messages                      │
│ ---                               ┆ ---                               │
│ str                               ┆ str                               │
╞═══════════════════════════════════╪═══════════════════════════════════╡
│ Customer ID: 101 provided feedba… ┆ {"role":"system","content":"Eval… │
│ Customer ID: 102 provided feedba… ┆ {"role":"system","content":"Eval… │
│ Customer ID: 103 provided feedba… ┆ {"role":"system","content":"Eval… │
└───────────────────────────────────┴───────────────────────────────────┘


In [11]:
import nest_asyncio
# Patch asyncio so that asyncio.run() can be called from inside the notebook,
# where an event loop is presumably already running in the kernel.
nest_asyncio.apply()
import asyncio
from cynde.async_tools.api_request_parallel_processor import process_api_requests_from_file

In [12]:
from cynde.functional.generate import process_and_merge_llm_responses

In [13]:
# Chat-completions endpoint; replace with your actual API endpoint if needed.
request_url = "https://api.openai.com/v1/chat/completions"

# Process the API requests stored in requests_filepath in parallel and save
# the results to results_filepath.
asyncio.run(
    process_api_requests_from_file(
        requests_filepath=requests_filepath,
        save_filepath=results_filepath,
        request_url=request_url,
        api_key=api_key,
        max_requests_per_minute=90000.0,
        max_tokens_per_minute=170000.0,
        token_encoding_name="cl100k_base",
        max_attempts=5,
        logging_level=20,  # numeric value of logging.INFO
    )
)

INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2
INFO:root:Parallel processing complete. Results saved to c:\Users\Tommaso\Documents\Dev\Cynde\cache\openai_results.jsonl


In [14]:
from cynde.functional.generate import merge_df_with_openai_results,load_openai_results_jsonl


In [15]:
# Load the saved API results from the JSONL file back into a DataFrame.
results_df = load_openai_results_jsonl(
    results_filepath,
)
print(results_df)

shape: (3, 4)
┌──────────────────────────┬───────────────────────┬───────────┬───────────────────────────────────┐
│ messages                 ┆ choices               ┆ usage     ┆ str_messages                      │
│ ---                      ┆ ---                   ┆ ---       ┆ ---                               │
│ list[struct[2]]          ┆ struct[2]             ┆ struct[3] ┆ str                               │
╞══════════════════════════╪═══════════════════════╪═══════════╪═══════════════════════════════════╡
│ [{"system","Evaluate the ┆ {"assistant","True"}  ┆ {79,1,80} ┆ {"role":"system","content":"Eval… │
│ followi…                 ┆                       ┆           ┆                                   │
│ [{"system","Evaluate the ┆ {"assistant","False"} ┆ {80,1,81} ┆ {"role":"system","content":"Eval… │
│ followi…                 ┆                       ┆           ┆                                   │
│ [{"system","Evaluate the ┆ {"assistant","True"}  ┆ {78,1,79} ┆ {"role":"sys

In [16]:
# Attach the API results to the prompted frame, using the payload frame and
# the "customer_prompt" column to line the rows up.
merged_df = merge_df_with_openai_results(
    df_prompted,
    payload_df,
    results_df,
    "customer_prompt",
)
print(merged_df)

shape: (3, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ customer_ ┆ feedback  ┆ ratings   ┆ timestamp ┆ … ┆ customer_ ┆ messages  ┆ choices   ┆ usage    │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ prompt    ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ str       ┆ list[i64] ┆ datetime[ ┆   ┆ ---       ┆ list[stru ┆ struct[2] ┆ struct[3 │
│ i64       ┆           ┆           ┆ μs]       ┆   ┆ str       ┆ ct[2]]    ┆           ┆ ]        │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 101       ┆ Loved the ┆ [4, 5, 5] ┆ 2023-01-0 ┆ … ┆ Customer  ┆ [{"system ┆ {"assista ┆ {79,1,80 │
│           ┆ new       ┆           ┆ 1         ┆   ┆ ID: 101   ┆ ","Evalua ┆ nt","True ┆ }        │
│           ┆ product   ┆           ┆ 14:30:00  ┆   ┆ provided  ┆ te the    ┆ "}        ┆          │
│           ┆ line!     ┆           ┆           ┆   ┆ feedba…   ┆ followi…  ┆

In [17]:
# Single-call variant of the steps above: payload generation, API requests and
# merging, using the second pair of cache files.
# NOTE(review): the recorded output for this cell shows an OSError on a path
# with "_" prepended to the full path of chat_payloads.jsonl, which does not
# match requests_filepath_ as passed here -- the output looks stale; re-run
# and confirm how cynde derives its file names.
merged_df = process_and_merge_llm_responses(
    df=df_prompted,
    column_name="customer_prompt",
    system_prompt=system_prompt,
    requests_filepath=requests_filepath_,
    results_filepath=results_filepath_,
    api_key=api_key,
)
print(merged_df)

Generating chat completion payloads...


OSError: [Errno 22] Invalid argument: '_c:\\Users\\Tommaso\\Documents\\Dev\\Cynde\\cache\\chat_payloads.jsonl'