# Imports and Setup

In [1]:
import polars as pl
from typing import List
from datetime import datetime
import cynde.functional as cf
import os
import json

In [2]:
import openai
api_key = os.getenv("OPENAI_API_KEY")#
client = openai.Client(api_key=api_key)


In [3]:
#get current path with os.getcwd()

In [3]:
script_dir = os.getcwd()
# Navigate one directory up to reach /cynde from /cynde/experiments
parent_dir = os.path.dirname(script_dir)

# Define the cache directory path as /cynde/cache
cache_dir = os.path.join(parent_dir, "cache")


# Ensure the cache directory exists, create if it doesn't
os.makedirs(cache_dir, exist_ok=True)

# Define file paths within the /cynde/cache directory
requests_filepath = os.path.join(cache_dir, "chat_payloads.jsonl")
emb_requests_filepath = os.path.join(cache_dir, "chat_payloads_emb.jsonl")

results_filepath = os.path.join(cache_dir, "openai_results.jsonl")
emb_results_file_path = os.path.join(cache_dir, "openai_results_emb.jsonl")
requests_filepath_ = os.path.join(cache_dir, "chat_payloads_.jsonl")
results_filepath_ = os.path.join(cache_dir, "openai_results_.jsonl")
requests_filepath_tools = os.path.join(cache_dir, "chat_payloads_tools.jsonl")
results_filepath_tools = os.path.join(cache_dir, "openai_results_tools.jsonl")


In [5]:
df = pl.DataFrame(
    {
        "customer_id": [101, 102, 103],
        "feedback": [
            "Loved the new product line!",
            "The service was disappointing this time.",
            "Great experience with customer support.",
        ],
        "ratings": [[4, 5, 5], [2, 3, 2], [5, 4, 5]],
        "timestamp": [
            datetime(2023, 1, 1, 14, 30),
            datetime(2023, 1, 2, 9, 15),
            datetime(2023, 1, 3, 18, 45),
        ],
    }
)
print(df)

shape: (3, 4)
┌─────────────┬───────────────────────────────────┬───────────┬─────────────────────┐
│ customer_id ┆ feedback                          ┆ ratings   ┆ timestamp           │
│ ---         ┆ ---                               ┆ ---       ┆ ---                 │
│ i64         ┆ str                               ┆ list[i64] ┆ datetime[μs]        │
╞═════════════╪═══════════════════════════════════╪═══════════╪═════════════════════╡
│ 101         ┆ Loved the new product line!       ┆ [4, 5, 5] ┆ 2023-01-01 14:30:00 │
│ 102         ┆ The service was disappointing th… ┆ [2, 3, 2] ┆ 2023-01-02 09:15:00 │
│ 103         ┆ Great experience with customer s… ┆ [5, 4, 5] ┆ 2023-01-03 18:45:00 │
└─────────────┴───────────────────────────────────┴───────────┴─────────────────────┘


# Batch API Request

In [6]:
import polars as pl

# Sample data frame initialization

fstring = "Customer ID: {} provided feedback at {} with ratings {} an average rating of {} with a global mean of {}: '{}'"
# Dynamic prompt generation with in-select computations

df_prompted = cf.prompt(df, 
                     fstring,
                     [pl.col("customer_id"),
                      pl.col("timestamp").dt.hour(), #from timestamp to hour
                      pl.col("ratings").list.eval(pl.element().cast(pl.Utf8)).list.join("-"), #needs to convert list columns to string
                      pl.col("ratings").list.mean(), #from list to float
                      pl.col("ratings").list.mean().mean(), #constant that gets broadcasted with pl.lit
                      pl.col("feedback")],
                      "customer_prompt")
print(df_prompted)
for prompt in df_prompted["customer_prompt"]:
        print(prompt)

shape: (3, 5)
┌─────────────┬───────────────────────────────┬───────────┬─────────────────────┬──────────────────┐
│ customer_id ┆ feedback                      ┆ ratings   ┆ timestamp           ┆ customer_prompt  │
│ ---         ┆ ---                           ┆ ---       ┆ ---                 ┆ ---              │
│ i64         ┆ str                           ┆ list[i64] ┆ datetime[μs]        ┆ str              │
╞═════════════╪═══════════════════════════════╪═══════════╪═════════════════════╪══════════════════╡
│ 101         ┆ Loved the new product line!   ┆ [4, 5, 5] ┆ 2023-01-01 14:30:00 ┆ Customer ID: 101 │
│             ┆                               ┆           ┆                     ┆ provided feedba… │
│ 102         ┆ The service was disappointing ┆ [2, 3, 2] ┆ 2023-01-02 09:15:00 ┆ Customer ID: 102 │
│             ┆ th…                           ┆           ┆                     ┆ provided feedba… │
│ 103         ┆ Great experience with         ┆ [5, 4, 5] ┆ 2023-01-03 18:45:

## Batch Embedding API

In [7]:
from cynde.functional.embed import embed_column, embed_columns
from cynde.async_tools.api_request_parallel_processor import process_api_requests_from_file


import nest_asyncio
nest_asyncio.apply()
import asyncio

### Embed a single column

In [8]:
merged_df = embed_column(df_prompted, "customer_prompt", emb_requests_filepath, emb_results_file_path, api_key, model_name="text-embedding-3-small")
print(merged_df)



INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2
INFO:root:Parallel processing complete. Results saved to /Users/hk3user/Documents/Dev/Cynde/cache/openai_results_emb.jsonl


shape: (30, 6)
┌─────────────┬───────────────────┬───────────┬──────────────┬──────────────────┬──────────────────┐
│ customer_id ┆ feedback          ┆ ratings   ┆ timestamp    ┆ customer_prompt  ┆ customer_prompt_ │
│ ---         ┆ ---               ┆ ---       ┆ ---          ┆ ---              ┆ text-embedding-3 │
│ i64         ┆ str               ┆ list[i64] ┆ datetime[μs] ┆ str              ┆ …                │
│             ┆                   ┆           ┆              ┆                  ┆ ---              │
│             ┆                   ┆           ┆              ┆                  ┆ list[f64]        │
╞═════════════╪═══════════════════╪═══════════╪══════════════╪══════════════════╪══════════════════╡
│ 101         ┆ Loved the new     ┆ [4, 5, 5] ┆ 2023-01-01   ┆ Customer ID: 101 ┆ [-0.01657,       │
│             ┆ product line!     ┆           ┆ 14:30:00     ┆ provided feedba… ┆ 0.001133, …      │
│             ┆                   ┆           ┆              ┆              

### Embed Multiple Columns

In [9]:
embed_columns(df=df_prompted, column_names=["customer_prompt", "feedback"], models=["text-embedding-3-small"], api_key=api_key)



INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2
INFO:root:Parallel processing complete. Results saved to /Users/hk3user/Documents/Dev/Cynde/cache/customer_prompt_text-embedding-3-small_results.jsonl
INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2


Embeddings for column 'customer_prompt' with model 'text-embedding-3-small' have been merged into the DataFrame.


INFO:root:Parallel processing complete. Results saved to /Users/hk3user/Documents/Dev/Cynde/cache/feedback_text-embedding-3-small_results.jsonl


Embeddings for column 'feedback' with model 'text-embedding-3-small' have been merged into the DataFrame.


customer_id,feedback,ratings,timestamp,customer_prompt,customer_prompt_text-embedding-3-small_embedding,feedback_text-embedding-3-small_embedding
i64,str,list[i64],datetime[μs],str,list[f64],list[f64]
101,"""Loved the new …","[4, 5, 5]",2023-01-01 14:30:00,"""Customer ID: 1…","[-0.016553, 0.001147, … -0.042199]","[0.029176, -0.036259, … 0.000736]"
102,"""The service wa…","[2, 3, 2]",2023-01-02 09:15:00,"""Customer ID: 1…","[-0.037493, 0.013733, … -0.030259]","[-0.005782, 0.019236, … -0.004272]"
103,"""Great experien…","[5, 4, 5]",2023-01-03 18:45:00,"""Customer ID: 1…","[-0.019336, -0.001893, … -0.026274]","[-0.014194, -0.027349, … 0.021451]"


## Batch Chat Completion API

In [8]:
from cynde.functional.generate import generate_chat_completion_payloads, generate_chat_payloads_from_column

In [9]:
system_prompt = "Evaluate the following customer feedback return a True or False based on the sentiment:"

In [10]:
payload_df = generate_chat_payloads_from_column(requests_filepath, df_prompted, "customer_prompt", system_prompt)
print(payload_df)

Using Pydantic Model inside None
shape: (3, 2)
┌───────────────────────────────────┬───────────────────────────────────┐
│ customer_prompt                   ┆ str_messages                      │
│ ---                               ┆ ---                               │
│ str                               ┆ str                               │
╞═══════════════════════════════════╪═══════════════════════════════════╡
│ Customer ID: 101 provided feedba… ┆ {"role":"system","content":"Eval… │
│ Customer ID: 102 provided feedba… ┆ {"role":"system","content":"Eval… │
│ Customer ID: 103 provided feedba… ┆ {"role":"system","content":"Eval… │
└───────────────────────────────────┴───────────────────────────────────┘


In [11]:

nest_asyncio.apply()


In [12]:
from cynde.functional.generate import process_and_merge_llm_responses

In [13]:
request_url = "https://api.openai.com/v1/chat/completions"  # Replace with your actual API endpoint
    # Process multiple api requests to ChatGPT
asyncio.run(
    process_api_requests_from_file(
        requests_filepath=requests_filepath,
        save_filepath=results_filepath,
        request_url=request_url,
        api_key=api_key,
        max_requests_per_minute=float(90000),
        max_tokens_per_minute=float(170000),
        token_encoding_name="cl100k_base",
        max_attempts=int(5),
        logging_level=int(20),
        )
    )

INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2
INFO:root:Parallel processing complete. Results saved to c:\Users\Tommaso\Documents\Dev\Cynde\cache\openai_results.jsonl


### Merge Results

In [14]:
from cynde.functional.generate import merge_df_with_openai_results,load_openai_results_jsonl


In [15]:
results_df = load_openai_results_jsonl(results_filepath)
print(results_df)

shape: (3, 5)
┌─────────────────────┬─────────────────────┬───────────┬─────────────────────┬────────────────────┐
│ messages            ┆ choices             ┆ usage     ┆ results             ┆ str_messages       │
│ ---                 ┆ ---                 ┆ ---       ┆ ---                 ┆ ---                │
│ list[struct[2]]     ┆ struct[2]           ┆ struct[3] ┆ struct[7]           ┆ str                │
╞═════════════════════╪═════════════════════╪═══════════╪═════════════════════╪════════════════════╡
│ [{"system","Evaluat ┆ {"assistant","False ┆ {80,1,81} ┆ {"chatcmpl-8ohMf68J ┆ {"role":"system"," │
│ e the followi…      ┆ "}                  ┆           ┆ VYbra9wxtdR4P…      ┆ content":"Eval…    │
│ [{"system","Evaluat ┆ {"assistant","True" ┆ {78,1,79} ┆ {"chatcmpl-8ohMfqjH ┆ {"role":"system"," │
│ e the followi…      ┆ }                   ┆           ┆ JfCA8LIfCZuxc…      ┆ content":"Eval…    │
│ [{"system","Evaluat ┆ {"assistant","True" ┆ {79,1,80} ┆ {"chatcmpl-8ohMgNc5

In [16]:
merged_df = merge_df_with_openai_results(df_prompted, payload_df, results_df, "customer_prompt")
print(merged_df)

shape: (3, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ customer_ ┆ feedback  ┆ ratings   ┆ timestamp ┆ … ┆ messages  ┆ choices   ┆ usage     ┆ results  │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ str       ┆ list[i64] ┆ datetime[ ┆   ┆ list[stru ┆ struct[2] ┆ struct[3] ┆ struct[7 │
│ i64       ┆           ┆           ┆ μs]       ┆   ┆ ct[2]]    ┆           ┆           ┆ ]        │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 101       ┆ Loved the ┆ [4, 5, 5] ┆ 2023-01-0 ┆ … ┆ [{"system ┆ {"assista ┆ {79,1,80} ┆ {"chatcm │
│           ┆ new       ┆           ┆ 1         ┆   ┆ ","Evalua ┆ nt","True ┆           ┆ pl-8ohMg │
│           ┆ product   ┆           ┆ 14:30:00  ┆   ┆ te the    ┆ "}        ┆           ┆ Nc5eVHGE │
│           ┆ line!     ┆           ┆           ┆   ┆ followi…  ┆           

In [17]:
merged_df = process_and_merge_llm_responses(df= df_prompted,
                                column_name= "customer_prompt",
                                system_prompt = system_prompt,
                                requests_filepath = requests_filepath_,
                                results_filepath = results_filepath_,
                                api_key=api_key,)
print(merged_df)

INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2


Generating chat completion payloads...
Using Pydantic Model before calling None
Using Pydantic Model inside None
Chat completion payloads generated in 0.00 seconds.
Processing chat completion payloads with the LLM...


INFO:root:Parallel processing complete. Results saved to c:\Users\Tommaso\Documents\Dev\Cynde\cache\openai_results_.jsonl


Chat completion payloads processed in 0.53 seconds.
Loading results from LLM processing...
Results loaded in 0.01 seconds.
Merging LLM results back into the original DataFrame...
LLM results merged in 0.00 seconds.
Total process completed in 0.54 seconds.
shape: (3, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ customer_ ┆ feedback  ┆ ratings   ┆ timestamp ┆ … ┆ messages  ┆ choices   ┆ usage     ┆ results  │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ str       ┆ list[i64] ┆ datetime[ ┆   ┆ list[stru ┆ struct[2] ┆ struct[3] ┆ struct[7 │
│ i64       ┆           ┆           ┆ μs]       ┆   ┆ ct[2]]    ┆           ┆           ┆ ]        │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 101       ┆ Loved the ┆ [4, 5, 5] ┆ 2023-01-0 ┆ … ┆ [{"system ┆ {"assista ┆ {79,1,80} ┆ {"chatcm │
│           ┆ new     

In [18]:
from instructor.function_calls import openai_schema
from pydantic import BaseModel, Field
import enum
from typing import Optional, List
from cynde.utils.expressions import list_struct_to_string

## Instructor

In [19]:

class CustomerSentimentLabels(str, enum.Enum):
    """Enumeration for single-label customer sentiment classification."""
    POS = "PositiveCustomerSentiment"
    NEG = "NegativeCustomerSentiment"

class SentimentLabeller(BaseModel):
    """
    Class for a single class label prediction.
    """
    class_label: CustomerSentimentLabels
    extra_details: Optional[str] = Field(None, description="Extra details used for the prediction.")

In [24]:
merged_df_funct = process_and_merge_llm_responses(df= df_prompted,
                                column_name= "customer_prompt",
                                system_prompt = system_prompt,
                                requests_filepath = requests_filepath_tools,
                                results_filepath = results_filepath_tools,
                                pydantic_model=SentimentLabeller,
                                api_key=api_key,)
merged_df_funct["choices"]

INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2


Generating chat completion payloads...
Using Pydantic Model before calling <class '__main__.SentimentLabeller'>
Using Pydantic Model inside <class '__main__.SentimentLabeller'>
Using Pydantic Model
Using Function Calling SentimentLabeller
Chat completion payloads generated in 0.00 seconds.
Processing chat completion payloads with the LLM...


INFO:root:Parallel processing complete. Results saved to c:\Users\Tommaso\Documents\Dev\Cynde\cache\openai_results_tools.jsonl


Chat completion payloads processed in 0.60 seconds.
Loading results from LLM processing...
Results loaded in 0.01 seconds.
Merging LLM results back into the original DataFrame...
LLM results merged in 0.00 seconds.
Total process completed in 0.61 seconds.


choices
struct[3]
"{""assistant"",null,[{""call_s4iKZka6P5AxzMDphu4oJpHa"",""function"",{""SentimentLabeller"",""{""class_label"":""PositiveCustomerSentiment""}""}}]}"
"{""assistant"",null,[{""call_xmbr4alXiv5WNqkR0fgu5uwo"",""function"",{""SentimentLabeller"",""{""class_label"":""PositiveCustomerSentiment""}""}}]}"
"{""assistant"",null,[{""call_AbVq7oJXx97ayU6xNruIpCnR"",""function"",{""SentimentLabeller"",""{""class_label"":""NegativeCustomerSentiment""}""}}]}"
"{""assistant"",null,[{""call_FUyXnrvlrWACvp3IHce5Nb7x"",""function"",{""SentimentLabeller"",""{""class_label"":""NegativeCustomerSentiment""}""}}]}"
"{""assistant"",null,[{""call_bP7y8aLMA7i8kgchpQDIGaCL"",""function"",{""SentimentLabeller"",""{""class_label"":""PositiveCustomerSentiment""}""}}]}"
"{""assistant"",null,[{""call_SCttvGUCIv7xDt0ed73oDS3X"",""function"",{""SentimentLabeller"",""{""class_label"":""PositiveCustomerSentiment""}""}}]}"


In [21]:
schema = openai_schema(SentimentLabeller)

In [23]:
from cynde.functional.generate import load_openai_results_jsonl_pydantic
from pydantic import ValidationError

# Load and parse the file
completions = load_openai_results_jsonl_pydantic(results_filepath_tools)
pydantic_objects = []
# Print or process the loaded completions as needed
for completion in completions:
    try:
        out = schema.from_response(completion)
        print(out)
        pydantic_objects.append(out)
    except ValidationError as e:
        print(e)


class_label=<CustomerSentimentLabels.POS: 'PositiveCustomerSentiment'> extra_details=None
class_label=<CustomerSentimentLabels.POS: 'PositiveCustomerSentiment'> extra_details=None
class_label=<CustomerSentimentLabels.NEG: 'NegativeCustomerSentiment'> extra_details=None
