In [1]:
import polars as pl
from typing import List
from datetime import datetime

# from cynde.functional.embed_columns import embed_columns,create_embedding_df_from_column
# from cynde.functional.prompt import prompt
import cynde.functional as cf

In [2]:
import os

import openai

# Take the API key from the environment instead of embedding it in the
# notebook, so that no secret is committed together with the code and outputs.
# If OPENAI_API_KEY is unset, None is passed and the openai client performs its
# own lookup of that variable, raising an error when no key is available.
client = openai.Client(api_key=os.environ.get("OPENAI_API_KEY"))


In [3]:
# Toy customer-feedback table used by the rest of the notebook: an integer id,
# a free-text comment, a list of integer ratings, and a submission timestamp.
df = pl.DataFrame(
    dict(
        customer_id=[101, 102, 103],
        feedback=[
            "Loved the new product line!",
            "The service was disappointing this time.",
            "Great experience with customer support.",
        ],
        ratings=[[4, 5, 5], [2, 3, 2], [5, 4, 5]],
        # One (day, hour, minute) triple per row, all in January 2023.
        timestamp=[
            datetime(2023, 1, day, hour, minute)
            for day, hour, minute in ((1, 14, 30), (2, 9, 15), (3, 18, 45))
        ],
    )
)
print(df)

shape: (3, 4)
┌─────────────┬───────────────────────────────────┬───────────┬─────────────────────┐
│ customer_id ┆ feedback                          ┆ ratings   ┆ timestamp           │
│ ---         ┆ ---                               ┆ ---       ┆ ---                 │
│ i64         ┆ str                               ┆ list[i64] ┆ datetime[μs]        │
╞═════════════╪═══════════════════════════════════╪═══════════╪═════════════════════╡
│ 101         ┆ Loved the new product line!       ┆ [4, 5, 5] ┆ 2023-01-01 14:30:00 │
│ 102         ┆ The service was disappointing th… ┆ [2, 3, 2] ┆ 2023-01-02 09:15:00 │
│ 103         ┆ Great experience with customer s… ┆ [5, 4, 5] ┆ 2023-01-03 18:45:00 │
└─────────────┴───────────────────────────────────┴───────────┴─────────────────────┘


In [4]:
# Embed the text of the "feedback" column, passing the OpenAI client created
# earlier in the notebook. Per the recorded output, the result keeps the four
# original columns and gains a list[f64] column whose name starts with
# "feedback_text-embedding-3-small_".
embedded_df = cf.embed_columns(df, ["feedback"], client=client)
print(embedded_df)

Creating embeddings for column feedback
Processing 3 chunks of text in a single batch
Embedding Processing took 0.5150561332702637 seconds
shape: (3, 5)
┌─────────────┬─────────────────────────┬───────────┬─────────────────────┬────────────────────────┐
│ customer_id ┆ feedback                ┆ ratings   ┆ timestamp           ┆ feedback_text-embeddin │
│ ---         ┆ ---                     ┆ ---       ┆ ---                 ┆ g-3-small_…            │
│ i64         ┆ str                     ┆ list[i64] ┆ datetime[μs]        ┆ ---                    │
│             ┆                         ┆           ┆                     ┆ list[f64]              │
╞═════════════╪═════════════════════════╪═══════════╪═════════════════════╪════════════════════════╡
│ 101         ┆ Loved the new product   ┆ [4, 5, 5] ┆ 2023-01-01 14:30:00 ┆ [0.029205, -0.036287,  │
│             ┆ line!                   ┆           ┆                     ┆ … 0.000765…            │
│ 102         ┆ The service was        

In [10]:
# Template for one prompt per row. The six "{}" placeholders are filled, in
# order, by the six expressions passed to cf.prompt below.
fstring = "Customer ID: {} provided feedback at {} with ratings {} an average rating of {} with a global mean of {}: '{}'"

# Build the prompt column, computing several of the interpolated values on the
# fly from the existing columns. `pl` is already imported in the first cell,
# so it is not re-imported here.
df_prompted = cf.prompt(
    embedded_df,
    fstring,
    [
        pl.col("customer_id"),
        # Hour of day extracted from the timestamp.
        pl.col("timestamp").dt.hour(),
        # List columns must be rendered as text: cast each element to a
        # string, then join the elements with "-".
        pl.col("ratings").list.eval(pl.element().cast(pl.Utf8)).list.join("-"),
        # Per-row average of the ratings list (list -> float).
        pl.col("ratings").list.mean(),
        # Mean of the per-row averages: a single scalar, which shows up with
        # the same value on every row of the resulting prompts.
        pl.col("ratings").list.mean().mean(),
        pl.col("feedback"),
    ],
    "customer_prompt",
)
print(df_prompted)

shape: (3, 6)
┌─────────────┬───────────────────┬───────────┬──────────────┬──────────────────┬──────────────────┐
│ customer_id ┆ feedback          ┆ ratings   ┆ timestamp    ┆ feedback_text-em ┆ customer_prompt  │
│ ---         ┆ ---               ┆ ---       ┆ ---          ┆ bedding-3-small_ ┆ ---              │
│ i64         ┆ str               ┆ list[i64] ┆ datetime[μs] ┆ …                ┆ str              │
│             ┆                   ┆           ┆              ┆ ---              ┆                  │
│             ┆                   ┆           ┆              ┆ list[f64]        ┆                  │
╞═════════════╪═══════════════════╪═══════════╪══════════════╪══════════════════╪══════════════════╡
│ 101         ┆ Loved the new     ┆ [4, 5, 5] ┆ 2023-01-01   ┆ [0.029205,       ┆ Customer ID: 101 │
│             ┆ product line!     ┆           ┆ 14:30:00     ┆ -0.036287, …     ┆ provided feedba… │
│             ┆                   ┆           ┆              ┆ 0.000765…     

In [6]:
# Show every generated prompt in full, one per line; the table printout of
# df_prompted truncates the "customer_prompt" column.
for customer_prompt in df_prompted["customer_prompt"].to_list():
    print(customer_prompt)

Customer ID: 101 provided feedback at 14 with ratings 4-5-5 an average rating of 4.666666666666667 with a global mean of 3.8888888888888893: 'Loved the new product line!'
Customer ID: 102 provided feedback at 9 with ratings 2-3-2 an average rating of 2.3333333333333335 with a global mean of 3.8888888888888893: 'The service was disappointing this time.'
Customer ID: 103 provided feedback at 18 with ratings 5-4-5 an average rating of 4.666666666666667 with a global mean of 3.8888888888888893: 'Great experience with customer support.'
