# GenAI Session Analyzer - Data Exploration

Interactive notebook to explore the simulated generation data using Polars.

In [1]:
import polars as pl
import duckdb

# Connect to DuckDB
conn = duckdb.connect('../data/warehouse.duckdb', read_only=True)
print("Connected to warehouse.duckdb")
print("\nTables available:")
print(conn.execute("SHOW TABLES").fetchdf())

Connected to warehouse.duckdb

Tables available:
              name
0  raw_generations
1      raw_prompts
2        raw_users


## Load Tables into Polars DataFrames

In [2]:
# Load all tables into Polars
users = pl.from_pandas(conn.execute("SELECT * FROM raw_users").fetchdf())
prompts = pl.from_pandas(conn.execute("SELECT * FROM raw_prompts").fetchdf())
generations = pl.from_pandas(conn.execute("SELECT * FROM raw_generations").fetchdf())

print(f"Users: {users.shape[0]:,} rows")
print(f"Prompts: {prompts.shape[0]:,} rows")
print(f"Generations: {generations.shape[0]:,} rows")

Users: 500 rows
Prompts: 10,000 rows
Generations: 10,000 rows


## Explore Users

In [6]:
users.head(10)

user_id,user_tier,signup_date,cohort_week,region,device_type
i32,str,datetime[μs],str,str,str
0,"""free""",2025-12-22 02:24:48.784808,"""2025-W51""","""us-west""","""desktop"""
1,"""free""",2025-10-27 19:16:29.632612,"""2025-W43""","""europe""","""desktop"""
2,"""pro""",2025-11-19 07:18:53.979638,"""2025-W46""","""us-west""","""desktop"""
3,"""free""",2025-11-14 15:23:12.699845,"""2025-W45""","""us-west""","""desktop"""
4,"""free""",2025-12-30 20:01:46.161339,"""2025-W52""","""us-west""","""desktop"""
5,"""free""",2025-12-25 10:55:21.214200,"""2025-W51""","""us-west""","""desktop"""
6,"""pro""",2026-01-13 20:21:34.318473,"""2026-W02""","""us-west""","""mobile"""
7,"""free""",2025-11-02 09:02:22.362526,"""2025-W43""","""us-east""","""desktop"""
8,"""enterprise""",2025-12-02 12:36:10.069873,"""2025-W48""","""us-east""","""desktop"""
9,"""free""",2025-10-28 05:36:49.178350,"""2025-W43""","""asia""","""desktop"""


In [7]:
# User tier distribution
users.group_by("user_tier").agg(
    pl.count().alias("count"),
    (pl.count() * 100 / users.shape[0]).round(1).alias("pct")
).sort("count", descending=True)

(Deprecated in version 0.20.5)
  pl.count().alias("count"),
(Deprecated in version 0.20.5)
  (pl.count() * 100 / users.shape[0]).round(1).alias("pct")


user_tier,count,pct
str,u32,f64
"""free""",342,68.4
"""pro""",130,26.0
"""enterprise""",28,5.6


## Explore Prompts

In [8]:
prompts.head(10)

prompt_id,prompt,generation_seed,step,cfg,sampler,width,height
i64,str,u32,u16,f32,u8,u16,u16
883764,"""candy park for a game candy th…",1972271414,50,7.0,8,704,512
483842,"""A girl in Wal-Mart, CCTV foota…",1720961023,50,7.0,8,512,512
1647175,"""skateboarder character eric sp…",592351992,50,7.0,8,704,448
967054,"""street vendor in osaka, by ale…",2537382346,50,7.0,8,512,512
1815056,"""the gigachad visiting the Eiff…",3331160150,50,7.0,8,512,512
755941,"""a cardinal communing with a be…",307942235,150,7.0,8,512,512
451904,"""detailed realistic beautiful s…",2154020414,150,7.0,8,512,704
1067270,"""“ Neon hero by concept art, sc…",1111999,50,7.0,8,512,512
948685,"""A striking Pre-Raphaelite witc…",2751726335,20,7.0,8,512,512
617995,"""Ellie from the Last of Us hold…",825439614,50,7.0,8,512,512


In [9]:
# Prompt length stats
prompts.select(
    pl.col("prompt").str.len_chars().alias("prompt_length")
).describe()

statistic,prompt_length
str,f64
"""count""",10000.0
"""null_count""",0.0
"""mean""",161.5787
"""std""",110.970142
"""min""",0.0
"""25%""",68.0
"""50%""",141.0
"""75%""",241.0
"""max""",1338.0


## Explore Generations

In [10]:
generations.head(10)

generation_id,prompt_id,user_id,session_id,timestamp,session_date,latency_ms,status,cost_credits,retry_count,feedback,downloaded,model_version,token_count
i32,i32,i32,f64,datetime[μs],datetime[μs],i32,str,f64,i32,str,bool,str,i32
6233,826170,8,80001.0,2025-12-01 01:53:12,2025-12-01 00:00:00,739,"""success""",0.3176,0,,True,"""v2.0""",36
9343,562871,8,80002.0,2025-12-01 11:49:27,2025-12-01 00:00:00,1899,"""success""",0.436,1,,True,"""v2.1""",45
869,1630512,8,80003.0,2025-12-01 15:29:09,2025-12-01 00:00:00,200,"""success""",0.048,0,,False,"""v2.1""",5
2443,1192145,8,80004.0,2025-12-01 19:49:37,2025-12-01 00:00:00,443,"""success""",0.0657,0,,False,"""v1.5""",6
7315,1232484,8,80005.0,2025-12-02 12:19:14,2025-12-02 00:00:00,3546,"""success""",0.6698,0,"""thumbs_up""",True,"""v1.5""",66
9096,1660106,8,80006.0,2025-12-02 20:17:18,2025-12-02 00:00:00,445,"""success""",0.1138,0,,True,"""v1.5""",12
5339,1876187,8,80007.0,2025-12-03 03:36:43,2025-12-03 00:00:00,1003,"""success""",0.2001,0,,True,"""v2.1""",20
2477,401305,8,80008.0,2025-12-03 16:31:26,2025-12-03 00:00:00,2394,"""success""",0.4798,1,"""thumbs_up""",True,"""v1.5""",48
5357,1146790,8,80009.0,2025-12-03 18:40:52,2025-12-03 00:00:00,1357,"""success""",0.2703,0,,True,"""v2.1""",27
417,1858424,8,80010.0,2025-12-03 22:43:15,2025-12-03 00:00:00,579,"""success""",0.2072,0,,False,"""v2.0""",23


In [11]:
# Schema overview
generations.schema

Schema([('generation_id', Int32),
        ('prompt_id', Int32),
        ('user_id', Int32),
        ('session_id', Float64),
        ('timestamp', Datetime(time_unit='us', time_zone=None)),
        ('session_date', Datetime(time_unit='us', time_zone=None)),
        ('latency_ms', Int32),
        ('status', String),
        ('cost_credits', Float64),
        ('retry_count', Int32),
        ('feedback', String),
        ('downloaded', Boolean),
        ('model_version', String),
        ('token_count', Int32)])

In [12]:
# Status distribution
generations.group_by("status").agg(
    pl.count().alias("count"),
    (pl.count() * 100 / generations.shape[0]).round(1).alias("pct")
).sort("count", descending=True)

(Deprecated in version 0.20.5)
  pl.count().alias("count"),
(Deprecated in version 0.20.5)
  (pl.count() * 100 / generations.shape[0]).round(1).alias("pct")


status,count,pct
str,u32,f64
"""success""",9412,94.1
"""rate_limited""",269,2.7
"""model_error""",184,1.8
"""safety_violation""",131,1.3
"""timeout""",4,0.0


In [13]:
# Feedback distribution
generations.group_by("feedback").agg(
    pl.count().alias("count")
).sort("count", descending=True)

(Deprecated in version 0.20.5)
  pl.count().alias("count")


feedback,count
str,u32
,8317
"""thumbs_up""",1293
"""thumbs_down""",390


In [14]:
# Download rate by status
generations.group_by("status").agg(
    pl.count().alias("total"),
    pl.col("downloaded").sum().alias("downloads"),
    (pl.col("downloaded").mean() * 100).round(1).alias("download_rate_pct")
).sort("total", descending=True)

(Deprecated in version 0.20.5)
  pl.count().alias("total"),


status,total,downloads,download_rate_pct
str,u32,u32,f64
"""success""",9412,5896,62.6
"""rate_limited""",269,0,0.0
"""model_error""",184,0,0.0
"""safety_violation""",131,0,0.0
"""timeout""",4,0,0.0


## Joined Analysis

In [15]:
# Join generations with users
gen_with_users = generations.join(users, on="user_id", how="left")
gen_with_users.head(5)

generation_id,prompt_id,user_id,session_id,timestamp,session_date,latency_ms,status,cost_credits,retry_count,feedback,downloaded,model_version,token_count,user_tier,signup_date,cohort_week,region,device_type
i32,i32,i32,f64,datetime[μs],datetime[μs],i32,str,f64,i32,str,bool,str,i32,str,datetime[μs],str,str,str
6233,826170,8,80001.0,2025-12-01 01:53:12,2025-12-01 00:00:00,739,"""success""",0.3176,0,,True,"""v2.0""",36,"""enterprise""",2025-12-02 12:36:10.069873,"""2025-W48""","""us-east""","""desktop"""
9343,562871,8,80002.0,2025-12-01 11:49:27,2025-12-01 00:00:00,1899,"""success""",0.436,1,,True,"""v2.1""",45,"""enterprise""",2025-12-02 12:36:10.069873,"""2025-W48""","""us-east""","""desktop"""
869,1630512,8,80003.0,2025-12-01 15:29:09,2025-12-01 00:00:00,200,"""success""",0.048,0,,False,"""v2.1""",5,"""enterprise""",2025-12-02 12:36:10.069873,"""2025-W48""","""us-east""","""desktop"""
2443,1192145,8,80004.0,2025-12-01 19:49:37,2025-12-01 00:00:00,443,"""success""",0.0657,0,,False,"""v1.5""",6,"""enterprise""",2025-12-02 12:36:10.069873,"""2025-W48""","""us-east""","""desktop"""
7315,1232484,8,80005.0,2025-12-02 12:19:14,2025-12-02 00:00:00,3546,"""success""",0.6698,0,"""thumbs_up""",True,"""v1.5""",66,"""enterprise""",2025-12-02 12:36:10.069873,"""2025-W48""","""us-east""","""desktop"""


In [16]:
# Metrics by user tier
gen_with_users.group_by("user_tier").agg(
    pl.count().alias("generations"),
    pl.col("cost_credits").sum().round(2).alias("total_cost"),
    pl.col("latency_ms").mean().round(0).alias("avg_latency"),
    (pl.col("status").eq("success").mean() * 100).round(1).alias("success_rate_pct"),
    (pl.col("feedback").is_not_null().mean() * 100).round(1).alias("feedback_rate_pct"),
    (pl.col("downloaded").mean() * 100).round(1).alias("download_rate_pct")
).sort("generations", descending=True)

(Deprecated in version 0.20.5)
  pl.count().alias("generations"),


user_tier,generations,total_cost,avg_latency,success_rate_pct,feedback_rate_pct,download_rate_pct
str,u32,f64,f64,f64,f64,f64
"""pro""",4100,1107.69,1310.0,96.8,18.7,64.6
"""free""",3579,996.84,1200.0,89.6,10.6,39.0
"""enterprise""",2321,551.34,1276.0,96.4,23.3,79.7


In [17]:
# Daily trends
daily = generations.group_by("session_date").agg(
    pl.count().alias("generations"),
    pl.col("cost_credits").sum().round(2).alias("daily_cost"),
    (pl.col("status").eq("success").mean() * 100).round(1).alias("success_rate"),
    pl.col("downloaded").sum().alias("downloads")
).sort("session_date")

daily.head(15)

(Deprecated in version 0.20.5)
  pl.count().alias("generations"),


session_date,generations,daily_cost,success_rate,downloads
datetime[μs],u32,f64,f64,u32
2025-12-01 00:00:00,337,91.61,93.8,178
2025-12-02 00:00:00,300,79.35,98.7,192
2025-12-03 00:00:00,314,78.86,92.7,193
2025-12-04 00:00:00,323,85.56,93.8,199
2025-12-05 00:00:00,324,77.69,88.6,175
…,…,…,…,…
2025-12-11 00:00:00,322,84.2,92.5,182
2025-12-12 00:00:00,318,84.09,93.4,197
2025-12-13 00:00:00,355,94.84,94.4,213
2025-12-14 00:00:00,292,76.23,95.9,171


In [18]:
# Feedback sentiment by user tier
gen_with_users.filter(pl.col("feedback").is_not_null()).group_by(
    "user_tier", "feedback"
).agg(
    pl.count().alias("count")
).sort("user_tier", "feedback")

(Deprecated in version 0.20.5)
  pl.count().alias("count")


user_tier,feedback,count
str,str,u32
"""enterprise""","""thumbs_down""",130
"""enterprise""","""thumbs_up""",410
"""free""","""thumbs_down""",104
"""free""","""thumbs_up""",274
"""pro""","""thumbs_down""",156
"""pro""","""thumbs_up""",609


## Custom Queries

Run your own DuckDB SQL and load into Polars:

In [19]:
# Example: Top 10 users by spend
query = """
SELECT 
    u.user_id,
    u.user_tier,
    COUNT(*) as generations,
    ROUND(SUM(g.cost_credits), 2) as total_cost,
    SUM(CASE WHEN g.downloaded THEN 1 ELSE 0 END) as downloads
FROM raw_generations g
JOIN raw_users u ON g.user_id = u.user_id
GROUP BY u.user_id, u.user_tier
ORDER BY total_cost DESC
LIMIT 10
"""

pl.from_pandas(conn.execute(query).fetchdf())

user_id,user_tier,generations,total_cost,downloads
i32,str,i64,f64,f64
11,"""enterprise""",107,27.1,91.0
224,"""enterprise""",97,24.94,76.0
490,"""enterprise""",91,24.51,75.0
449,"""enterprise""",95,22.77,77.0
498,"""enterprise""",95,22.63,72.0
285,"""enterprise""",92,21.68,71.0
8,"""enterprise""",85,21.54,63.0
474,"""enterprise""",88,21.38,70.0
182,"""enterprise""",82,21.37,66.0
369,"""enterprise""",76,21.09,53.0


In [20]:
# Your query here
query = """
SELECT * FROM raw_generations LIMIT 5
"""

pl.from_pandas(conn.execute(query).fetchdf())

generation_id,prompt_id,user_id,session_id,timestamp,session_date,latency_ms,status,cost_credits,retry_count,feedback,downloaded,model_version,token_count
i32,i32,i32,f64,datetime[μs],datetime[μs],i32,str,f64,i32,str,bool,str,i32
6233,826170,8,80001.0,2025-12-01 01:53:12,2025-12-01 00:00:00,739,"""success""",0.3176,0,,True,"""v2.0""",36
9343,562871,8,80002.0,2025-12-01 11:49:27,2025-12-01 00:00:00,1899,"""success""",0.436,1,,True,"""v2.1""",45
869,1630512,8,80003.0,2025-12-01 15:29:09,2025-12-01 00:00:00,200,"""success""",0.048,0,,False,"""v2.1""",5
2443,1192145,8,80004.0,2025-12-01 19:49:37,2025-12-01 00:00:00,443,"""success""",0.0657,0,,False,"""v1.5""",6
7315,1232484,8,80005.0,2025-12-02 12:19:14,2025-12-02 00:00:00,3546,"""success""",0.6698,0,"""thumbs_up""",True,"""v1.5""",66


In [None]:
# IMPORTANT: Close connection before running Dagster pipeline
conn.close()
print("Connection closed - Dagster can now write to the database")