In [12]:
from time import time
import polars as pl
import pandas as pd
import numpy as np
import pyarrow

import matplotlib.pyplot as plt
import seaborn as sns

df = pl.read_csv(r"./datasets/2019-Nov-1M.csv")

In [13]:
%%time

print(f"Dataset Shape: {df.shape}")
print(f"Dataset rows: {df.height}")
print(f"Dataset columns: {df.width}")

print(f"\nColumn Types\n-----")

for idx, col in enumerate(df.columns):
   print(f"{col}\tType: {df.dtypes[idx]}")

Dataset Shape: (1000000, 9)
Dataset rows: 1000000
Dataset columns: 9

Column Types
-----
event_time	Type: Utf8
event_type	Type: Utf8
product_id	Type: Int64
category_id	Type: Int64
category_code	Type: Utf8
brand	Type: Utf8
price	Type: Float64
user_id	Type: Int64
user_session	Type: Utf8
CPU times: total: 0 ns
Wall time: 0 ns


In [14]:
%%time

# Data Inspection

df.head(5)

CPU times: total: 0 ns
Wall time: 0 ns


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-11-01 00:…","""view""",1003461,2053013555631882655,"""electronics.sm…","""xiaomi""",489.07,520088904,"""4d3b30da-a5e4-…"
"""2019-11-01 00:…","""view""",5000088,2053013566100866035,"""appliances.sew…","""janome""",293.65,530496790,"""8e5f4f83-366c-…"
"""2019-11-01 00:…","""view""",17302664,2053013553853497655,,"""creed""",28.31,561587266,"""755422e7-9040-…"
"""2019-11-01 00:…","""view""",3601530,2053013563810775923,"""appliances.kit…","""lg""",712.87,518085591,"""3bfb58cd-7892-…"
"""2019-11-01 00:…","""view""",1004775,2053013555631882655,"""electronics.sm…","""xiaomi""",183.27,558856683,"""313628f1-68b8-…"


In [16]:
df = df.with_columns(
    (pl.col("event_time").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S %Z"))
)

df.head(15).select("event_time")

SchemaError: invalid series dtype: expected `Utf8`, got `datetime[μs]`

In [29]:
%%time

#Get a random element from a user_id column

random_user = df.select("user_id").sample(1).item()

CPU times: total: 0 ns
Wall time: 6.04 ms


In [31]:
%%time

# get some random sample using a random user_id

df.filter(pl.col("user_id") == random_user).sample(5)

CPU times: total: 0 ns
Wall time: 962 µs


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
datetime[μs],str,i64,i64,str,str,f64,i64,str
2019-11-01 06:12:57,"""view""",15100370,2053013557024391671,,,257.15,521709157,"""7cb6c860-2476-…"
2019-11-01 06:25:49,"""view""",16800177,2053013558316237377,"""furniture.kitc…",,153.16,521709157,"""7cb6c860-2476-…"
2019-11-01 06:18:04,"""view""",18700066,2053013555380224399,,"""sv""",205.9,521709157,"""7cb6c860-2476-…"
2019-11-01 06:19:22,"""view""",18700063,2053013555380224399,,,192.8,521709157,"""7cb6c860-2476-…"
2019-11-01 06:21:52,"""view""",17800204,2053013559868129947,"""computers.desk…","""zeta""",30.0,521709157,"""7cb6c860-2476-…"


In [33]:
%%time

# Same operations as the last two cells but with one inline operation

df.filter(pl.col("user_id") == df.select("user_id").sample(1).item()).sample(5)

CPU times: total: 0 ns
Wall time: 7 ms


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
datetime[μs],str,i64,i64,str,str,f64,i64,str
2019-11-01 04:41:20,"""view""",13101617,2053013553526341921,,"""skad""",254.32,517969065,"""8eec90e7-47a0-…"
2019-11-01 07:22:35,"""view""",13100906,2053013553526341921,,"""skad""",266.67,517969065,"""324250f3-1bf6-…"
2019-11-01 05:30:31,"""view""",1700954,2053013553031414015,"""computers.peri…","""samsung""",223.66,517969065,"""0f876213-0684-…"
2019-11-01 07:22:01,"""view""",13100906,2053013553526341921,,"""skad""",266.67,517969065,"""59574890-096c-…"
2019-11-01 04:54:11,"""purchase""",13100906,2053013553526341921,,"""skad""",266.67,517969065,"""8eec90e7-47a0-…"


In [35]:
%%time

df_purchases = df.filter(pl.col("event_type") == "purchase")
df_purchases.shape

CPU times: total: 0 ns
Wall time: 6 ms


(17817, 9)

In [44]:
df_purchases = df_purchases.select(['event_time', 'user_id', 'price']).unique()


# ⌛ Compute time Difference

In [45]:
import datetime

today = datetime.date.today()

anchor_date = datetime.datetime(int(today.strftime("%Y")), int(today.strftime("%m")), int(today.strftime("%d")))

anchor_date

datetime.datetime(2023, 7, 6, 0, 0)

In [91]:
%%time

# Calculating how old are all purchases using some nested commands

df_purchases.with_columns(
    (anchor_date - pl.col('event_time')).alias('purchase_recency')
).with_columns(
    (((pl.col('purchase_recency') // (1e6 * 3600 * 24))
      .cast(pl.Int16, strict=False))
     .cast(pl.Utf8) + " days")
    .alias("purchase_recency_str")
)


CPU times: total: 0 ns
Wall time: 2 ms


event_time,user_id,price,purchase_recency,purchase_recency_str
datetime[μs],i64,f64,duration[μs],str
2019-11-01 00:11:04,549256216,531.26,1342d 23h 48m 56s,"""1342 days"""
2019-11-01 00:11:15,513645631,128.42,1342d 23h 48m 45s,"""1342 days"""
2019-11-01 00:11:44,555942729,772.19,1342d 23h 48m 16s,"""1342 days"""
2019-11-01 00:33:17,542210844,1665.36,1342d 23h 26m 43s,"""1342 days"""
2019-11-01 00:34:14,557994805,458.28,1342d 23h 25m 46s,"""1342 days"""
2019-11-01 00:35:19,561564372,40.09,1342d 23h 24m 41s,"""1342 days"""
2019-11-01 00:40:50,541920131,38.35,1342d 23h 19m 10s,"""1342 days"""
2019-11-01 00:42:43,518679035,142.84,1342d 23h 17m 17s,"""1342 days"""
2019-11-01 00:44:24,554096532,275.25,1342d 23h 15m 36s,"""1342 days"""
2019-11-01 00:52:48,560134869,424.72,1342d 23h 7m 12s,"""1342 days"""
