<a href="https://colab.research.google.com/github/LAworkspace/retail-recommender-MLOPS/blob/main/label_interested.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import polars as pl

events = pl.read_parquet("enriched_events_with_conversion.parquet")
events.head()


timestamp,visitorid,event,itemid,transactionid,hour_of_day,day_of_week,is_new_session,session_number,sessionid,views_last_24h,cart_conversion_rate,purchase_conversion_rate
datetime[ms],i64,str,i64,str,i8,i8,bool,i32,str,u32,f64,f64
2015-09-11 20:49:49.439,0,"""view""",285930,,20,5,True,1,"""0_1""",0,0.0,0.0
2015-09-11 20:52:39.591,0,"""view""",357564,,20,5,False,1,"""0_1""",0,0.0,0.0
2015-09-11 20:55:17.175,0,"""view""",67045,,20,5,False,1,"""0_1""",0,0.0,0.0
2015-08-13 17:46:06.444,1,"""view""",72028,,17,4,True,1,"""1_1""",0,0.0,0.0
2015-08-07 17:51:44.567,2,"""view""",325215,,17,5,True,1,"""2_1""",0,0.0,0.0


In [3]:
!pip install polars




In [4]:
import polars as pl

# Step 1: Create LazyFrame
events = events.lazy()

# Step 2: Mark (visitorid, itemid) pairs that had a cart or transaction
interested_pairs = events.filter(
    pl.col("event").is_in(["cart", "transaction"])
).select([
    pl.col("visitorid"),
    pl.col("itemid")
]).unique()

# Step 3: Join back to original data and mark "interested" as target
events = events.with_columns([
    pl.col("event").alias("original_event")  # backup original event column
])

events = events.join(
    interested_pairs,
    on=["visitorid", "itemid"],
    how="left"
).with_columns([
    (pl.col("itemid_right").is_not_null()).alias("interested")
]).drop("itemid_right")


In [6]:
import polars as pl

# Load enriched parquet file
events = pl.read_parquet("/content/enriched_events_with_conversion.parquet")

# Ensure timestamp is properly typed
events = events.with_columns([
    pl.col("timestamp").cast(pl.Datetime)
])


In [10]:
agg = events.group_by(["visitorid", "sessionid", "itemid"]).agg([
    pl.len().alias("total_events"),
    (pl.col("event") == "view").cast(pl.Int8).sum().alias("view_count"),
    (pl.col("event") == "cart").cast(pl.Int8).sum().alias("added_to_cart"),
    (pl.col("timestamp").max() - pl.col("timestamp").min()).dt.total_seconds().alias("dwell_time_secs")
])


In [11]:
agg = agg.with_columns([
    ((pl.col("view_count") >= 2) |
     (pl.col("added_to_cart") > 0) |
     (pl.col("dwell_time_secs") > 20)).cast(pl.Int8).alias("label_interested")
])


In [18]:
events = events.join(
    agg.select(["visitorid", "sessionid", "itemid", "label_interested"]),
    on=["visitorid", "sessionid", "itemid"],
    how="left"
)


DuplicateError: column with name 'label_interested_right' already exists

You may want to try:
- renaming the column prior to joining
- using the `suffix` parameter to specify a suffix different to the default one ('_right')

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'sink' <---
DF ["visitorid", "sessionid", "itemid", "label_interested"]; PROJECT */4 COLUMNS

In [13]:
print(events.schema)


Schema([('timestamp', Datetime(time_unit='us', time_zone=None)), ('visitorid', Int64), ('event', String), ('itemid', Int64), ('transactionid', String), ('hour_of_day', Int8), ('day_of_week', Int8), ('is_new_session', Boolean), ('session_number', Int32), ('sessionid', String), ('views_last_24h', UInt32), ('cart_conversion_rate', Float64), ('purchase_conversion_rate', Float64), ('label_interested', Int8)])


In [14]:
events.filter(pl.col("label_interested").is_not_null()).head(10)


timestamp,visitorid,event,itemid,transactionid,hour_of_day,day_of_week,is_new_session,session_number,sessionid,views_last_24h,cart_conversion_rate,purchase_conversion_rate,label_interested
datetime[μs],i64,str,i64,str,i8,i8,bool,i32,str,u32,f64,f64,i8
2015-09-11 20:49:49.439,0,"""view""",285930,,20,5,True,1,"""0_1""",0,0.0,0.0,0
2015-09-11 20:52:39.591,0,"""view""",357564,,20,5,False,1,"""0_1""",0,0.0,0.0,0
2015-09-11 20:55:17.175,0,"""view""",67045,,20,5,False,1,"""0_1""",0,0.0,0.0,0
2015-08-13 17:46:06.444,1,"""view""",72028,,17,4,True,1,"""1_1""",0,0.0,0.0,0
2015-08-07 17:51:44.567,2,"""view""",325215,,17,5,True,1,"""2_1""",0,0.0,0.0,1
2015-08-07 17:53:33.790,2,"""view""",325215,,17,5,False,1,"""2_1""",0,0.0,0.0,1
2015-08-07 17:56:52.664,2,"""view""",259884,,17,5,False,1,"""2_1""",0,0.0,0.0,0
2015-08-07 18:01:08.920,2,"""view""",216305,,18,5,False,1,"""2_1""",0,0.0,0.0,1
2015-08-07 18:08:25.669,2,"""view""",342816,,18,5,False,1,"""2_1""",0,0.0,0.0,1
2015-08-07 18:17:24.375,2,"""view""",342816,,18,5,False,1,"""2_1""",0,0.0,0.0,1


In [15]:
# Count how many rows have label_interested = 1
events.filter(pl.col("label_interested") == 1).shape


(667775, 14)

In [16]:
events.select([
    pl.col("label_interested").value_counts()
])


label_interested
struct[2]
"{0,2088326}"
"{1,667775}"
