In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

tables = {
    "OE_Activity": "../data/database_backups_csv/OE/OE_Activity.csv",
    "OE_Locations": "../data/database_backups_csv/OE/OE_Locations.csv",
    "OE_Products": "../data/database_backups_csv/OE/OE_Products.csv",
}

column_names = {
    "OE_Activity": ["ActivityCode","UserID","WorkCode","AssignmentID","ProductID","Quantity","Timestamp","LocationID"],
    "OE_Locations": ["LocationID","Aisle","Bay","Level","Slot"],
    "OE_Products": ["ProductID","ProductCode","UnitOfMeasure","Weight","Cube","Length","Width","Height"],
}

dfs = {}
for name, fp in tables.items():
    dfs[name] = pd.read_csv(fp, header=None, names=column_names[name])

In [3]:
for t in ["OE_Activity", "OE_Locations", "OE_Products"]:
    print("=" * 80)
    print(f"Table: {t}")

    df = dfs[t]
    print(f"Dimensions: ({df.shape[0]} rows, {df.shape[1]} columns)\n")

    display(df.head(3))

    # Columns and data types + quick summary stats
    schema_df = pd.DataFrame({
        "dtype": df.dtypes.astype(str),
        "n_missing": df.isna().sum(),
        "n_unique": df.nunique(dropna=True),
    })

    # Numeric summaries
    num_df = df.select_dtypes(include="number")
    schema_df["min"] = num_df.min()
    schema_df["max"] = num_df.max()
    schema_df["mean"] = num_df.mean()

    display(schema_df)
    print("\n")

Table: OE_Activity
Dimensions: (96132 rows, 8 columns)



Unnamed: 0,ActivityCode,UserID,WorkCode,AssignmentID,ProductID,Quantity,Timestamp,LocationID
0,PickPut,419,20,7954566,4289.0,1.0,2025-11-10 11:37:14.160,826367.0
1,AssignmentOpen,64,10,7954429,,,2025-11-10 11:38:34.043,
2,PickPut,419,20,7954541,6592.0,1.0,2025-11-10 11:39:42.330,14524.0


Unnamed: 0,dtype,n_missing,n_unique,min,max,mean
ActivityCode,object,0,2,,,
UserID,int64,0,40,64.0,504.0,405.0262
WorkCode,int64,0,3,10.0,30.0,26.42034
AssignmentID,int64,0,42241,7717782.0,8042473.0,7924902.0
ProductID,float64,894,7691,1.0,57791.0,24777.78
Quantity,float64,894,144,1.0,1143.0,8.453474
Timestamp,object,0,95804,,,
LocationID,float64,894,7669,1.0,8104198.0,893224.7




Table: OE_Locations
Dimensions: (33519 rows, 5 columns)



Unnamed: 0,LocationID,Aisle,Bay,Level,Slot
0,1,10,24.0,1.0,1.0
1,2,40,5.0,4.0,4.0
2,3,40,9.0,1.0,2.0


Unnamed: 0,dtype,n_missing,n_unique,min,max,mean
LocationID,int64,0,33519,1.0,8034868.0,795274.7684
Aisle,object,0,52,,,
Bay,float64,1,86,1.0,99.0,23.107107
Level,float64,1,12,1.0,50.0,3.687153
Slot,float64,1,35,1.0,35.0,3.019243




Table: OE_Products
Dimensions: (57671 rows, 8 columns)



Unnamed: 0,ProductID,ProductCode,UnitOfMeasure,Weight,Cube,Length,Width,Height
0,1,07062B2324X,CA,35.0,0.938,,,
1,2,0204800418,BX,2.7338,0.263,,,
2,3,07062B1322Q,EA,0.65,0.027,,,


Unnamed: 0,dtype,n_missing,n_unique,min,max,mean
ProductID,int64,0,57671,1.0,57671.0,28836.0
ProductCode,object,0,41981,,,
UnitOfMeasure,object,0,55,,,
Weight,float64,0,5572,0.0,7584.0,4.834848
Cube,float64,0,4009,0.0,421.296,0.623222
Length,float64,57671,0,,,
Width,float64,57671,0,,,
Height,float64,57671,0,,,






In [4]:
# Load and process distance matrix
path = "../data/distance_matrices/distance_matrix_OE.csv"
Distance = pd.read_csv(path, index_col=0)

for c in Distance.columns:
    Distance[c] = pd.to_numeric(Distance[c], errors="coerce")

display(Distance.head())

Unnamed: 0,08|03|||,08|05|||,08|07|||,08|09|||,10|04|||,10|06|||,10|08|||,10|10|||,10|12|||,10|14|||,...,|Start L3,|Start L4,|Start L5,|Start L6,|Start R2,|Start R3,|Start R4,|Start R5,|Start R6,|Start SB
08|03|||,0,414,389,364,304,287,271,255,240,223,...,1094,1068,897,953,1080,1045,1089,913,969,994
08|05|||,25,0,414,389,329,312,296,280,265,248,...,1119,1093,922,978,1104,1070,1113,938,994,1018
08|07|||,50,25,0,414,354,337,321,305,290,273,...,1144,1118,947,1003,1129,1095,1138,963,1019,1043
08|09|||,75,50,25,0,379,362,346,330,315,298,...,1169,1143,972,1028,1154,1119,1163,988,1044,1068
10|04|||,484,459,434,410,0,19,35,51,67,81,...,1120,1074,958,838,929,1133,1087,971,854,767


In [5]:
OE_Activity = dfs["OE_Activity"]
OE_Activity["ProductID"] = pd.to_numeric(OE_Activity["ProductID"], errors="coerce").astype("Int64")
OE_Activity["Quantity"]  = pd.to_numeric(OE_Activity["Quantity"],  errors="coerce").astype("Int64")
OE_Activity["LocationID"] = pd.to_numeric(OE_Activity["LocationID"], errors="coerce").astype("Int64")
OE_Activity["Timestamp"] = pd.to_datetime(OE_Activity["Timestamp"], errors="coerce")
OE_Activity = OE_Activity.dropna(subset=["Timestamp"]).copy()

# OE_Locations's Bay, level, slot all to int
OE_Locations = dfs["OE_Locations"]# Convert Bay, Level, Slot to integers (nullable Int64)
OE_Locations["Bay"]   = pd.to_numeric(OE_Locations["Bay"], errors="coerce").astype("Int64")
OE_Locations["Level"] = pd.to_numeric(OE_Locations["Level"], errors="coerce").astype("Int64")
OE_Locations["Slot"]  = pd.to_numeric(OE_Locations["Slot"], errors="coerce").astype("Int64")

OE_Products = dfs["OE_Products"][["ProductID", "ProductCode", "UnitOfMeasure", "Weight", "Cube"]]

In [6]:
# Reshape Distance matrix to long format for easier merging
dist_long = (
    Distance.stack(dropna=False)
    .rename("distance")
    .reset_index()
    .rename(columns={"level_0": "FromLoc", "level_1": "ToLoc"}))
display(dist_long.head())

  Distance.stack(dropna=False)


Unnamed: 0,FromLoc,ToLoc,distance
0,08|03|||,08|03|||,0
1,08|03|||,08|05|||,414
2,08|03|||,08|07|||,389
3,08|03|||,08|09|||,364
4,08|03|||,10|04|||,304


In [7]:
oe = OE_Activity.copy()

# Drop AssignmentOpen rows
oe = oe[oe["ActivityCode"] != "AssignmentOpen"].copy()

# Sort by UserID then Timestamp
oe = oe.sort_values(["UserID", "Timestamp"]).reset_index(drop=True)

# Previous timestamp / previous location within each user
g = oe.groupby("UserID", sort=False)

oe["Prev_Timestamp"]  = g["Timestamp"].shift(1)
oe["Prev_LocationID"] = g["LocationID"].shift(1)

# Time delta (sec) to previous timestamp
oe["Time_Delta_sec"] = (oe["Timestamp"] - oe["Prev_Timestamp"]).dt.total_seconds()

# If delta > 30 minutes, set to null
oe.loc[oe["Time_Delta_sec"] > 30 * 60, "Time_Delta_sec"] = np.nan

OE_Activity_prepped = oe.copy()

In [8]:
# Join with OE_Products on ProductID
OE_joined = OE_Activity_prepped.merge(OE_Products, on="ProductID", how="left")
# Join with OE_Locations on LocationID
OE_joined = OE_joined.merge(OE_Locations, on="LocationID", how="left")
# Add previous location details (Prev_Aisle, Prev_Bay, Prev_Level, Prev_Slot)
OE_joined = OE_joined.merge(
    OE_Locations[["LocationID", "Aisle", "Bay", "Level", "Slot"]].rename(columns={
        "LocationID": "Prev_LocationID",
        "Aisle": "Prev_Aisle",
        "Bay": "Prev_Bay",
        "Level": "Prev_Level",
        "Slot": "Prev_Slot",
    }),
    on="Prev_LocationID",
    how="left"
)

OE_joined.head()

Unnamed: 0,ActivityCode,UserID,WorkCode,AssignmentID,ProductID,Quantity,Timestamp,LocationID,Prev_Timestamp,Prev_LocationID,...,Weight,Cube,Aisle,Bay,Level,Slot,Prev_Aisle,Prev_Bay,Prev_Level,Prev_Slot
0,PickPut,64,20,7955278,3250,1,2025-11-10 12:23:59.540,34343,NaT,,...,0.6,0.234,42,12,6,7,,,,
1,PickPut,64,20,7955281,43178,1,2025-11-10 12:26:23.993,14453,2025-11-10 12:23:59.540,34343.0,...,0.7,0.13,34,26,6,1,42.0,12.0,6.0,7.0
2,PickPut,64,20,7955279,5842,1,2025-11-10 12:28:31.110,23294,2025-11-10 12:26:23.993,14453.0,...,5.0,0.843,42,5,2,1,34.0,26.0,6.0,1.0
3,PickPut,64,20,7955276,5842,1,2025-11-10 12:28:31.313,23294,2025-11-10 12:28:31.110,23294.0,...,5.0,0.843,42,5,2,1,42.0,5.0,2.0,1.0
4,PickPut,64,20,7955277,8344,1,2025-11-10 12:36:27.187,818,2025-11-10 12:28:31.313,23294.0,...,10.55,1.113,41,6,3,4,42.0,5.0,2.0,1.0


In [9]:
SUFFIX = "|||"
OE_detailed = OE_joined.copy()
OE_detailed["Aisle2"] = pd.to_numeric(OE_detailed["Aisle"], errors="coerce").astype("Int64").astype(str).str.zfill(2)
OE_detailed["Bay2"]   = pd.to_numeric(OE_detailed["Bay"],   errors="coerce").astype("Int64").astype(str).str.zfill(2)
OE_detailed["Prev_Aisle2"] = pd.to_numeric(OE_detailed["Prev_Aisle"], errors="coerce").astype("Int64").astype(str).str.zfill(2)
OE_detailed["Prev_Bay2"]   = pd.to_numeric(OE_detailed["Prev_Bay"],   errors="coerce").astype("Int64").astype(str).str.zfill(2)
OE_detailed["LocKey"]     = OE_detailed["Aisle2"] + "|" + OE_detailed["Bay2"] + SUFFIX
OE_detailed["PrevLocKey"] = OE_detailed["Prev_Aisle2"] + "|" + OE_detailed["Prev_Bay2"] + SUFFIX

# Join distance: prev -> curr (Distance.loc[curr, prev])
OE_detailed = OE_detailed.merge(
    dist_long,
    left_on=["LocKey", "PrevLocKey"],
    right_on=["FromLoc", "ToLoc"],
    how="left"
).rename(columns={"distance": "Distance"}).drop(columns=["FromLoc", "ToLoc"])

OE_detailed.head()

Unnamed: 0,ActivityCode,UserID,WorkCode,AssignmentID,ProductID,Quantity,Timestamp,LocationID,Prev_Timestamp,Prev_LocationID,...,Prev_Bay,Prev_Level,Prev_Slot,Aisle2,Bay2,Prev_Aisle2,Prev_Bay2,LocKey,PrevLocKey,Distance
0,PickPut,64,20,7955278,3250,1,2025-11-10 12:23:59.540,34343,NaT,,...,,,,42,12,,,42|12|||,<NA>|<NA>|||,
1,PickPut,64,20,7955281,43178,1,2025-11-10 12:26:23.993,14453,2025-11-10 12:23:59.540,34343.0,...,12.0,6.0,7.0,34,26,42.0,12.0,34|26|||,42|12|||,220.0
2,PickPut,64,20,7955279,5842,1,2025-11-10 12:28:31.110,23294,2025-11-10 12:26:23.993,14453.0,...,26.0,6.0,1.0,42,5,34.0,26.0,42|05|||,34|26|||,514.0
3,PickPut,64,20,7955276,5842,1,2025-11-10 12:28:31.313,23294,2025-11-10 12:28:31.110,23294.0,...,5.0,2.0,1.0,42,5,42.0,5.0,42|05|||,42|05|||,0.0
4,PickPut,64,20,7955277,8344,1,2025-11-10 12:36:27.187,818,2025-11-10 12:28:31.313,23294.0,...,5.0,2.0,1.0,41,6,42.0,5.0,41|06|||,42|05|||,77.0


In [11]:
from pathlib import Path
OE_detailed.to_parquet(Path("../data/processed/oe_detailed.parquet"), index=False)
OE_Activity_prepped.to_parquet(Path("../data/processed/oe_activity_prepped.parquet"), index=False)
OE_joined.to_parquet(Path("../data/processed/oe_joined.parquet"), index=False)

In [13]:
df = OE_Activity_prepped.copy()

# Keep only rows with valid product and time delta
df = df.dropna(subset=["ProductID", "Time_Delta_sec"]).copy()

# Identify where the same product occurs consecutively for the same user
df["Prev_ProductID"] = df.groupby("UserID")["ProductID"].shift(1)

same_product = df["ProductID"] == df["Prev_ProductID"]
df_pairs = df.loc[same_product].copy()

In [14]:
product_pick_times = (
    df_pairs.groupby("ProductID")
            .agg(
                n_pairs=("Time_Delta_sec", "size"),      # how many consecutive occurrences
                avg_pick_time_sec=("Time_Delta_sec", "mean"),
                median_pick_time_sec=("Time_Delta_sec", "median"),
                std_pick_time_sec=("Time_Delta_sec", "std")
            )
            .reset_index()
            .sort_values("ProductID")
)
display(product_pick_times.head(5))

Unnamed: 0,ProductID,n_pairs,avg_pick_time_sec,median_pick_time_sec,std_pick_time_sec
0,1,290,3.763331,0.2835,34.350807
1,3,95,15.635168,9.057,16.567655
2,4,37,17.089459,17.62,12.933894
3,7,5,15.0232,2.326,26.563315
4,8,2,21.3885,21.3885,29.337153


In [15]:
# Save product_pick_times to CSV for later analysis
product_pick_times.to_csv("../data/processed/product_pick_times.csv", index=False)