In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
# Directory that holds the exported CSV files. It is defined once so the
# notebook can be pointed at another machine/folder by editing a single line
# instead of four separate absolute paths.
DATA_DIR = "/Users/betsyfrdmn/Desktop/Lucas Code/CSV"

# Raw inputs: the activity log plus the location and product lookup tables.
of_activity = pd.read_csv(f"{DATA_DIR}/OF_Activity.csv")
of_location = pd.read_csv(f"{DATA_DIR}/OF_Locations.csv")
of_products = pd.read_csv(f"{DATA_DIR}/OF_Products.csv")

# Wide distance matrix; the file's first (unnamed) column supplies the row labels.
distance_of = pd.read_csv(
    f"{DATA_DIR}/distance_matrix_OF.csv",
    index_col="Unnamed: 0",
)

# Reshape the matrix to long form: one row per (row label, column label, value).
# After reset_index() the row label comes first, so it is the one named
# "Prior_Location" and the column label becomes "Location".
# NOTE(review): presumably rows are the origin and columns the destination — confirm
# against how the matrix was generated.
distance_lookup = distance_of.stack().reset_index()
distance_lookup.columns = ["Prior_Location", "Location", "Distance"]

In [8]:
# Data Cleaning
# Attach location and product attributes to every activity row. Left joins keep
# all activity rows even when a LocationID / ProductID has no match in the lookup.
# merge() returns a new frame, so the raw of_activity table is left untouched.
of_df = (
    of_activity
    .merge(of_location, on="LocationID", how="left")
    .merge(of_products, on="ProductID", how="left")
)

# Parse the timestamp column to datetimes and keep the hour-of-day (0-23) alongside it.
of_df["Timestamp"] = pd.to_datetime(of_df["Timestamp"])
of_df["Hour"] = of_df["Timestamp"].dt.hour


def _normalize_code(codes: pd.Series) -> pd.Series:
    """Return a location-code column as clean text.

    Steps, applied to every value of ``codes``:
      * cast to ``str`` and strip a trailing ``.0`` (so ``5.0`` becomes ``"5"``);
      * left-pad values that are exactly one digit to two characters (``"5"`` -> ``"05"``);
        every other value is left as is;
      * turn the text ``'nan'`` (what a missing value becomes after the cast)
        back into a real null (``None``).

    Parameters
    ----------
    codes : pd.Series
        Raw code column (e.g. Aisle or Bay); may contain missing values.

    Returns
    -------
    pd.Series
        Same index as ``codes``, holding strings or ``None``.
    """
    text = codes.astype(str).str.replace(r'\.0$', '', regex=True)

    # Only pad lone digits; zfill on everything would also pad one-character
    # non-digit codes, which the cleaning rule does not ask for.
    is_single_digit = text.str.match(r'^\d$')
    text = text.mask(is_single_digit, text.str.zfill(2))

    # Use the dict form on purpose: with a scalar target, an explicit None value
    # was ignored before pandas 1.4 and the call forward-filled instead, copying
    # the previous row's code into missing cells. The mapping form always means
    # "replace 'nan' with None".
    return text.replace({'nan': None})


of_df["Aisle"] = _normalize_code(of_df["Aisle"])
of_df["Bay"] = _normalize_code(of_df["Bay"])


# Put each user's activity in chronological order; everything below relies on
# "previous row" meaning "this user's previous event".
of_df = of_df.sort_values(by=["UserID", "Timestamp"])

# Hours elapsed since the same user's previous row (NaN on each user's first row).
seconds_since_prev = of_df.groupby("UserID")["Timestamp"].diff().dt.total_seconds()
of_df["Pick_Time"] = seconds_since_prev / 3600

# A gap of 3 hours or more starts a new shift. The running count of such gaps
# per user, plus one, gives a 1-based shift number.
starts_new_shift = of_df["Pick_Time"].ge(3)
of_df["Shift"] = starts_new_shift.groupby(of_df["UserID"]).cumsum() + 1

# The first row of every shift has no meaningful pick time (it is either a
# user's very first row or the long gap between shifts), so blank it out.
is_first_in_shift = of_df.groupby(["UserID", "Shift"]).cumcount().eq(0)
of_df.loc[is_first_in_shift, "Pick_Time"] = np.nan

# Location key built as "<Aisle>|<Bay>|||"; it is later matched against the
# distance lookup's labels.
of_df["Location"] = of_df["Aisle"] + "|" + of_df["Bay"] + "|||"


# 1. Re-assert the per-user chronological order the steps below depend on
of_df = of_df.sort_values(by=["UserID", "Timestamp"])

# 2. "Physical" locations only: blank the location on AssignmentOpen rows so
#    they are skipped when looking backwards for the last real location
physical_loc = of_df["Location"].where(of_df["ActivityCode"] != "AssignmentOpen")

# 3. Carry the most recent physical location forward within each User/Shift
#    group, which "jumps over" the AssignmentOpen rows
shift_keys = [of_df["UserID"], of_df["Shift"]]
last_physical_loc = physical_loc.groupby(shift_keys).ffill()

# 4. Prior_Location is that carried-forward value taken from the PREVIOUS row of
#    the same User/Shift group (NaN on the first row of each group)
of_df["Prior_Location"] = last_physical_loc.groupby(shift_keys).shift(1)

# 5. Distance lookup: left-merge on the (Prior_Location, Location) pair.
#    distance_lookup must have columns ["Prior_Location", "Location", "Distance"]
of_df = of_df.merge(distance_lookup, how="left", on=["Prior_Location", "Location"])

# 6. Final NULL rules, evaluated on the merged frame:
#    Rule A: no Distance when Pick_Time is NaN (includes the first pick of a shift)
#    Rule B: no Distance when the current row itself is AssignmentOpen
no_distance = of_df["Pick_Time"].isna() | of_df["ActivityCode"].eq("AssignmentOpen")
of_df.loc[no_distance, "Distance"] = np.nan

# 7. No cleanup needed: the intermediates above are plain Series, not columns of of_df

In [9]:
# Persist the cleaned frame as Parquet. The path is relative, so the file is
# written to the kernel's current working directory.
output_path = 'of_df.parquet'
of_df.to_parquet(output_path)