In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor, export_text


# Insert warehouse name here
warehouse_name = "OE"
# Insert user id here
user = "betsyfrdmn"

user = "conan/Desktop/lucas"

# GitHub repo
base = "Lucas_Systems_Capstone_Project"

# Load your processed data
df = pd.read_parquet(f"/Users/{user}/{base}/data/processed/oe_detailed.parquet")

In [2]:
# Define features and target
features = ["Travel_Distance", "Weight", "Quantity", "Level"]
target = "Time_Delta_sec"

# Convert columns to numeric, this ensures everything is a float
for col in features + [target]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Drop rows with missing values globally
df = df.dropna(subset=features + [target])

# Filter out extreme time outliers (e.g., keep picks between 5s and 10 mins)
# df = df[(df[target] > 5) & (df[target] < 600)]

# Get list of unique WorkCodes
work_codes = df['WorkCode'].unique()

# Iterate and create a model for each code
for code in work_codes:
    print(f"\n" + "-"*40)
    print(f"MODELING WORK CODE: {code}")
    
    # Filter data for this specific work code
    model_df = df[df['WorkCode'] == code].copy()
    
    # Check if we have enough data to train
    if len(model_df) < 50:
        print(f"Skipping Work Code {code}: Not enough data (only {len(model_df)} rows).")
        continue
        
    # Split Data
    X = model_df[features]
    y = model_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

    # Train Model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_test)
    print(f"Rows Analyzed: {len(model_df)}")
    print(f"R-squared: {r2_score(y_test, y_pred):.4f}")
    print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f} seconds")

    # Explainability
    coefs = pd.DataFrame({
        "Factor": features,
        "Coef (Sec)": model.coef_
    }).sort_values(by="Coef (Sec)", ascending=False)

    print("\nImpact on Time (Explainability):")
    print(coefs)


----------------------------------------
MODELING WORK CODE: 30
Rows Analyzed: 67391
R-squared: 0.0929
Mean Absolute Error: 59.44 seconds

Impact on Time (Explainability):
            Factor  Coef (Sec)
1           Weight    3.373202
0  Travel_Distance    0.561978
2         Quantity    0.128198
3            Level   -5.283810

----------------------------------------
MODELING WORK CODE: 20
Rows Analyzed: 22280
R-squared: 0.1591
Mean Absolute Error: 60.50 seconds

Impact on Time (Explainability):
            Factor  Coef (Sec)
3            Level    6.552165
1           Weight    1.093409
0  Travel_Distance    0.376164
2         Quantity    0.000000

----------------------------------------
MODELING WORK CODE: 10
Rows Analyzed: 4272
R-squared: 0.0108
Mean Absolute Error: 80.40 seconds

Impact on Time (Explainability):
            Factor  Coef (Sec)
2         Quantity    0.444469
1           Weight    0.236189
0  Travel_Distance    0.090921
3            Level   -1.634435


In [17]:
# Setup
features = ["Travel_Distance", "Weight", "Quantity", "Level"]
target = "Time_Delta_sec"

# Convert to numeric to avoid TypeErrors
for col in features + [target]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Clean data
df_model = df.dropna(subset=features + [target])

# Storage for our results
performance_results = []
impact_results = []

# Iterate and create a model for each code
for code in df_model['WorkCode'].unique():
    model_df = df_model[df_model['WorkCode'] == code].copy()
    
    if len(model_df) < 50:
        continue
        
    X = model_df[features]
    y = model_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Save Performance Metrics
    performance_results.append({
        "WorkCode": code,
        "Rows": len(model_df),
        "R-squared": round(r2_score(y_test, y_pred), 4),
        "MAE (Sec)": round(mean_absolute_error(y_test, y_pred), 2)
    })

    # Save Coefficients
    for feat, coef in zip(features, model.coef_):
        impact_results.append({
            "WorkCode": code,
            "Factor": feat,
            "Seconds_Added": round(coef, 4)
        })

# Create and Display the DataFrames
df_performance = pd.DataFrame(performance_results)
df_impact = pd.DataFrame(impact_results).pivot(index='Factor', columns='WorkCode', values='Seconds_Added')

print("MODEL PERFORMANCE")
display(df_performance)

print("\nIMPACT ON TIME (SECONDS ADDED PER UNIT)")
display(df_impact)

MODEL PERFORMANCE


Unnamed: 0,WorkCode,Rows,R-squared,MAE (Sec)
0,30,67402,0.0916,57.92
1,20,22308,0.1404,62.09
2,10,5036,0.0572,132.39



IMPACT ON TIME (SECONDS ADDED PER UNIT)


WorkCode,10,20,30
Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Level,-0.5442,7.5106,-6.0965
Quantity,0.9614,0.0,0.1343
Travel_Distance,0.2956,0.3861,0.5726
Weight,3.6355,1.2487,3.4035


In [18]:
for code in work_codes:
    print(f"\n" + "-"*40)
    print(f"MODELING WORK CODE: {code}")
    
    # Filter data for this specific work code
    model_df = df[df['WorkCode'] == code].copy()
    
    # Check if we have enough data to train
    if len(model_df) < 50:
        print(f"Skipping Work Code {code}: Not enough data (only {len(model_df)} rows).")
        continue
        
    # Split Data
    X = model_df[features]
    y = model_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
    
    # max_depth=3 keeps it "Explainable" so the tree isn't too huge
    model = DecisionTreeRegressor(max_depth=3, random_state=4)
    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_test)
    print(f"R-squared: {r2_score(y_test, y_pred):.4f}")
    
    # Explainability: The "Rules"
    tree_rules = export_text(model, feature_names=features)
    print("\nDecision Logic (The Rules):")
    print(tree_rules)


----------------------------------------
MODELING WORK CODE: 30
R-squared: 0.0945

Decision Logic (The Rules):
|--- Travel_Distance <= 50.50
|   |--- Travel_Distance <= 21.50
|   |   |--- Travel_Distance <= 5.50
|   |   |   |--- value: [35.65]
|   |   |--- Travel_Distance >  5.50
|   |   |   |--- value: [55.01]
|   |--- Travel_Distance >  21.50
|   |   |--- Weight <= 3.48
|   |   |   |--- value: [82.05]
|   |   |--- Weight >  3.48
|   |   |   |--- value: [189.01]
|--- Travel_Distance >  50.50
|   |--- Travel_Distance <= 154.50
|   |   |--- Level <= 2.50
|   |   |   |--- value: [199.25]
|   |   |--- Level >  2.50
|   |   |   |--- value: [140.25]
|   |--- Travel_Distance >  154.50
|   |   |--- Level <= 2.50
|   |   |   |--- value: [377.97]
|   |   |--- Level >  2.50
|   |   |   |--- value: [230.54]


----------------------------------------
MODELING WORK CODE: 20
R-squared: 0.1503

Decision Logic (The Rules):
|--- Travel_Distance <= 14.50
|   |--- Travel_Distance <= 3.50
|   |   |--- Le

In [19]:
# Setup (assuming features/target are already defined)
performance_list = []
importance_list = []

# Iterate and create a model for each code
for code in work_codes:
    # Filter data for this specific work code
    model_df = df[df['WorkCode'] == code].copy()
    
    if len(model_df) < 50:
        continue
        
    # Split Data
    X = model_df[features]
    y = model_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
    
    # Train Decision Tree
    model = DecisionTreeRegressor(max_depth=3, random_state=4)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Save Performance Metrics to list
    performance_list.append({
        "WorkCode": code,
        "Rows": len(model_df),
        "R-squared": round(r2_score(y_test, y_pred), 4),
        "MAE (Sec)": round(mean_absolute_error(y_test, y_pred), 2)
    })
    
    # Save Feature Importances to list
    for feat, importance in zip(features, model.feature_importances_):
        importance_list.append({
            "WorkCode": code,
            "Factor": feat,
            "Importance": round(importance, 4)
        })

# Create and Display the DataFrames
df_tree_performance = pd.DataFrame(performance_list)
df_tree_importance = pd.DataFrame(importance_list).pivot(index='Factor', columns='WorkCode', values='Importance')

print("DECISION TREE PERFORMANCE")
display(df_tree_performance)

print("\nFEATURE IMPORTANCE (0 to 1 scale)")
display(df_tree_importance)

DECISION TREE PERFORMANCE


Unnamed: 0,WorkCode,Rows,R-squared,MAE (Sec)
0,30,67402,0.0945,57.58
1,20,22308,0.1503,53.8
2,10,5036,-0.0457,131.8



FEATURE IMPORTANCE (0 to 1 scale)


WorkCode,10,20,30
Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Level,0.0,0.0005,0.0803
Quantity,0.3124,0.0,0.0
Travel_Distance,0.5087,0.9555,0.9085
Weight,0.1789,0.0441,0.0113


## Conan's Neural Nets

In [120]:
from sklearn.preprocessing import OneHotEncoder

In [125]:
df_nn = df.copy()
df_nn = df_nn[df_nn["WorkCode"] == "30"]

# Define features and target
numerical_features = ["Quantity", "Weight", "Cube"]
categorical_features = ["Aisle", "Bay", "Level"] # removed WorkCode, productID
target = "Time_Delta_sec"

df_nn = df_nn[numerical_features+categorical_features+[target]]
df_nn = df_nn[df_nn[target] <= 1800]

# Convert columns to numeric, this ensures everything is a float
for col in numerical_features + [target]:
    df_nn[col] = pd.to_numeric(df_nn[col], errors="coerce")

# OHE categorical columns
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc.fit(df_nn[categorical_features])
df_ohe = enc.transform(df_nn[categorical_features])
df_nn = pd.concat([df_nn.drop(columns=categorical_features), pd.DataFrame(df_ohe)], axis = 1)

# Drop rows with missing values globally
df_nn.dropna(inplace=True)

# Split Data
X = df_nn.drop(columns=[target])
y = df_nn[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [126]:
import torch
import torch.nn as nn

model = nn.Sequential(
    nn.Linear(100, 500),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(500, 200),
    nn.Dropout(0.2),
    nn.ReLU(),
    nn.Linear(200, 25),
    nn.ReLU(),
    nn.Linear(25, 1)
)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [127]:
from torch.utils.data import TensorDataset, DataLoader

X_train_tensor = torch.tensor(X_train.to_numpy(dtype=np.float64), dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train.to_numpy(dtype=np.float64), dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

for epoch in range(10):
    for inputs, labels in train_loader:
        
        # flatten if needed
        inputs = inputs.view(inputs.size(0), -1)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 10674.8838
Epoch 2, Loss: 1767.9384
Epoch 3, Loss: 26414.3105
Epoch 4, Loss: 9852.8701
Epoch 5, Loss: 13194.1055
Epoch 6, Loss: 7411.0088
Epoch 7, Loss: 2137.5549
Epoch 8, Loss: 56297.2539
Epoch 9, Loss: 49771.3711
Epoch 10, Loss: 16188.7588


In [128]:
X_test_tensor = torch.tensor(X_test.to_numpy(dtype=np.float64), dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.to_numpy(dtype=np.float64), dtype=torch.float32).unsqueeze(1)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

model.eval()
criterion = nn.L1Loss()

test_loss = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        
        inputs = inputs.view(inputs.size(0), -1)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        test_loss += loss.item()

avg_mse = test_loss / len(test_loader)

print(f"Test MAE: {avg_mse:.4f}")

Test MAE: 45.6620
