In [1]:
import pandas as pd
import numpy as np
import pickle
import hashlib
from tqdm import tqdm
import warnings
import os
from pathlib import Path
import builtins
from sklearn.preprocessing import LabelEncoder
import joblib

warnings.filterwarnings("ignore")

# Clean up any shadowed builtins from previous kernel runs
import gc
gc.collect()

# Explicitly ensure str is the builtin
if not callable(builtins.str):
    del builtins.str
    builtins.str = str.__class__

# Deterministic A/B Assignment

In [2]:
import builtins  # Ensure builtins are available

def assign_variant(entity_id, split=0.5):
    """Deterministic A/B variant assignment using MD5 hash"""
    h = int(hashlib.md5(builtins.str(entity_id).encode()).hexdigest(), 16)
    return "A" if (h % 100) < split * 100 else "B"

# Load dataset

In [3]:
BASE_DIR = os.getcwd()
DATA_PATH = os.path.join(BASE_DIR, "ml_models", "decision_tree", "training_data.csv")

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Order_id,Availability,Costs,Customer demographics,Defect rates,Inspection results,Lead time,Lead times,Location,Manufacturing costs,...,Production volumes,Revenue generated,Routes,Shipping carriers,Shipping costs,Shipping times,Stock levels,Supplier name,Transportation modes,damage_risk
0,6af613b6-569c-5c22-9c37-2ed93f31d3af,55,188.684667,Non-binary,0.224447,Pending,29,8,Mumbai,43.469393,...,214,8763.759148,Route B,Carrier B,2.601292,6,59,Supplier 3,Road,1
1,b04965e6-a9bb-591f-8f8a-1adcb2c8dc39,55,187.492481,Non-binary,0.226086,Pending,29,8,Mumbai,51.332779,...,213,8499.239523,Route B,Carrier B,3.220109,3,58,Supplier 3,Road,1
2,4b166dbe-d99d-5091-abdd-95b83330ed3a,55,188.968124,Non-binary,0.231951,Pending,29,9,Mumbai,47.1343,...,213,8633.603195,Route B,Carrier B,3.257622,2,57,Supplier 3,Road,1
3,98123fde-012f-5ff3-8b50-881449dac91a,55,190.611596,Non-binary,0.233616,Pending,29,8,Mumbai,53.945541,...,213,8658.388203,Route B,Carrier B,3.040814,6,59,Supplier 3,Road,1
4,6ed955c6-506a-5343-9be4-2c0afae02eef,55,187.312448,Non-binary,0.221073,Pending,29,8,Mumbai,40.762402,...,216,8663.3748,Route B,Carrier B,2.703847,3,57,Supplier 3,Road,1


# Split features & Labels

In [4]:
TARGET_COL = "damage_risk"

# Create entity_id before dropping it from features
df["entity_id"] = df.index

# Models expect these specific features
EXPECTED_FEATURES = ['Shipping times', 'Shipping costs', 'Transportation modes', 'Routes',
                     'Order quantities', 'Production volumes', 'Manufacturing costs', 'Supplier name']

# Select expected features and prepare them
X = df[EXPECTED_FEATURES].copy()
y = df[TARGET_COL]

# Encode categorical columns
categorical_cols = ['Transportation modes', 'Routes', 'Supplier name']
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

print(f"‚úÖ Features prepared: {X.shape}")
print(f"   Columns: {X.columns.tolist()}")
print(f"   Target shape: {y.shape}")

‚úÖ Features prepared: (4000, 8)
   Columns: ['Shipping times', 'Shipping costs', 'Transportation modes', 'Routes', 'Order quantities', 'Production volumes', 'Manufacturing costs', 'Supplier name']
   Target shape: (4000,)


# Run A/B Experiment

In [5]:
# Clear any leftover file handles or shadowed builtins from previous runs
if 'f' in dir():
    try:
        if hasattr(f, 'close'):
            f.close()
    except:
        pass

# Load model path
MODEL_A_PATH = os.path.join(BASE_DIR, "ml_models", "logistic_regression", "model.pkl")
MODEL_B_PATH = os.path.join(BASE_DIR, "ml_models", "decision_tree", "model.pkl")

# Safe model loading function
def safe_load_model(path):
    """Load model from pickle or joblib with error handling"""
    import joblib
    try:
        return joblib.load(path)
    except Exception as e:
        print(f"Warning loading {path}: {e}")
        try:
            with open(path, 'rb') as file_handle:
                import pickle
                return pickle.load(file_handle)
        except Exception as e2:
            print(f"Failed to load model from {path}: {e2}")
            return None

# Load models
model_A = safe_load_model(MODEL_A_PATH)
model_B = safe_load_model(MODEL_B_PATH)

print(f"Model A loaded: {type(model_A)}")
print(f"Model B loaded: {type(model_B)}")

Model A loaded: <class 'sklearn.linear_model._logistic.LogisticRegression'>
Model B loaded: <class 'sklearn.tree._classes.DecisionTreeClassifier'>


# Run ML A/B Experiment (Offline Replay)

In [6]:
ml_results = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    variant = assign_variant(row["entity_id"])

    # Get features for prediction
    x = X.loc[idx].values.reshape(1, -1)
    target = y.loc[idx]

    if variant == "A":
        pred = model_A.predict_proba(x)[0][1]
    else:
        pred = model_B.predict_proba(x)[0][1]

    ml_results.append({
        "variant": variant,
        "prediction": pred,
        "target": target
    })

results_df = pd.DataFrame(ml_results)
print(f"\n‚úÖ A/B Experiment completed!")
print(f"   Total records: {len(results_df)}")
print(f"   Variant A: {(results_df['variant'] == 'A').sum()} records")
print(f"   Variant B: {(results_df['variant'] == 'B').sum()} records")
print("\nüìä First 5 results:")
print(results_df.head())

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4000/4000 [00:00<00:00, 10059.91it/s]



‚úÖ A/B Experiment completed!
   Total records: 4000
   Variant A: 2005 records
   Variant B: 1995 records

üìä First 5 results:
  variant  prediction  target
0       B    1.000000       1
1       A    0.515429       1
2       A    0.523708       1
3       B    1.000000       1
4       A    0.508615       1


# Evaluate ML A/B results

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Evaluate each variant separately
evaluation_results = []

for variant in ["A", "B"]:
    variant_data = results_df[results_df['variant'] == variant]
    
    y_true = variant_data['target'].values
    y_pred = (variant_data['prediction'] >= 0.5).astype(int)
    y_pred_proba = variant_data['prediction'].values
    
    metrics = {
        "Variant": variant,
        "Count": len(variant_data),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1-Score": f1_score(y_true, y_pred, zero_division=0),
        "ROC-AUC": roc_auc_score(y_true, y_pred_proba)
    }
    evaluation_results.append(metrics)

eval_df = pd.DataFrame(evaluation_results)

print("üéØ A/B Test Results:")
print(eval_df.to_string(index=False))

# Determine winner
variant_a_f1 = eval_df[eval_df['Variant'] == 'A']['F1-Score'].values[0]
variant_b_f1 = eval_df[eval_df['Variant'] == 'B']['F1-Score'].values[0]

winner = "A" if variant_a_f1 > variant_b_f1 else "B"
improvement = abs(variant_a_f1 - variant_b_f1) / max(variant_a_f1, variant_b_f1) * 100

print(f"\nüèÜ Winner: Variant {winner} (F1-Score: {max(variant_a_f1, variant_b_f1):.4f})")
print(f"   Improvement: {improvement:.2f}%")

üéØ A/B Test Results:
Variant  Count  Accuracy  Precision   Recall  F1-Score  ROC-AUC
      A   2005  0.663342   0.966206 0.665961  0.788468 0.655369
      B   1995  0.990977   0.999461 0.990924  0.995174 0.995562

üèÜ Winner: Variant B (F1-Score: 0.9952)
   Improvement: 20.77%


# Statistical Significance (Bootstrap)

In [9]:
def bootstrap_auc(df, n=1000):
    scores = []
    for _ in range(n):
        sample = df.sample(frac=1, replace=True)
        scores.append(roc_auc_score(sample['target'], sample['prediction']))
    return np.array(scores)

boot_A = bootstrap_auc(results_df[results_df['variant'] == 'A'])
boot_B = bootstrap_auc(results_df[results_df['variant'] == 'B'])

np.percentile(boot_B - boot_A, [2.5, 97.5])

array([0.27979901, 0.40267938])

In [11]:
# Create statistical significance report
from scipy import stats

auc_A = results_df[results_df['variant'] == 'A'].apply(lambda row: (row['target'], row['prediction']), axis=1).tolist()
auc_B = results_df[results_df['variant'] == 'B'].apply(lambda row: (row['target'], row['prediction']), axis=1).tolist()

# Perform Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(
    [pred for target, pred in auc_A],
    [pred for target, pred in auc_B],
    alternative='two-sided'
)
print(f"\nüìà Statistical Significance Report:")
print(f"U-statistic: {u_stat}")
print(f"P-value: {p_value}")
alpha = 0.05
if p_value < alpha:
    print("Result: Statistically significant difference between variants.")
else:
    print("Result: No statistically significant difference between variants.")


üìà Statistical Significance Report:
U-statistic: 273545.0
P-value: 0.0
Result: Statistically significant difference between variants.
