In [38]:
import pandas as pd
import numpy as np

In [39]:
def load_data(file_path):
    """Load data from a CSV file into a pandas DataFrame."""
    return pd.read_csv(file_path)

pd.set_option('display.max_columns', None)
df = load_data('../data/supply_chain_data.csv')

# Sort columns alphabetically
df = df.reindex(sorted(df.columns), axis=1)

# Only SKU column put as the first column
cols_first = df.columns.tolist()
cols_first.insert(0, cols_first.pop(cols_first.index('SKU')))
df = df[cols_first]

# Rename SKU column to Order_id
df = df.rename(columns={'SKU': 'Order_id'})
df.head()

Unnamed: 0,Order_id,Availability,Costs,Customer demographics,Defect rates,Inspection results,Lead time,Lead times,Location,Manufacturing costs,Manufacturing lead time,Number of products sold,Order quantities,Price,Product type,Production volumes,Revenue generated,Routes,Shipping carriers,Shipping costs,Shipping times,Stock levels,Supplier name,Transportation modes
0,SKU0,55,187.752075,Non-binary,0.22641,Pending,29,7,Mumbai,46.279879,29,802,96,69.808006,haircare,215,8661.996792,Route B,Carrier B,2.956572,4,58,Supplier 3,Road
1,SKU1,95,503.065579,Female,4.854068,Pending,23,30,Mumbai,33.616769,30,736,37,14.843523,skincare,517,7460.900065,Route B,Carrier A,9.716575,2,53,Supplier 3,Road
2,SKU2,34,141.920282,Unknown,4.580593,Pending,12,10,Mumbai,30.688019,27,8,88,11.319683,haircare,971,9577.749626,Route C,Carrier B,8.054479,2,1,Supplier 1,Air
3,SKU3,68,254.776159,Non-binary,4.746649,Fail,24,13,Kolkata,35.624741,18,83,59,61.163343,skincare,937,7766.836426,Route A,Carrier C,1.729569,6,23,Supplier 5,Rail
4,SKU4,26,923.440632,Non-binary,3.14558,Fail,5,3,Delhi,92.065161,3,871,56,4.805496,skincare,414,2686.505152,Route A,Carrier A,3.890548,8,5,Supplier 1,Air


## Function to expand data

- Added multiple rows around 2000-4000 for data handling
- to ensure data not underfitting

In [40]:
def expand_data(df, target_rows=4000, random_seed=42):
    np.random.seed(random_seed)

    repeat_factor = target_rows // len(df) + 1
    df_expanded = pd.concat([df] * repeat_factor, ignore_index=True)
    df_expanded = df_expanded.sample(n=target_rows, random_state=random_seed).reset_index(drop=True)

    return df_expanded

## Add noises data for make a data into realize

In [41]:
def add_noise(df):
    df = df.copy()

    # Numeric noise addition
    noise_config = {
        "Price": 0.05,
        "Shipping costs": 0.1,
        "Manufacturing costs": 0.08,
        "Defect rates": 0.02
    }

    # Iterable noise addition
    for col, noise_level in noise_config.items():
        if col in df.columns:
            df[col] = df[col] * (1 + np.random.normal(0, noise_level, size=len(df)))
            df[col] = df[col].clip(lower=0)  # Ensure no negative values

    # Integer noise addition columns
    int_cols = [
        "Stock levels",
        "Order quantities",
        "Production volumes",
        "Shipping times",
        "Lead times"
    ]

    for col in int_cols:
        if col in df.columns:
            df[col] = (df[col] + np.random.randint(-2, 3, size=len(df))).clip(lower=0)

    return df

In [42]:
df = expand_data(df, target_rows=4000, random_seed=42)
df = add_noise(df)
df = df.sort_values(by='Order_id').reset_index(drop=True)

## Feature Engineering

- Ensure data noise into normal data for preventing duplicate data
- Order_id change to numeric random to get unique ID

In [43]:
import uuid


np.random.seed(42)

namespace = uuid.NAMESPACE_DNS
df["Order_id"] = [str(uuid.uuid5(namespace=namespace, name=str(i))) for i in range(len(df))]

numeric_cols = ["Costs", 'Manufacturing costs', 'Price', 'Revenue generated', 'Shipping costs']

for col in numeric_cols:
    # Added variety about +/- 1% from the original value
    noise = np.random.normal(1.0, 0.01, size=len(df))
    df[col] = df[col] * noise

In [44]:
df.head()

Unnamed: 0,Order_id,Availability,Costs,Customer demographics,Defect rates,Inspection results,Lead time,Lead times,Location,Manufacturing costs,Manufacturing lead time,Number of products sold,Order quantities,Price,Product type,Production volumes,Revenue generated,Routes,Shipping carriers,Shipping costs,Shipping times,Stock levels,Supplier name,Transportation modes
0,6af613b6-569c-5c22-9c37-2ed93f31d3af,55,188.684667,Non-binary,0.224447,Pending,29,8,Mumbai,43.469393,29,802,98,71.933529,haircare,214,8763.759148,Route B,Carrier B,2.601292,6,59,Supplier 3,Road
1,b04965e6-a9bb-591f-8f8a-1adcb2c8dc39,55,187.492481,Non-binary,0.226086,Pending,29,8,Mumbai,51.332779,29,802,95,70.933955,haircare,213,8499.239523,Route B,Carrier B,3.220109,3,58,Supplier 3,Road
2,4b166dbe-d99d-5091-abdd-95b83330ed3a,55,188.968124,Non-binary,0.231951,Pending,29,9,Mumbai,47.1343,29,802,98,72.899584,haircare,213,8633.603195,Route B,Carrier B,3.257622,2,57,Supplier 3,Road
3,98123fde-012f-5ff3-8b50-881449dac91a,55,190.611596,Non-binary,0.233616,Pending,29,8,Mumbai,53.945541,29,802,98,65.259225,haircare,213,8658.388203,Route B,Carrier B,3.040814,6,59,Supplier 3,Road
4,6ed955c6-506a-5343-9be4-2c0afae02eef,55,187.312448,Non-binary,0.221073,Pending,29,8,Mumbai,40.762402,29,802,95,77.743599,haircare,216,8663.3748,Route B,Carrier B,2.703847,3,57,Supplier 3,Road


In [45]:
df.describe()

Unnamed: 0,Availability,Costs,Defect rates,Lead time,Lead times,Manufacturing costs,Manufacturing lead time,Number of products sold,Order quantities,Price,Production volumes,Revenue generated,Shipping costs,Shipping times,Stock levels
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,48.37275,529.711616,2.28582,17.136,15.9225,47.22728,14.79025,461.5965,49.181,49.430165,567.561,5770.944224,5.538579,5.7615,47.859
std,30.58172,256.963159,1.451593,8.781104,8.841354,29.202608,8.86391,302.706627,26.642376,31.162825,261.431203,2720.729508,2.715884,2.997476,31.21395
min,1.0,101.211166,0.017856,1.0,0.0,0.932373,1.0,8.0,0.0,1.56065,102.0,1046.762838,0.753234,0.0,0.0
25%,22.75,316.570584,1.014768,10.0,8.0,22.345539,7.0,176.0,26.0,19.246846,356.0,2788.710268,3.303343,3.0,17.0
50%,43.0,521.373219,2.15542,18.0,16.0,44.810631,14.0,391.0,52.5,50.843918,568.0,5994.144648,5.390601,6.0,47.0
75%,75.0,765.712622,3.574428,25.0,24.0,69.528746,23.0,705.0,71.0,76.42783,795.25,8256.243186,7.696807,8.0,73.0
max,100.0,1021.467838,5.174818,30.0,32.0,118.544393,30.0,996.0,98.0,116.46441,987.0,10065.689055,12.160134,12.0,102.0


In [46]:
df["Defect rates"]

0       0.224447
1       0.226086
2       0.231951
3       0.233616
4       0.221073
          ...   
3995    0.349242
3996    0.359944
3997    0.357211
3998    0.367761
3999    0.342171
Name: Defect rates, Length: 4000, dtype: float64

In [47]:
# Create damage risk boolean label with 0 = SAFE shipment, 1 = HIGH RISK shipment
def assign_damage_risk(defect_rate):
    if defect_rate > 0.15:
        return 1  # HIGH RISK
    else:
        return 0  # SAFE

df["damage_risk"] = df["Defect rates"].apply(assign_damage_risk)
df["damage_risk"].value_counts()

1    3762
0     238
Name: damage_risk, dtype: int64

In [48]:
df.head()

Unnamed: 0,Order_id,Availability,Costs,Customer demographics,Defect rates,Inspection results,Lead time,Lead times,Location,Manufacturing costs,Manufacturing lead time,Number of products sold,Order quantities,Price,Product type,Production volumes,Revenue generated,Routes,Shipping carriers,Shipping costs,Shipping times,Stock levels,Supplier name,Transportation modes,damage_risk
0,6af613b6-569c-5c22-9c37-2ed93f31d3af,55,188.684667,Non-binary,0.224447,Pending,29,8,Mumbai,43.469393,29,802,98,71.933529,haircare,214,8763.759148,Route B,Carrier B,2.601292,6,59,Supplier 3,Road,1
1,b04965e6-a9bb-591f-8f8a-1adcb2c8dc39,55,187.492481,Non-binary,0.226086,Pending,29,8,Mumbai,51.332779,29,802,95,70.933955,haircare,213,8499.239523,Route B,Carrier B,3.220109,3,58,Supplier 3,Road,1
2,4b166dbe-d99d-5091-abdd-95b83330ed3a,55,188.968124,Non-binary,0.231951,Pending,29,9,Mumbai,47.1343,29,802,98,72.899584,haircare,213,8633.603195,Route B,Carrier B,3.257622,2,57,Supplier 3,Road,1
3,98123fde-012f-5ff3-8b50-881449dac91a,55,190.611596,Non-binary,0.233616,Pending,29,8,Mumbai,53.945541,29,802,98,65.259225,haircare,213,8658.388203,Route B,Carrier B,3.040814,6,59,Supplier 3,Road,1
4,6ed955c6-506a-5343-9be4-2c0afae02eef,55,187.312448,Non-binary,0.221073,Pending,29,8,Mumbai,40.762402,29,802,95,77.743599,haircare,216,8663.3748,Route B,Carrier B,2.703847,3,57,Supplier 3,Road,1


In [49]:
df["Defect rates"].describe()

count    4000.000000
mean        2.285820
std         1.451593
min         0.017856
25%         1.014768
50%         2.155420
75%         3.574428
max         5.174818
Name: Defect rates, dtype: float64

# Create Features

In [50]:
# Logistic features
delay_margin = df["Shipping times"] - df["Lead times"]
cost_per_hour = df["Shipping costs"] / df["Shipping times"]

# Container load features
load_pressure = df["Order quantities"] / df["Production volumes"]
stock_stress = df["Stock levels"] / df["Order quantities"]

# Manufacturing quality features
supplier_defect_avg = df["Defect rates"].rolling(window=5, min_periods=1).mean()
cost_quality_ratio  = df["Manufacturing costs"] / df["Production volumes"]

# Training pipelines

In [51]:
# Training pipelines == define features and target variable
y = df["damage_risk"]
X = df[[
    "Shipping times",
    "Shipping costs",
    "Transportation modes",
    "Routes",
    "Order quantities",
    "Production volumes",
    "Manufacturing costs",
    "Supplier name"
]]

# Data encoding

In [55]:
# Encoding categorical features
for obj in X.select_dtypes(include=['object']).columns:
    X[obj] = X[obj].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[obj] = X[obj].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[obj] = X[obj].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[obj] = X[obj].astype('category').cat.codes


In [None]:
X[''].value_counts()

0    1074
1     885
4     721
3     721
2     599
Name: Supplier name, dtype: int64

# Split data into X and y

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Data scaling (Optional if needed)

# Machine learning models

In [54]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000,
                                              solver='liblinear',
                                              random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

for moodel_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.preidct(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"Model: {moodel_name}")
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

ValueError: could not convert string to float: 'Road'