In [10]:
import pandas as pd
import numpy as np

In [11]:
def load_data(file_path):
    """Read the CSV located at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
df = load_data('../data/supply_chain_data.csv')

# Alphabetical column order, with SKU pulled out to the front.
# list.remove raises ValueError if the file has no SKU column.
ordered_cols = sorted(df.columns)
ordered_cols.remove('SKU')
df = df.reindex(['SKU', *ordered_cols], axis=1)

# From here on the SKU column is exposed under the name Order_id.
df = df.rename(columns={'SKU': 'Order_id'})
df.head()

Unnamed: 0,Order_id,Availability,Costs,Customer demographics,Defect rates,Inspection results,Lead time,Lead times,Location,Manufacturing costs,Manufacturing lead time,Number of products sold,Order quantities,Price,Product type,Production volumes,Revenue generated,Routes,Shipping carriers,Shipping costs,Shipping times,Stock levels,Supplier name,Transportation modes
0,SKU0,55,187.752075,Non-binary,0.22641,Pending,29,7,Mumbai,46.279879,29,802,96,69.808006,haircare,215,8661.996792,Route B,Carrier B,2.956572,4,58,Supplier 3,Road
1,SKU1,95,503.065579,Female,4.854068,Pending,23,30,Mumbai,33.616769,30,736,37,14.843523,skincare,517,7460.900065,Route B,Carrier A,9.716575,2,53,Supplier 3,Road
2,SKU2,34,141.920282,Unknown,4.580593,Pending,12,10,Mumbai,30.688019,27,8,88,11.319683,haircare,971,9577.749626,Route C,Carrier B,8.054479,2,1,Supplier 1,Air
3,SKU3,68,254.776159,Non-binary,4.746649,Fail,24,13,Kolkata,35.624741,18,83,59,61.163343,skincare,937,7766.836426,Route A,Carrier C,1.729569,6,23,Supplier 5,Rail
4,SKU4,26,923.440632,Non-binary,3.14558,Fail,5,3,Delhi,92.065161,3,871,56,4.805496,skincare,414,2686.505152,Route A,Carrier A,3.890548,8,5,Supplier 1,Air


## Function to expand data

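- Replicate the original rows until the dataset has roughly 2,000–4,000 rows, so there is enough data to work with
- The larger dataset is meant to reduce the risk of underfitting in later modelling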

In [12]:
def expand_data(df, target_rows=4000, random_seed=42):
    """Grow ``df`` to exactly ``target_rows`` rows by replicating and sampling.

    The frame is stacked on itself enough times to hold at least
    ``target_rows`` rows, then ``target_rows`` of those rows are drawn without
    replacement, which also shuffles them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are replicated. Must contain at least one row.
    target_rows : int, default 4000
        Number of rows in the returned frame.
    random_seed : int, default 42
        Seed used for the row sampling. It is also written to NumPy's global
        random state, so later ``np.random`` calls are affected by it.

    Returns
    -------
    pandas.DataFrame
        New frame with ``target_rows`` rows and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` has no rows (there is nothing to replicate).
    """
    source_rows = len(df)
    if source_rows == 0:
        # Without this guard the floor division below fails with an opaque
        # ZeroDivisionError.
        raise ValueError(
            "expand_data needs at least one row to replicate; got an empty DataFrame"
        )

    # Seeds the *global* NumPy RNG as a side effect, so np.random draws made
    # after this call are reproducible as well.
    np.random.seed(random_seed)

    # "+ 1" guarantees the stacked frame holds at least target_rows rows.
    repeat_factor = target_rows // source_rows + 1
    df_expanded = pd.concat([df] * repeat_factor, ignore_index=True)
    df_expanded = df_expanded.sample(n=target_rows, random_state=random_seed).reset_index(drop=True)

    return df_expanded

## Add noise to make the data more realistic

In [13]:
def add_noise(df, random_seed=None):
    """Return a copy of ``df`` with random noise applied to selected columns.

    Float-valued columns are scaled by ``1 + N(0, sigma)`` with a per-column
    ``sigma``; count-like columns get a uniform integer offset in ``[-2, 2]``.
    Every noised column is clipped at zero. Columns that are not present in
    ``df`` are skipped, and all other columns are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    random_seed : int or None, default None
        When given, NumPy's global random state is seeded with this value
        before any noise is drawn, making the result reproducible on its own.
        When ``None`` the current global ``np.random`` state is used as-is.

    Returns
    -------
    pandas.DataFrame
        Noised copy of ``df`` with the same columns and index.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    df = df.copy()
    n_rows = len(df)

    # Column -> standard deviation of the multiplicative noise.
    noise_config = {
        "Price": 0.05,
        "Shipping costs": 0.1,
        "Manufacturing costs": 0.08,
        "Defect rates": 0.02
    }

    for col, noise_level in noise_config.items():
        if col in df.columns:
            df[col] = df[col] * (1 + np.random.normal(0, noise_level, size=n_rows))
            df[col] = df[col].clip(lower=0)  # Ensure no negative values

    # Columns that receive an additive integer offset.
    int_cols = [
        "Stock levels",
        "Order quantities",
        "Production volumes",
        "Shipping times",
        "Lead times"
    ]

    for col in int_cols:
        if col in df.columns:
            # randint's upper bound is exclusive, so offsets are -2..2.
            df[col] = (df[col] + np.random.randint(-2, 3, size=n_rows)).clip(lower=0)

    return df

In [14]:
# Expand to 4000 rows, perturb the values, then order the rows by Order_id.
df = (
    df.pipe(expand_data, target_rows=4000, random_seed=42)
      .pipe(add_noise)
      .sort_values(by='Order_id')
      .reset_index(drop=True)
)

## Feature Engineering

- Add a small amount of noise to the numeric columns so that replicated rows are no longer exact duplicates
- Replace Order_id with a generated UUID so that every row has a unique ID

In [15]:
import uuid


np.random.seed(42)

# Name-based (version 5) UUIDs derived from each row's position, so the same
# row count always produces the same IDs.
namespace = uuid.NAMESPACE_DNS
df["Order_id"] = list(
    map(str, (uuid.uuid5(namespace, str(position)) for position in range(len(df))))
)

numeric_cols = ["Costs", 'Manufacturing costs', 'Price', 'Revenue generated', 'Shipping costs']

for col in numeric_cols:
    # Multiplicative jitter centred on 1.0 with a standard deviation of 1%.
    noise = np.random.normal(loc=1.0, scale=0.01, size=len(df))
    df[col] = df[col].mul(noise)

In [16]:
# Show the first rows of df to spot-check the new Order_id values and the jittered numeric columns.
df.head()

Unnamed: 0,Order_id,Availability,Costs,Customer demographics,Defect rates,Inspection results,Lead time,Lead times,Location,Manufacturing costs,Manufacturing lead time,Number of products sold,Order quantities,Price,Product type,Production volumes,Revenue generated,Routes,Shipping carriers,Shipping costs,Shipping times,Stock levels,Supplier name,Transportation modes
0,6af613b6-569c-5c22-9c37-2ed93f31d3af,55,188.684667,Non-binary,0.224447,Pending,29,8,Mumbai,43.469393,29,802,98,71.933529,haircare,214,8763.759148,Route B,Carrier B,2.601292,6,59,Supplier 3,Road
1,b04965e6-a9bb-591f-8f8a-1adcb2c8dc39,55,187.492481,Non-binary,0.226086,Pending,29,8,Mumbai,51.332779,29,802,95,70.933955,haircare,213,8499.239523,Route B,Carrier B,3.220109,3,58,Supplier 3,Road
2,4b166dbe-d99d-5091-abdd-95b83330ed3a,55,188.968124,Non-binary,0.231951,Pending,29,9,Mumbai,47.1343,29,802,98,72.899584,haircare,213,8633.603195,Route B,Carrier B,3.257622,2,57,Supplier 3,Road
3,98123fde-012f-5ff3-8b50-881449dac91a,55,190.611596,Non-binary,0.233616,Pending,29,8,Mumbai,53.945541,29,802,98,65.259225,haircare,213,8658.388203,Route B,Carrier B,3.040814,6,59,Supplier 3,Road
4,6ed955c6-506a-5343-9be4-2c0afae02eef,55,187.312448,Non-binary,0.221073,Pending,29,8,Mumbai,40.762402,29,802,95,77.743599,haircare,216,8663.3748,Route B,Carrier B,2.703847,3,57,Supplier 3,Road


In [17]:
# Save the modified data to a new CSV
df.to_csv('../data/supply_chain_data_expanded.csv', index=False)


def _is_canonical_uuid(value):
    """Return True when ``value`` is a UUID in canonical 8-4-4-4-12 lowercase form.

    A length test alone would accept any 36-character string; parsing with
    ``uuid.UUID`` and comparing the round-tripped text rejects those.
    """
    text = str(value)
    try:
        return str(uuid.UUID(text)) == text
    except ValueError:
        # uuid.UUID raises ValueError for malformed input.
        return False


print("✅ Data berhasil disimpan ke: supply_chain_data_expanded.csv")
print(f"Total baris: {len(df)}")
print(f"Semua Order_id adalah UUID valid: {df['Order_id'].apply(_is_canonical_uuid).all()}")

✅ Data berhasil disimpan ke: supply_chain_data_expanded.csv
Total baris: 4000
Semua Order_id adalah UUID valid: True
