In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import pandas as pd

In [2]:
# Read the simulated quotes dataset from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="simulated_dataset.csv")

In [3]:
# Summary statistics of the numeric columns; a `count` below the row total
# indicates a column with missing values.
df.describe()

Unnamed: 0,Length_m,Weight_kg_m,Tolerances,Order_Quantity,LME_Price_EUR,Lead_Time_weeks,Quote_Price_SEK
count,1000.0,1000.0,1000.0,1000.0,1000.0,950.0,1000.0
mean,27.3317,1.329647,0.122532,96386.388,3.34707,6.996842,3.02954
std,6.703232,0.466752,0.04387,36285.80374,0.448859,3.200861,0.641656
min,15.8,0.837,0.05,32328.0,2.55,2.0,1.89
25%,21.3,1.082,0.085,65393.5,2.97,4.0,2.48
50%,27.55,1.294,0.1205,96735.0,3.36,7.0,3.02
75%,33.1,1.521,0.16125,127452.0,3.72,10.0,3.58
max,38.9,5.166144,0.2,160864.0,4.14,12.0,4.16


In [5]:
# Impute missing "Lead Time (weeks)" values with a random-forest regressor that is
# trained on the rows where the lead time is known.
df = pd.read_csv("quotes_dataset_messy.csv")  # raw quotes; lead time may be missing

target_col = "Lead Time (weeks)"

# Compute the missing-value mask once so that the train/predict split and the final
# write-back are guaranteed to address exactly the same rows.
missing_mask = df[target_col].isnull()
df_lead_train = df[~missing_mask]
df_lead_missing = df[missing_mask]

# Predictor columns (the target itself is excluded).
# NOTE(review): "Quote Price (SEK)" is used as a predictor here -- confirm this is
# intended and is not considered leakage for whatever is modelled downstream.
features = ["Alloy", "Finish", "Length", "Weight", "Profile Name", "Tolerances", "GD&T",
            "Order Quantity", "LME price", "Customer Category", "Quote Price (SEK)"]

# Subset of `features` that is ordinal-encoded before being passed to the model.
categorical_cols = ["Alloy", "Finish", "Profile Name", "GD&T", "Customer Category"]

# Categories that were not seen while fitting are encoded as -1 instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train = df_lead_train[features].copy()
X_missing = df_lead_missing[features].copy()

# Fit the encoder on the training rows only; the rows to impute are merely transformed.
X_train[categorical_cols] = encoder.fit_transform(X_train[categorical_cols])

y_train = df_lead_train[target_col]

# Train a random forest regressor (fixed seed for reproducible imputations)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and impute. scikit-learn rejects 0-row inputs, so skip the transform/predict
# step when nothing is missing (e.g. when the cell is pointed at an already-imputed file).
if len(X_missing) > 0:
    X_missing[categorical_cols] = encoder.transform(X_missing[categorical_cols])
    predicted_lead_time = model.predict(X_missing)
    df.loc[missing_mask, target_col] = predicted_lead_time
else:
    predicted_lead_time = np.empty(0, dtype=float)

# Confirm no missing values remain
missing_after = df[target_col].isnull().sum()
missing_after


np.int64(0)

In [6]:
# Persist the imputed frame as CSV; the DataFrame index is not written.
output_file = "quotes_dataset_imputation_done.csv"
df.to_csv(path_or_buf=output_file, index=False)