In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import pandas as pd

In [2]:
# Impute missing "Lead Time (weeks)" values with a random forest trained on
# the rows where the lead time is known.
TARGET_COL = "Lead Time (weeks)"

df = pd.read_csv("quotes_dataset_messy.csv")  # Load your dataset

# Compute the null mask once so the rows selected for prediction are exactly
# the rows written back to below.
missing_mask = df[TARGET_COL].isnull()
df_lead_train = df[~missing_mask]
df_lead_missing = df[missing_mask]

# Predictor columns; the target itself is excluded.
features = ["Alloy", "Finish", "Length", "Weight", "Profile Name", "Tolerances", "GD&T",
            "Order Quantity", "LME price", "Customer Category", "Quote Price (SEK)"]

# Columns that go through the ordinal encoder. Kept in one place so that fit
# and transform can never drift apart.
# NOTE(review): "Tolerances" is passed to the model unencoded - presumably it
# is already numeric; confirm against the dataset.
CATEGORICAL_FEATURES = ["Alloy", "Finish", "Profile Name", "GD&T", "Customer Category"]

if df_lead_train.empty:
    # Fail early with a readable message instead of an opaque estimator error.
    raise ValueError(f"No rows with a known '{TARGET_COL}' to train the imputation model on.")

# Categories not seen during fit are encoded as -1 at transform time.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train = df_lead_train[features].copy()
X_missing = df_lead_missing[features].copy()

X_train[CATEGORICAL_FEATURES] = encoder.fit_transform(X_train[CATEGORICAL_FEATURES])
y_train = df_lead_train[TARGET_COL]

# Train a random forest regressor (fixed seed for reproducible imputations).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and impute. scikit-learn rejects zero-row inputs, so skip the
# transform/predict step entirely when there is nothing to fill in.
if missing_mask.any():
    X_missing[CATEGORICAL_FEATURES] = encoder.transform(X_missing[CATEGORICAL_FEATURES])
    predicted_lead_time = model.predict(X_missing)
    df.loc[missing_mask, TARGET_COL] = predicted_lead_time
else:
    predicted_lead_time = np.empty(0)

# Confirm no missing values remain; the count is also the cell's displayed output.
missing_after = df[TARGET_COL].isnull().sum()
assert missing_after == 0, f"{missing_after} missing '{TARGET_COL}' values remain after imputation"
missing_after


np.int64(0)

In [3]:
# Persist the imputed frame to disk. index=False keeps the DataFrame's row
# index out of the written CSV.
output_file = "quotes_dataset_imputation_done.csv"
df.to_csv(path_or_buf=output_file, index=False)