In [1]:
import numpy as np
import pandas as pd

In [2]:
# Utility functions for injecting messiness

def inject_missing_lead_time(df, frac=0.05, col='Lead_Time_weeks', random_state=None):
    """Blank out a random fraction of one column by writing NaN into it.

    The frame is modified in place and the same object is returned, so the
    call works both for its side effect and in an assignment chain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; must contain ``col``.
    frac : float, default 0.05
        Fraction of rows to blank, forwarded to ``DataFrame.sample``.
    col : str, default 'Lead_Time_weeks'
        Column that receives the NaN values.
    random_state : optional
        Forwarded to ``DataFrame.sample``. ``None`` (the default) keeps the
        original unseeded behaviour; pass an int for a reproducible selection.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after modification.
    """
    picked = df.sample(frac=frac, random_state=random_state).index

    # A NumPy integer column cannot hold NaN. Upcast explicitly instead of
    # relying on implicit upcasting during assignment, which newer pandas
    # versions deprecate.
    column_dtype = df[col].dtype
    if len(picked) and isinstance(column_dtype, np.dtype) and column_dtype.kind in 'iu':
        df[col] = df[col].astype('float64')

    df.loc[picked, col] = np.nan
    return df

def inject_extreme_weights(df, frac=0.01, col='Weight_kg_per_m', multiplier=4, random_state=None):
    """Overwrite a random fraction of one column with an outlier value.

    Every selected row receives the same value: the column mean (computed
    before any row is overwritten) times ``multiplier``. The frame is
    modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; must contain a numeric ``col``.
    frac : float, default 0.01
        Fraction of rows to overwrite, forwarded to ``DataFrame.sample``.
    col : str, default 'Weight_kg_per_m'
        Column that receives the outlier value.
    multiplier : number, default 4
        Factor applied to the column mean to build the outlier value.
    random_state : optional
        Forwarded to ``DataFrame.sample``. ``None`` (the default) keeps the
        original unseeded behaviour; pass an int for a reproducible selection.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after modification.
    """
    picked = df.sample(frac=frac, random_state=random_state).index
    # Taken from the still-clean column so the outliers do not inflate the mean.
    extreme_value = df[col].mean() * multiplier
    df.loc[picked, col] = extreme_value
    return df

def inject_profile_name_noise(df, frac=0.07, col='Profile_Name', random_state=None):
    """Replace a random fraction of one column with randomly drawn existing values.

    ``int(frac * len(df))`` distinct rows are selected, and each one receives
    a value drawn uniformly from the column's unique values. The draw may
    return the value the row already holds, so the number of rows that
    actually change can be smaller than the number selected. The frame is
    modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; must contain ``col``.
    frac : float, default 0.07
        Fraction of rows to overwrite.
    col : str, default 'Profile_Name'
        Column whose values are resampled.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. ``None`` (the
        default) draws from NumPy's global random state, as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after modification.
    """
    n_noisy = int(frac * len(df))
    if n_noisy == 0:
        # Nothing to corrupt; this also avoids sampling from an empty index.
        return df

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    unique_profiles = df[col].unique()
    picked = rng.choice(df.index, size=n_noisy, replace=False)
    # One vectorised draw and assignment instead of a per-row .at loop.
    df.loc[picked, col] = rng.choice(unique_profiles, size=n_noisy)
    return df

def inject_tolerance_noise(df, col='Tolerances', loc=0, scale=0.007, min_val=0.05, max_val=0.2,
                           random_state=None):
    """Add Gaussian noise to every value of one column, then clip the result.

    One noise sample per row is drawn from ``Normal(loc, scale)`` and added
    to ``col``; the sums are clipped to ``[min_val, max_val]``. The frame is
    modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; must contain a numeric ``col``.
    col : str, default 'Tolerances'
        Column that receives the noise.
    loc, scale : float
        Mean and standard deviation of the noise distribution.
    min_val, max_val : float
        Lower and upper clipping bounds applied after the noise is added.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. ``None`` (the
        default) draws from NumPy's global random state, as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after modification.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    tolerance_noise = rng.normal(loc=loc, scale=scale, size=len(df))
    noisy = df[col] + tolerance_noise
    df[col] = noisy.clip(min_val, max_val)
    return df

def swap_gdt_values(df, frac=0.07, col='GD_T', values=None, random_state=None):
    """Replace a random fraction of one categorical column with a different label.

    ``int(frac * len(df))`` distinct rows are selected. Each selected row
    receives a label drawn uniformly from ``values`` excluding the label it
    currently holds. A row is left untouched when ``values`` offers no
    alternative to its current label. The frame is modified in place and the
    same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; must contain ``col``.
    frac : float, default 0.07
        Fraction of rows to swap.
    col : str, default 'GD_T'
        Column whose labels are swapped.
    values : list, optional
        Pool of labels to draw from. Defaults to ``["low", "medium", "high"]``.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. ``None`` (the
        default) draws from NumPy's global random state, as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after modification.
    """
    if values is None:
        values = ["low", "medium", "high"]

    n_swaps = int(frac * len(df))
    if n_swaps == 0:
        # Nothing to swap; this also avoids sampling from an empty index.
        return df

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    for idx in rng.choice(df.index, size=n_swaps, replace=False):
        current = df.at[idx, col]
        alternatives = [val for val in values if val != current]
        if not alternatives:
            # Choosing from an empty list would raise; keep the row as it is.
            continue
        df.at[idx, col] = rng.choice(alternatives)
    return df

In [3]:
# Main workflow

def main(input_file="simulated_quotes_dataset.csv", output_file="quotes_dataset_messy.csv", seed=None):
    """Read the clean dataset, apply every injector in turn, and write the result.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to read.
    output_file : str
        Path of the CSV file to write (without the index column).
    seed : int, optional
        When given, NumPy's global random state is seeded with it before any
        injector runs, which makes the written file reproducible. Note that
        this resets the global state for the rest of the session. ``None``
        (the default) leaves the global state untouched.

    Returns
    -------
    str
        ``output_file``, so the caller can pass it on to later steps.
    """
    if seed is not None:
        # The injectors draw from the global NumPy RNG (directly, or through
        # DataFrame.sample without a random_state), so seeding it here covers
        # the whole pipeline.
        np.random.seed(seed)

    df = pd.read_csv(input_file)
    # The order matters for reproducibility: each step consumes random numbers.
    pipeline = (
        inject_missing_lead_time,
        inject_extreme_weights,
        inject_profile_name_noise,
        inject_tolerance_noise,
        swap_gdt_values,
    )
    for inject in pipeline:
        df = inject(df)
    df.to_csv(output_file, index=False)
    return output_file

# Run the messiness injection.
# Seed NumPy's global RNG first: the injectors draw from it (directly and via
# DataFrame.sample), so an unseeded run writes a different file every time.
np.random.seed(42)
messy_file = main()

In [4]:
# Quick checks and summary

def describe_and_nulls(file_path):
    """Print summary statistics and per-column missing-value counts for a CSV file.

    The file at ``file_path`` is loaded with ``pandas.read_csv``. The output
    is the ``describe()`` table, a blank line, a heading, and the number of
    null entries in each column. Nothing is returned.
    """
    frame = pd.read_csv(file_path)
    summary_stats = frame.describe()
    null_counts = frame.isna().sum()
    # One print call, newline-separated, yields the same text as three prints.
    print(summary_stats, "\nMissing values per column:", null_counts, sep="\n")

# Summarise the file that main() actually wrote rather than repeating its
# default path, so the two cannot drift apart.
describe_and_nulls(messy_file)

          Length_m  Weight_kg_per_m   Tolerances  Order_Quantity  \
count  1000.000000      1000.000000  1000.000000     1000.000000   
mean     27.300900         1.330036     0.123740    97114.012000   
std       6.638435         0.467476     0.042904    38319.355706   
min      15.800000         0.837000     0.050000    32328.000000   
25%      21.700000         1.065000     0.086846    63302.250000   
50%      27.400000         1.299500     0.124830    98105.500000   
75%      33.000000         1.523250     0.161106   131545.750000   
max      38.800000         5.173416     0.200000   160992.000000   

       LME_Price_EUR  Lead_Time_weeks  Quote_Price_SEK  
count    1000.000000       950.000000      1000.000000  
mean        3.355440         7.188421         3.016820  
std         0.453297         3.137064         0.677761  
min         2.550000         2.000000         1.890000  
25%         2.977500         5.000000         2.410000  
50%         3.370000         7.000000        