In [None]:
import numpy as np
import pandas as pd

In [None]:
# Utility functions for injecting messiness

def inject_missing_lead_time(df, frac=0.05, col='Lead Time (weeks)', random_state=None):
    """Set a randomly chosen fraction of the values in ``col`` to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place and also returned.
    frac : float, default 0.05
        Fraction of rows to blank out; forwarded to ``DataFrame.sample``.
    col : str, default 'Lead Time (weeks)'
        Name of the column that receives the NaN values.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample``. ``None`` keeps the original
        behaviour of drawing from NumPy's global random state; pass an int
        for a repeatable selection.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    picked = df.sample(frac=frac, random_state=random_state).index
    column_dtype = df[col].dtype
    # A NumPy integer column cannot hold NaN. Writing NaN through .loc would
    # rely on an implicit upcast, which recent pandas versions deprecate, so
    # convert to float explicitly -- but only when something will be written.
    if len(picked) and isinstance(column_dtype, np.dtype) and column_dtype.kind in 'iu':
        df[col] = df[col].astype('float64')
    df.loc[picked, col] = np.nan
    return df

def inject_extreme_weights(df, frac=0.01, col='Weight', multiplier=4, random_state=None):
    """Overwrite a random fraction of ``col`` with one exaggerated value.

    Every selected row receives the same value: the mean of ``col`` (computed
    before any row is changed) multiplied by ``multiplier``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place and also returned.
    frac : float, default 0.01
        Fraction of rows to overwrite; forwarded to ``DataFrame.sample``.
    col : str, default 'Weight'
        Name of the numeric column to corrupt.
    multiplier : number, default 4
        Factor applied to the column mean to build the injected value.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample``. ``None`` keeps the original
        behaviour of drawing from NumPy's global random state; pass an int
        for a repeatable selection.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Compute the replacement from the untouched column so the injected rows
    # do not influence their own value.
    extreme_value = df[col].mean() * multiplier
    picked = df.sample(frac=frac, random_state=random_state).index
    df.loc[picked, col] = extreme_value
    return df

def inject_profile_name_noise(df, frac=0.07, col='Profile Name', random_state=None):
    """Replace a random fraction of ``col`` with values drawn from the column itself.

    ``int(frac * len(df))`` distinct rows are chosen, and each one is assigned
    a value picked uniformly from the unique values already present in
    ``col``. A row may be assigned the value it already had, so the number of
    visibly changed rows can be smaller than the number of selected rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place and also returned.
    frac : float, default 0.07
        Fraction of rows to select for replacement.
    col : str, default 'Profile Name'
        Name of the column to add noise to.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. ``None`` keeps the
        original behaviour of drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    unique_profiles = df[col].unique()
    n_noisy = int(frac * len(df))
    if n_noisy == 0:
        return df
    # Select rows by position so the write below is unambiguous even when the
    # index contains duplicate labels.
    positions = rng.choice(len(df), size=n_noisy, replace=False)
    replacements = rng.choice(unique_profiles, size=n_noisy)
    # One vectorised assignment instead of a Python-level loop over df.at.
    df.iloc[positions, df.columns.get_loc(col)] = replacements
    return df

def inject_tolerance_noise(df, col='Tolerances', loc=0, scale=0.007, min_val=0.05, max_val=0.2, random_state=None):
    """Add Gaussian noise to every value in ``col`` and clip the result.

    One noise sample is drawn per row. After the noise is added, the whole
    column is clipped to ``[min_val, max_val]``, so values that were already
    outside that range before the call are pulled inside it as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place and also returned.
    col : str, default 'Tolerances'
        Name of the numeric column to perturb.
    loc : float, default 0
        Mean of the normal distribution the noise is drawn from.
    scale : float, default 0.007
        Standard deviation of that distribution.
    min_val, max_val : float, default 0.05 and 0.2
        Lower and upper bounds applied with ``Series.clip``.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. ``None`` keeps the
        original behaviour of drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    noise = rng.normal(loc=loc, scale=scale, size=len(df))
    noisy_values = df[col] + noise
    df[col] = noisy_values.clip(min_val, max_val)
    return df

def swap_gdt_values(df, frac=0.07, col='GD&T', values=None, random_state=None):
    """Replace a random fraction of ``col`` with a different allowed value.

    ``int(frac * len(df))`` distinct index labels are chosen. For each one the
    current value is replaced by a value picked uniformly from the entries of
    ``values`` that differ from it. If ``values`` offers no alternative for a
    row, that row is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place and also returned.
    frac : float, default 0.07
        Fraction of rows to select for swapping.
    col : str, default 'GD&T'
        Name of the categorical column to corrupt.
    values : list or None, default None
        Allowed replacement values; ``None`` means
        ``["low", "medium", "high"]``.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. ``None`` keeps the
        original behaviour of drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if values is None:
        values = ["low", "medium", "high"]
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_swaps = int(frac * len(df))
    chosen_labels = rng.choice(df.index, size=n_swaps, replace=False)
    for label in chosen_labels:
        current = df.at[label, col]
        alternatives = [candidate for candidate in values if candidate != current]
        if not alternatives:
            # choice() raises on an empty sequence; with nothing to swap to,
            # keep the existing value instead of crashing.
            continue
        df.at[label, col] = rng.choice(alternatives)
    return df

In [None]:
# Main workflow

def main(input_file="simulated_quotes_dataset.csv", output_file="quotes_dataset_messy.csv", seed=None):
    """Read a CSV, apply every messiness injector in turn, and write the result.

    Parameters
    ----------
    input_file : str, default "simulated_quotes_dataset.csv"
        Path of the CSV file to read.
    output_file : str, default "quotes_dataset_messy.csv"
        Path the modified data is written to (without the index column).
    seed : int or None, default None
        When given, NumPy's global random state is seeded with it before any
        injector runs so that repeated calls produce the same output file.
        ``None`` leaves the global random state untouched.

    Returns
    -------
    str
        ``output_file``, unchanged.
    """
    if seed is not None:
        # The injectors draw from NumPy's global random state (directly via
        # np.random, or through DataFrame.sample's default), so seeding it
        # once here makes the whole pipeline repeatable.
        np.random.seed(seed)

    df = pd.read_csv(input_file)
    # The order matters for reproducibility: each step consumes random numbers.
    pipeline = (
        inject_missing_lead_time,
        inject_extreme_weights,
        inject_profile_name_noise,
        inject_tolerance_noise,
        swap_gdt_values,
    )
    for inject in pipeline:
        df = inject(df)
    df.to_csv(output_file, index=False)
    return output_file

# Run the messiness injection with main()'s default input and output paths;
# messy_file holds the output path that main() returns.
messy_file = main()

In [None]:
# Quick checks and summary

def describe_and_nulls(file_path):
    """Print summary statistics and per-column missing-value counts for a CSV.

    The file at ``file_path`` is read with ``pandas.read_csv``. The output is
    the ``describe()`` table, a blank line, a heading, and then the number of
    null entries in each column. Nothing is returned.
    """
    loaded = pd.read_csv(file_path)
    summary_stats = loaded.describe()
    null_counts = loaded.isna().sum()
    # The heading carries its own leading newline, which yields the blank
    # line between the two sections.
    print(summary_stats, "\nMissing values per column:", null_counts, sep="\n")

# Inspect the file main() actually wrote (its returned path) instead of
# repeating the default output name, which would go stale if it changes.
describe_and_nulls(messy_file)