In [None]:
import pandas as pd
import numpy as np

In [None]:
def read_and_prepare_data(input_file):
    """
    Loads the extracted quotes CSV into a DataFrame.

    The file is parsed as comma-separated, double-quote-quoted UTF-8 text in
    which a comma is also used as the decimal mark of numeric fields.

    Args:
        input_file (str): Location of the CSV, handed straight to pd.read_csv.

    Returns:
        pd.DataFrame: The parsed file contents.
    """
    csv_options = {
        "sep": ",",
        "quotechar": '"',
        "decimal": ",",
        "encoding": "utf-8",
    }
    return pd.read_csv(input_file, **csv_options)


In [None]:
def extract_numeric_ranges(df):
    """
    Extracts and converts numeric columns from the input DataFrame.

    Length, weight and price are cast to float and the order quantity to int.
    The raw material column is treated as free text: the first number found in
    each cell is used, a decimal comma is accepted, and cells without a
    parsable number are dropped.

    Args:
        df (pd.DataFrame): Input DataFrame with relevant numeric columns.

    Returns:
        tuple: Series for length, weight, order quantity, price, and raw material price.
            The raw material Series can be shorter than the others because
            unparsable cells are removed from it.
    """
    length_values = df["Längd (m)"].astype(float)
    weight_values = df["Vikt (kg/m)"].astype(float)
    order_values = df["ca antal (Årsvolym st)"].astype(int)
    price_values = df["Pris (kr/st) SEK"].astype(float)

    raw_str = df["Råvara (euro/kg)"].astype(str)
    # Require at least one digit so that stray punctuation in the text
    # (e.g. the "." in "ca. 2,5 euro/kg") is not captured as the "number",
    # which would turn a valid price into NaN and silently drop it.
    raw_num_str = raw_str.str.extract(r'([.,]?\d+(?:[.,]\d+)*)')[0]
    # Literal replacement: normalise a decimal comma to a decimal point.
    raw_num_str = raw_num_str.str.replace(",", ".", regex=False)
    # Anything that still is not a number becomes NaN and is discarded.
    raw_price_values = pd.to_numeric(raw_num_str, errors='coerce').dropna()
    return length_values, weight_values, order_values, price_values, raw_price_values


In [None]:
def compute_extended_ranges(length_values, weight_values, order_values, price_values, raw_price_values):
    """
    Widens the observed range of every numeric feature for data simulation.

    Each lower bound is the observed minimum scaled by 0.85 and clamped at 0;
    each upper bound is the observed maximum scaled by 1.15. The order
    quantity bounds are additionally rounded outwards to whole numbers.

    Args:
        length_values (pd.Series): Length values.
        weight_values (pd.Series): Weight values.
        order_values (pd.Series): Order quantity values.
        price_values (pd.Series): Price values.
        raw_price_values (pd.Series): Raw material price values.

    Returns:
        tuple: (min, max) pairs, flattened, in the order length, weight,
            order quantity, price, raw material price.
    """
    def _widen(series):
        # Continuous feature: shrink the minimum (never below 0), grow the maximum.
        return max(0, series.min() * 0.85), series.max() * 1.15

    length_lo, length_hi = _widen(length_values)
    weight_lo, weight_hi = _widen(weight_values)

    # Order quantities are counts, so round outwards to integers.
    order_lo = max(0, int(np.floor(order_values.min() * 0.85)))
    order_hi = int(np.ceil(order_values.max() * 1.15))

    price_lo, price_hi = _widen(price_values)
    raw_lo, raw_hi = _widen(raw_price_values)

    return (
        length_lo, length_hi,
        weight_lo, weight_hi,
        order_lo, order_hi,
        price_lo, price_hi,
        raw_lo, raw_hi,
    )


In [None]:
def prepare_categorical_options(df):
    """
    Collects the categorical value pools used by the data simulation.

    Alloys, finishes, GD&T levels and customer categories are fixed lists;
    profile names are the distinct values of the input's 'Profil_namn' column
    in order of first appearance.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'Profil_namn' column.

    Returns:
        tuple: Lists of alloy options, finish options, GD&T levels, customer categories, and unique profile names.
    """
    alloys = [
        'Iron', 'Aluminium', 'Copper', 'Nickel', 'Titanium',
        'Zinc', 'Steel', 'Brass', 'Lead', 'Tin',
    ]
    finishes = [
        'Powder coated', 'Wet painted', 'Electroplated', 'Anodized',
        'Polished', 'Brushed', 'Chrome plated', 'Plasma sprayed',
        'Phosphated', 'Hot-dip galvanized', 'Blackening',
    ]
    gd_t_levels = ['low', 'medium', 'high']
    customer_sizes = ['micro', 'small', 'medium', 'large']

    distinct_profiles = df["Profil_namn"].unique()
    return alloys, finishes, gd_t_levels, customer_sizes, distinct_profiles.tolist()


In [None]:
def generate_simulated_data(num_rows, ext_ranges, categorical_options):
    """
    Generates a synthetic dataset of quotes using provided ranges and categorical options.

    Categorical columns are drawn uniformly from their option lists and numeric
    columns uniformly from their extended ranges. The quote price is not random:
    it falls linearly from the maximum price at the smallest order quantity to
    the minimum price at the largest. Draws come from NumPy's global random
    state, so seed it beforehand for reproducible output.

    Args:
        num_rows (int): Number of rows to generate.
        ext_ranges (tuple): Extended min and max values for numeric features, in the
            order length, weight, order quantity, price, raw material price.
        categorical_options (tuple): Lists of categorical options, in the order
            alloys, finishes, GD&T levels, customer categories, profile names.

    Returns:
        pd.DataFrame: Simulated dataset as a DataFrame.
    """
    (ext_min_length, ext_max_length, ext_min_weight, ext_max_weight,
     ext_min_order, ext_max_order, ext_min_price, ext_max_price, ext_min_raw, ext_max_raw) = ext_ranges
    alloy_options, finish_options, gd_t_options, customer_cats, profile_options = categorical_options

    # NOTE: the order of the random draws below is part of the seeded output;
    # reordering them changes the generated dataset for a given seed.
    alloy_col = np.random.choice(alloy_options, size=num_rows)
    finish_col = np.random.choice(finish_options, size=num_rows)
    length_col = np.round(np.random.uniform(ext_min_length, ext_max_length, size=num_rows), 1)
    weight_col = np.round(np.random.uniform(ext_min_weight, ext_max_weight, size=num_rows), 3)
    profile_col = np.random.choice(profile_options, size=num_rows)
    tolerance_col = np.round(np.random.uniform(0.05, 0.20, size=num_rows), 3)
    gd_t_col = np.random.choice(gd_t_options, size=num_rows)
    # randint's upper bound is exclusive, hence the + 1 to include ext_max_order.
    order_col = np.random.randint(ext_min_order, ext_max_order + 1, size=num_rows)
    lme_col = np.round(np.random.uniform(ext_min_raw, ext_max_raw, size=num_rows), 2)
    customer_col = np.random.choice(customer_cats, size=num_rows)
    lead_time_col = np.random.randint(2, 13, size=num_rows)

    # Position of each order within the order range (0 = smallest, 1 = largest).
    # With a zero-width range every order sits at the minimum; without this
    # guard the division would be 0/0 and every price would come out as NaN.
    order_span = ext_max_order - ext_min_order
    if order_span > 0:
        order_fraction = (order_col.astype(float) - ext_min_order) / order_span
    else:
        order_fraction = np.zeros(num_rows)
    price_col = ext_max_price - order_fraction * (ext_max_price - ext_min_price)
    price_col = np.round(price_col, 2)

    date_range = pd.date_range(start="2025-01-01", end="2025-12-31")
    date_col = pd.to_datetime(np.random.choice(date_range, size=num_rows, replace=True)).date

    simulated_df = pd.DataFrame({
        "Alloy": alloy_col,
        "Finish": finish_col,
        "Length": length_col,
        "Weight": weight_col,
        "Profile Name": profile_col,
        "Tolerances": tolerance_col,
        "GD&T": gd_t_col,
        "Order Quantity": order_col,
        "LME price": lme_col,
        "Customer Category": customer_col,
        "Lead Time (weeks)": lead_time_col,
        "Quote Price (SEK)": price_col,
        "Quote Date": date_col
    })
    return simulated_df


In [None]:
def save_simulated_data(simulated_df, output_file):
    """
    Writes the simulated quotes to CSV and reports how many rows were saved.

    The DataFrame index is not written. A one-line confirmation is printed to
    standard output after the write.

    Args:
        simulated_df (pd.DataFrame): The simulated dataset.
        output_file (str): Destination of the CSV, handed straight to DataFrame.to_csv.

    Returns:
        None
    """
    simulated_df.to_csv(output_file, index=False)
    confirmation = f"Synthetic dataset of {len(simulated_df)} quotes saved to {output_file}"
    print(confirmation)


In [None]:
def main(input_file="all_quotes_extracted.csv",
         output_file="simulated_quotes_dataset.csv",
         num_rows=1000,
         seed=42):
    """
    Main function to run the data simulation workflow.

    Loads input data, extracts numeric and categorical ranges, generates simulated data,
    and saves the result to a CSV file. Calling main() with no arguments uses the
    default file names, row count and seed.

    Args:
        input_file (str): Path of the CSV with the extracted quotes to learn ranges from.
        output_file (str): Path of the CSV the simulated quotes are written to.
        num_rows (int): Number of simulated quotes to generate.
        seed (int): Seed for NumPy's global random state, fixed so that repeated
            runs produce the same dataset.

    Returns:
        None
    """
    # Seed before any random draw so the generated dataset is reproducible.
    np.random.seed(seed)

    df = read_and_prepare_data(input_file)
    length_values, weight_values, order_values, price_values, raw_price_values = extract_numeric_ranges(df)
    ext_ranges = compute_extended_ranges(length_values, weight_values, order_values, price_values, raw_price_values)
    categorical_options = prepare_categorical_options(df)
    simulated_df = generate_simulated_data(num_rows, ext_ranges, categorical_options)
    save_simulated_data(simulated_df, output_file)

# Run the simulation when this file is executed directly (script or notebook
# cell); importing it as a module only defines the functions.
if __name__ == "__main__":
    main()