In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so every run reproduces the same dataset
np.random.seed(42)

# Simulation dimensions and the categories a lab can be assigned to
n_labs = 100
n_months = 12
lab_types = ['Research', 'Industrial', 'Pharmaceutical']

# Lab identifiers (Lab_1 .. Lab_100) and month-start timestamps from 2024-06-01
labs = [f"Lab_{number}" for number in range(1, n_labs + 1)]
months = pd.date_range(start="2024-06-01", periods=n_months, freq='MS')

# Output schema; the order matches the values returned by _simulate_row
column_names = [
    "lab_id", "lab_type", "month", "avg_usage_hours_per_day", "contamination_level",
    "num_filters_installed", "filter_age_avg", "service_calls_last_month",
    "compliance_score", "filter_failures", "stock_available", "replacements_next_month",
]


def _simulate_row(lab, lab_type, month):
    """Draw one lab-month observation and return it as a list of column values.

    The random draws happen in a fixed order (usage, contamination, filters
    installed, filter age, service calls, compliance, failures, stock, noise).
    Reordering them would change the values produced for a given seed.
    """
    usage_hours = np.round(np.random.uniform(4, 16), 1)       # uniform on [4, 16), 1 decimal
    contamination = np.round(np.random.uniform(0.1, 1.0), 2)  # uniform on [0.1, 1.0), 2 decimals
    filters_installed = np.random.randint(2, 10)              # integer 2..9
    filter_age = np.random.randint(1, 12)                     # integer 1..11
    service_calls = np.random.poisson(1)
    compliance = np.round(np.random.uniform(70, 100), 1)
    failures = np.random.poisson(0.5)
    stock = np.random.randint(0, 15)                          # integer 0..14

    # Linear signal for the target: a constant of 1, pushed up by usage,
    # contamination and failures, and pulled down by stock on hand.
    usage_term = usage_hours * 0.4
    contamination_term = contamination * 8
    failure_term = failures * 2
    stock_term = stock * 0.2
    signal = 1 + usage_term + contamination_term + failure_term - stock_term

    # Add Gaussian noise (mean 0, std 2), round to a whole count, floor at zero
    noisy_signal = signal + np.random.normal(0, 2)
    replacements = max(0, int(np.round(noisy_signal)))

    return [
        lab, lab_type, month, usage_hours, contamination,
        filters_installed, filter_age, service_calls,
        compliance, failures, stock, replacements,
    ]


# One row per (lab, month); the lab type is drawn once per lab, before its months
data = []
for lab in labs:
    lab_type = np.random.choice(lab_types)
    data.extend(_simulate_row(lab, lab_type, month) for month in months)

# Assemble the rows into a DataFrame with the declared schema
df = pd.DataFrame(data, columns=column_names)

# Show the first few rows
print(df.head())

# Persist the dataset without the index column
df.to_csv("simulated_lab_filter_data.csv", index=False)
print("\n✅ Dataset saved as 'simulated_lab_filter_data.csv'")


  lab_id        lab_type      month  avg_usage_hours_per_day  \
0  Lab_1  Pharmaceutical 2024-06-01                     13.6   
1  Lab_1  Pharmaceutical 2024-07-01                     11.8   
2  Lab_1  Pharmaceutical 2024-08-01                      9.2   
3  Lab_1  Pharmaceutical 2024-09-01                     10.2   
4  Lab_1  Pharmaceutical 2024-10-01                      6.8   

   contamination_level  num_filters_installed  filter_age_avg  \
0                 0.27                      9               5   
1                 0.15                      9               6   
2                 0.36                      4              10   
3                 0.63                      4               5   
4                 0.32                      5               7   

   service_calls_last_month  compliance_score  filter_failures  \
0                         0              74.7                0   
1                         0              99.8                1   
2                         