In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [2]:
# Simulating 5 different machines
# Seed the stdlib generator so Restart & Run All rebuilds the same synthetic
# fleet; every later `random.*` call in the notebook continues this stream.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

num_machines = 5

# One metadata record per machine. Values are evaluated in key order, so the
# sequence of random draws is: component, manufacturer, model, install offset,
# maintenance policy.
machines = [
    {
        "machine_id": f"MCH_{i+1}",
        "component_type": random.choice(["Motor", "Pump", "Compressor"]),
        "manufacturer": random.choice(["ABB", "Siemens", "GE", "Bosch"]),
        "model_number": f"MDL_{random.randint(1000, 9999)}",
        # Install date falls on a random day of 2020 (offset 0..365 days from Jan 1)
        "install_date": datetime(2020, 1, 1) + timedelta(days=random.randint(0, 365)),
        "rated_temp_range": "0-100C",
        "rated_load_range": "0-2000RPM",
        "maintenance_policy": random.choice(["Monthly", "Quarterly"])
    }
    for i in range(num_machines)
]

asset_metadata = pd.DataFrame(machines)

In [3]:
# 1 data point per minute for 7 days
minutes = 60 * 24 * 7
timestamps = [datetime(2023, 1, 1) + timedelta(minutes=i) for i in range(minutes)]

# Dedicated seeded generator: re-running this cell always produces the same
# readings, independent of any other random state in the kernel.
SENSOR_SEED = 42
rng = np.random.default_rng(SENSOR_SEED)

machine_ids = np.array([machine["machine_id"] for machine in machines], dtype=object)
num_rows = len(timestamps) * len(machine_ids)

# Row layout is "for each timestamp, for each machine": every timestamp is
# repeated once per machine while the machine ids cycle underneath it.
row_timestamps = pd.DatetimeIndex(timestamps).repeat(len(machine_ids))

sensor_columns = [
    "machine_id", "timestamp", "rpm", "torque", "vibration", "motor_temp",
    "acoustic", "voltage", "current", "oil_temp", "pressure", "energy_consumption",
    "ambient_temp", "humidity", "air_quality", "corrosive_gas",
    "is_weekend", "shift", "idle_active_ratio"
]

# Each column is drawn in a single vectorised call instead of one scalar draw
# per row; the (mean, std) / (low, high) parameters are given per column.
sensor_df = pd.DataFrame({
    "machine_id": np.tile(machine_ids, len(timestamps)),
    "timestamp": row_timestamps,

    # Simulated sensors
    "rpm": rng.normal(1500, 100, num_rows),
    "torque": rng.normal(300, 30, num_rows),
    "vibration": rng.normal(0.02, 0.005, num_rows),
    "motor_temp": rng.normal(60, 5, num_rows),
    "acoustic": rng.normal(40, 5, num_rows),
    "voltage": rng.normal(400, 20, num_rows),
    "current": rng.normal(10, 1, num_rows),
    "oil_temp": rng.normal(70, 4, num_rows),
    "pressure": rng.normal(100, 10, num_rows),
    "energy_consumption": rng.normal(5.5, 0.3, num_rows),

    # Environmental
    "ambient_temp": rng.normal(25, 3, num_rows),
    "humidity": rng.uniform(30, 70, num_rows),
    "air_quality": rng.normal(10, 2, num_rows),
    "corrosive_gas": rng.normal(1, 0.1, num_rows),

    # Usage
    # Saturday (5) and Sunday (6) count as weekend
    "is_weekend": np.asarray(row_timestamps.dayofweek >= 5),
    # NOTE(review): shift is sampled independently of the timestamp, so it does
    # not follow the time of day; confirm that is intended.
    "shift": rng.choice(np.array(["Morning", "Afternoon", "Night"], dtype=object), size=num_rows),
    "idle_active_ratio": rng.uniform(0, 0.3, num_rows),
})[sensor_columns]  # selecting by the schema list fails loudly on any column mismatch

In [4]:
def generate_balanced_failures(sensor_df, machines, failure_window_minutes=30, failure_ratio=0.1):
    """Sample random failure events so that roughly ``failure_ratio`` of rows are covered.

    The number of events is ``int(failure_ratio * len(sensor_df) / failure_window_minutes)``,
    i.e. each event is assumed to account for ``failure_window_minutes`` samples
    (one sample per minute per machine). Every event picks a random machine and a
    random window start among that machine's timestamps, chosen so the whole
    window fits inside the machine's data. Events are unique per
    ``(machine_id, failure_time)``, but windows of different events may overlap,
    so the covered share of rows can end up below ``failure_ratio``.

    Parameters
    ----------
    sensor_df : pandas.DataFrame
        Must contain ``machine_id`` and ``timestamp`` columns. The frame is not
        modified; non-datetime timestamps are parsed into a local copy.
    machines : list of dict
        Machine records; only the ``"machine_id"`` key is read.
    failure_window_minutes : int, default 30
        Number of consecutive samples attributed to one failure event.
    failure_ratio : float, default 0.1
        Target fraction of sensor rows that should belong to failure windows.

    Returns
    -------
    pandas.DataFrame
        One row per failure event with columns ``machine_id``, ``failure_time``,
        ``failure_type``, ``maintenance_type``, ``failure_cause``,
        ``time_since_last_maintenance`` and ``num_failures_last_30d``. The
        columns are present even when no event could be generated.

    Raises
    ------
    ValueError
        If ``failure_window_minutes`` is not positive.
    """
    if failure_window_minutes <= 0:
        raise ValueError("failure_window_minutes must be a positive number of samples")

    failure_columns = [
        "machine_id", "failure_time", "failure_type", "maintenance_type",
        "failure_cause", "time_since_last_maintenance", "num_failures_last_30d",
    ]

    # Parse into a local Series so the caller's frame is never mutated
    timestamps = sensor_df["timestamp"]
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        timestamps = pd.to_datetime(timestamps)

    total_rows = len(sensor_df)
    samples_per_failure = failure_window_minutes  # assuming 1 sample per minute per machine
    target_failure_count = int((failure_ratio * total_rows) / samples_per_failure)

    print(f"🔧 Generating {target_failure_count} failure events to achieve {failure_ratio*100:.1f}% failure data.")

    # Slice each machine's timestamps once, up front, instead of re-filtering the
    # whole frame on every attempt. Machines without enough samples to hold a
    # full window are left out of the draw.
    machine_column = sensor_df["machine_id"]
    candidates = []
    for machine in machines:
        machine_id = machine["machine_id"]
        # Positional (ndarray) mask so a non-unique frame index cannot misalign
        mask = (machine_column == machine_id).to_numpy()
        machine_times = timestamps[mask].reset_index(drop=True)
        if len(machine_times) >= failure_window_minutes:
            candidates.append((machine_id, machine_times))

    failure_logs = set()
    failure_df_rows = []
    attempts = 0
    attempt_limit = target_failure_count * 10  # avoid infinite loop

    while candidates and len(failure_logs) < target_failure_count and attempts < attempt_limit:
        attempts += 1
        machine_id, machine_times = random.choice(candidates)

        # A window starting at s covers s .. s + window - 1, so the last valid
        # start is len - window; randint includes both endpoints.
        last_start_idx = len(machine_times) - failure_window_minutes
        failure_time = machine_times.iloc[random.randint(0, last_start_idx)]
        key = (machine_id, failure_time)

        if key in failure_logs:
            continue  # same machine and start already used; counts as an attempt

        failure_logs.add(key)
        failure_df_rows.append({
            "machine_id": machine_id,
            "failure_time": failure_time,
            "failure_type": random.choice(["Mechanical", "Electrical", "Software"]),
            "maintenance_type": random.choice(["Corrective", "Preventive", "Predictive"]),
            "failure_cause": random.choice(["Overheating", "Wear", "Voltage Spike", "Bearing Failure"]),
            "time_since_last_maintenance": random.randint(1, 90),
            "num_failures_last_30d": random.randint(0, 5)
        })

    # Explicit columns keep the schema stable even when no events were generated
    return pd.DataFrame(failure_df_rows, columns=failure_columns)

In [5]:
# Draw failure events for the simulated fleet: 30-sample windows, targeting ~10% of the sensor rows
failure_df = generate_balanced_failures(sensor_df, machines, failure_window_minutes=30, failure_ratio=0.1)

🔧 Generating 168 failure events to achieve 10.0% failure data.


In [6]:
# Show the asset metadata table (one row per simulated machine)
asset_metadata

Unnamed: 0,machine_id,component_type,manufacturer,model_number,install_date,rated_temp_range,rated_load_range,maintenance_policy
0,MCH_1,Compressor,Bosch,MDL_9308,2020-03-06,0-100C,0-2000RPM,Quarterly
1,MCH_2,Motor,Siemens,MDL_9199,2020-07-17,0-100C,0-2000RPM,Monthly
2,MCH_3,Motor,ABB,MDL_6207,2020-01-19,0-100C,0-2000RPM,Monthly
3,MCH_4,Compressor,Siemens,MDL_9368,2020-08-26,0-100C,0-2000RPM,Quarterly
4,MCH_5,Motor,Siemens,MDL_5530,2020-09-09,0-100C,0-2000RPM,Quarterly


In [7]:
# (rows, columns) of the asset metadata table
asset_metadata.shape

(5, 8)

In [8]:
# Preview the first sensor readings (one row per machine per timestamp)
sensor_df.head()

Unnamed: 0,machine_id,timestamp,rpm,torque,vibration,motor_temp,acoustic,voltage,current,oil_temp,pressure,energy_consumption,ambient_temp,humidity,air_quality,corrosive_gas,is_weekend,shift,idle_active_ratio
0,MCH_1,2023-01-01,1403.433806,319.832361,0.023706,47.952193,39.533965,397.156302,9.24245,68.892352,106.615953,5.610174,27.057053,53.339529,9.360822,0.999307,True,Afternoon,0.169275
1,MCH_2,2023-01-01,1520.305673,290.586552,0.008563,64.784802,47.793493,393.666363,10.011808,70.050812,113.380832,5.129479,27.624459,63.390786,7.42748,0.971597,True,Morning,0.001526
2,MCH_3,2023-01-01,1335.46004,305.705821,0.017575,65.934942,37.69422,396.438669,8.756053,74.941207,102.263863,5.237927,27.656433,58.833337,11.526021,1.070471,True,Afternoon,0.280549
3,MCH_4,2023-01-01,1501.939496,264.988237,0.026516,59.766062,35.982248,436.365242,8.539258,68.850339,77.962458,5.703989,26.685059,35.716278,12.049406,0.964965,True,Afternoon,0.155828
4,MCH_5,2023-01-01,1673.793491,306.533801,0.019776,59.599731,35.026363,431.72704,10.447039,62.999437,99.583449,5.591501,24.961187,56.303254,11.337381,0.898246,True,Night,0.292969


In [9]:
# (rows, columns) of the sensor table: timestamps x machines rows
sensor_df.shape

(50400, 19)

In [10]:
# Show the generated failure log (one row per failure event)
failure_df

Unnamed: 0,machine_id,failure_time,failure_type,maintenance_type,failure_cause,time_since_last_maintenance,num_failures_last_30d
0,MCH_4,2023-01-07 00:37:00,Mechanical,Preventive,Voltage Spike,55,2
1,MCH_4,2023-01-01 00:25:00,Electrical,Preventive,Wear,77,2
2,MCH_2,2023-01-02 03:01:00,Electrical,Corrective,Bearing Failure,54,4
3,MCH_4,2023-01-07 08:31:00,Mechanical,Preventive,Wear,32,4
4,MCH_5,2023-01-06 02:01:00,Mechanical,Corrective,Voltage Spike,55,5
...,...,...,...,...,...,...,...
163,MCH_4,2023-01-06 17:44:00,Software,Predictive,Overheating,87,1
164,MCH_1,2023-01-02 20:16:00,Mechanical,Predictive,Wear,36,0
165,MCH_2,2023-01-07 03:29:00,Software,Corrective,Wear,7,5
166,MCH_5,2023-01-02 14:17:00,Mechanical,Predictive,Bearing Failure,38,2


In [11]:
# (rows, columns) of the failure log; the row count is the number of generated events
failure_df.shape

(168, 7)

In [12]:
# Persist the three synthetic tables as CSV files in the working directory,
# dropping the positional index from each.
for csv_name, frame in {
    "asset_metadata.csv": asset_metadata,
    "sensor_data.csv": sensor_df,
    "failure_logs.csv": failure_df,
}.items():
    frame.to_csv(csv_name, index=False)

In [13]:
# Reload the exported tables. CSV stores datetimes as text, so parse_dates is
# needed to get the datetime columns back instead of plain strings.
s_df = pd.read_csv("sensor_data.csv", parse_dates=["timestamp"])
f_df = pd.read_csv("failure_logs.csv", parse_dates=["failure_time"])
am_df = pd.read_csv("asset_metadata.csv", parse_dates=["install_date"])

In [14]:
# Preview the sensor data as read back from sensor_data.csv
s_df.head()

Unnamed: 0,machine_id,timestamp,rpm,torque,vibration,motor_temp,acoustic,voltage,current,oil_temp,pressure,energy_consumption,ambient_temp,humidity,air_quality,corrosive_gas,is_weekend,shift,idle_active_ratio
0,MCH_1,2023-01-01 00:00:00,1403.433806,319.832361,0.023706,47.952193,39.533965,397.156302,9.24245,68.892352,106.615953,5.610174,27.057053,53.339529,9.360822,0.999307,True,Afternoon,0.169275
1,MCH_2,2023-01-01 00:00:00,1520.305673,290.586552,0.008563,64.784802,47.793493,393.666363,10.011808,70.050812,113.380832,5.129479,27.624459,63.390786,7.42748,0.971597,True,Morning,0.001526
2,MCH_3,2023-01-01 00:00:00,1335.46004,305.705821,0.017575,65.934942,37.69422,396.438669,8.756053,74.941207,102.263863,5.237927,27.656433,58.833337,11.526021,1.070471,True,Afternoon,0.280549
3,MCH_4,2023-01-01 00:00:00,1501.939496,264.988237,0.026516,59.766062,35.982248,436.365242,8.539258,68.850339,77.962458,5.703989,26.685059,35.716278,12.049406,0.964965,True,Afternoon,0.155828
4,MCH_5,2023-01-01 00:00:00,1673.793491,306.533801,0.019776,59.599731,35.026363,431.72704,10.447039,62.999437,99.583449,5.591501,24.961187,56.303254,11.337381,0.898246,True,Night,0.292969


In [15]:
# Preview the failure log as read back from failure_logs.csv
f_df.head()

Unnamed: 0,machine_id,failure_time,failure_type,maintenance_type,failure_cause,time_since_last_maintenance,num_failures_last_30d
0,MCH_4,2023-01-07 00:37:00,Mechanical,Preventive,Voltage Spike,55,2
1,MCH_4,2023-01-01 00:25:00,Electrical,Preventive,Wear,77,2
2,MCH_2,2023-01-02 03:01:00,Electrical,Corrective,Bearing Failure,54,4
3,MCH_4,2023-01-07 08:31:00,Mechanical,Preventive,Wear,32,4
4,MCH_5,2023-01-06 02:01:00,Mechanical,Corrective,Voltage Spike,55,5


In [16]:
# Preview the asset metadata as read back from asset_metadata.csv
am_df.head()

Unnamed: 0,machine_id,component_type,manufacturer,model_number,install_date,rated_temp_range,rated_load_range,maintenance_policy
0,MCH_1,Compressor,Bosch,MDL_9308,2020-03-06,0-100C,0-2000RPM,Quarterly
1,MCH_2,Motor,Siemens,MDL_9199,2020-07-17,0-100C,0-2000RPM,Monthly
2,MCH_3,Motor,ABB,MDL_6207,2020-01-19,0-100C,0-2000RPM,Monthly
3,MCH_4,Compressor,Siemens,MDL_9368,2020-08-26,0-100C,0-2000RPM,Quarterly
4,MCH_5,Motor,Siemens,MDL_5530,2020-09-09,0-100C,0-2000RPM,Quarterly


In [17]:
# Column names of the reloaded sensor table
s_df.columns

Index(['machine_id', 'timestamp', 'rpm', 'torque', 'vibration', 'motor_temp',
       'acoustic', 'voltage', 'current', 'oil_temp', 'pressure',
       'energy_consumption', 'ambient_temp', 'humidity', 'air_quality',
       'corrosive_gas', 'is_weekend', 'shift', 'idle_active_ratio'],
      dtype='object')

In [18]:
# Column names of the reloaded asset metadata table
am_df.columns

Index(['machine_id', 'component_type', 'manufacturer', 'model_number',
       'install_date', 'rated_temp_range', 'rated_load_range',
       'maintenance_policy'],
      dtype='object')

In [19]:
# Column names of the reloaded failure log
f_df.columns

Index(['machine_id', 'failure_time', 'failure_type', 'maintenance_type',
       'failure_cause', 'time_since_last_maintenance',
       'num_failures_last_30d'],
      dtype='object')