### Cleaning the Equipment Dataset

In [4]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

# Sensor Dimension Table with correct SensorType-Unit matching
sensor_types = ['Temperature', 'Vibration', 'Pressure', 'Voltage']
units = {'Temperature': 'Celsius', 'Vibration': 'mm/s', 'Pressure': 'Pa', 'Voltage': 'Volt'}

# Generate SensorTypes first
sensor_type_list = np.random.choice(sensor_types, 10)

sensor_dim_df = pd.DataFrame({
    "SensorID": [f"SNS_{i:03}" for i in range(1, 11)],
    "SensorType": sensor_type_list,
    "Unit": [units[stype] for stype in sensor_type_list],
    "CalibrationDate": pd.to_datetime(np.random.choice(pd.date_range('2022-01-01', '2023-12-31'), 10))
})

# Location Dimension Table

lines = ['Line 1', 'Line 2', 'Line 3']
location_dim_df = pd.DataFrame({
    "LocationID": [f"LOC_{i:02}" for i in range(1, 7)],
    "LocationName": np.random.choice(['Warehouse', 'Assembly', 'Testing', 'Maintenance'], 6),
    "ProductionLine": np.random.choice(lines, 6)
})

# Equipment Dimension Table
manufacturers = ['GE', 'Siemens', 'Bosch', 'Honeywell']
equipment_types = ['Motor', 'Pump', 'Compressor', 'Conveyor']
equipment_dim_df = pd.DataFrame({
    "EquipmentID": [f"EQT_{i:03}" for i in range(1, 21)],
    "EquipmentType": np.random.choice(equipment_types, 20),
    "Manufacturer": np.random.choice(manufacturers, 20),
    "InstallationDate": pd.to_datetime(np.random.choice(pd.date_range('2018-01-01', '2022-12-31'), 20)),
    "IsCritical": np.random.choice([True, False], 20)
})

# Time Dimension Table
start_date = datetime(2024, 1, 1)
time_dim_df = pd.DataFrame({
    "TimeID": [i for i in range(1, 366)],
    "Timestamp": [start_date + timedelta(days=i) for i in range(365)],
})
time_dim_df["Hour"] = np.random.choice(range(24), 365)
time_dim_df["Day"] = time_dim_df["Timestamp"].dt.day
time_dim_df["Month"] = time_dim_df["Timestamp"].dt.month
time_dim_df["Year"] = time_dim_df["Timestamp"].dt.year
time_dim_df["Weekday"] = time_dim_df["Timestamp"].dt.day_name()

# Fact Table (Sensor Data)
fact_rows = 1000
fact_df = pd.DataFrame({
    "SensorDataID": [f"SD_{i:04}" for i in range(1, fact_rows + 1)],
    "Timestamp": pd.to_datetime(np.random.choice(time_dim_df["Timestamp"], fact_rows)),
    "EquipmentID": np.random.choice(equipment_dim_df["EquipmentID"], fact_rows),
    "TimeID": np.random.choice(time_dim_df["TimeID"], fact_rows),
    "SensorID": np.random.choice(sensor_dim_df["SensorID"], fact_rows),
    "LocationID": np.random.choice(location_dim_df["LocationID"], fact_rows),
    "Temperature": np.random.normal(loc=75, scale=10, size=fact_rows),
    "Vibration": np.random.normal(loc=5, scale=2, size=fact_rows),
    "Pressure": np.random.normal(loc=2, scale=0.5, size=fact_rows),
    "Voltage": np.random.normal(loc=220, scale=15, size=fact_rows),
})

# Add Calculated Fields
fact_df["HealthScore"] = 100 - (
    (fact_df["Temperature"] - 75).abs() * 0.5 +
    (fact_df["Vibration"] - 5).abs() * 2 +
    (fact_df["Pressure"] - 2).abs() * 10 +
    (fact_df["Voltage"] - 220).abs() * 0.1
)
fact_df["HealthScore"] = fact_df["HealthScore"].clip(lower=0, upper=100)
fact_df["PredictedFailure"] = fact_df["HealthScore"] < 60
fact_df["EstimatedDowntimeMins"] = fact_df["PredictedFailure"].apply(lambda x: np.random.randint(30, 180) if x else 0)
fact_df["EstimatedMaintenanceCost"] = fact_df["PredictedFailure"].apply(lambda x: np.random.randint(100, 1000) if x else 0)

# Optional: Save to CSV
sensor_dim_df.to_csv(r"Data\Raw\sensor_dimension.csv", index=False)
location_dim_df.to_csv(r"Data\Raw\location_dimension.csv", index=False)
equipment_dim_df.to_csv(r"Data\Raw\equipment_dimension.csv", index=False)
time_dim_df.to_csv(r"Data\Raw\time_dimension.csv", index=False)
fact_df.to_csv(r"Data\Raw\fact_table.csv", index=False)



In [33]:
sensor_dim_df.head()

Unnamed: 0,SensorID,SensorType,Unit,CalibrationDate
0,SNS_001,Pressure,Pa,2023-04-12
1,SNS_002,Voltage,Volt,2022-08-03
2,SNS_003,Temperature,Celsius,2022-11-27
3,SNS_004,Pressure,Pa,2023-04-04
4,SNS_005,Pressure,Pa,2022-03-29


In [34]:
location_dim_df.head()

Unnamed: 0,LocationID,LocationName,ProductionLine
0,LOC_01,Warehouse,Line 1
1,LOC_02,Assembly,Line 2
2,LOC_03,Testing,Line 2
3,LOC_04,Maintenance,Line 2
4,LOC_05,Packaging,Line 2


In [35]:
equipment_dim_df.head()

Unnamed: 0,EquipmentID,EquipmentType,Manufacturer,InstallationDate,IsCritical
0,EQT_001,Motor,Siemens,2020-02-16,False
1,EQT_002,Conveyor,Bosch,2021-10-01,False
2,EQT_003,Pump,Honeywell,2019-07-19,False
3,EQT_004,Pump,Bosch,2020-06-16,False
4,EQT_005,Motor,Honeywell,2021-09-25,True


In [36]:
time_dim_df.head()

Unnamed: 0,TimeID,Timestamp,Hour,Day,Month,Year,Weekday
0,1,2024-01-01,7,1,1,2024,Monday
1,2,2024-01-02,20,2,1,2024,Tuesday
2,3,2024-01-03,15,3,1,2024,Wednesday
3,4,2024-01-04,12,4,1,2024,Thursday
4,5,2024-01-05,17,5,1,2024,Friday


In [37]:
fact_df

Unnamed: 0,SensorDataID,Timestamp,EquipmentID,TimeID,SensorID,LocationID,Temperature,Vibration,Pressure,Voltage,HealthScore,PredictedFailure,EstimatedDowntimeMins,EstimatedMaintenanceCost
0,SD_0001,2024-05-08,EQT_008,109,SNS_006,LOC_02,94.374549,1.039699,1.426055,247.027209,73.949948,False,0,0
1,SD_0002,2024-02-27,EQT_018,152,SNS_003,LOC_05,73.439602,3.996542,2.380632,200.089409,91.415505,False,0,0
2,SD_0003,2024-05-01,EQT_012,17,SNS_006,LOC_05,60.423923,4.856926,2.429243,177.712353,83.904622,False,0,0
3,SD_0004,2024-01-01,EQT_015,133,SNS_010,LOC_06,77.710378,3.713351,2.417804,200.356708,89.929144,False,0,0
4,SD_0005,2024-08-26,EQT_009,148,SNS_010,LOC_04,68.309954,3.456910,1.911094,236.956233,90.984113,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,SD_0996,2024-09-20,EQT_005,189,SNS_004,LOC_04,90.653727,7.907428,2.571239,217.593264,80.405218,False,0,0
996,SD_0997,2024-05-28,EQT_008,287,SNS_004,LOC_05,75.534175,5.114377,2.021140,234.527745,97.839984,False,0,0
997,SD_0998,2024-03-21,EQT_013,208,SNS_009,LOC_05,61.865109,5.912940,1.173630,214.501836,82.793163,False,0,0
998,SD_0999,2024-08-02,EQT_008,114,SNS_001,LOC_05,73.038119,5.486788,1.769184,221.620366,95.575285,False,0,0


In [38]:
#missing values in the fact table
missing_values = fact_df.isnull().sum()
print("Missing values in fact table:\n", missing_values)

Missing values in fact table:
 SensorDataID                0
Timestamp                   0
EquipmentID                 0
TimeID                      0
SensorID                    0
LocationID                  0
Temperature                 0
Vibration                   0
Pressure                    0
Voltage                     0
HealthScore                 0
PredictedFailure            0
EstimatedDowntimeMins       0
EstimatedMaintenanceCost    0
dtype: int64
