### Imports

In [1]:
import numpy as np
import pandas as pd

### Number of synthetic samples to generate and random seed

In [2]:
# Total number of synthetic rows to generate across all risk categories.
N = 10_000
# Seed NumPy's legacy global generator so that subsequent np.random.* draws
# are repeatable from a fresh kernel.
np.random.seed(seed=42)

### Proportions for each overall risk category:

In [3]:
# Row counts per category: 40% Low, 35% Medium, and High takes whatever is left.
# round() is used instead of int() because 0.35 has no exact binary
# representation, so a product that should be a whole number can land just
# below it (e.g. 0.35 * 180 evaluates slightly under 63) and int() would
# truncate it one short.
n_low = round(0.40 * N)
n_med = round(0.35 * N)
# Computing High as the remainder guarantees the three counts sum to exactly N.
n_high = N - n_low - n_med

### Generate sensor readings for each risk category

#### Low Risk (Healthy)

In [4]:
# Low-risk samples: ball-bearing readings drawn from the highest band,
# vibration from the lowest band, humidity from the narrow "normal" band.
# The draw order (ball, vibration, humidity) determines the seeded values.
ball_low = np.random.uniform(low=64.00, high=93.74, size=n_low)
vib_low = np.random.uniform(low=2.00, high=21.27, size=n_low)
hum_low = np.random.uniform(low=73.91, high=74.72, size=n_low)
# One 'Low' label per generated row.
risk_low = ['Low' for _ in range(n_low)]

#### Medium Risk (Early degradation)

In [5]:
# Medium-risk samples: ball-bearing readings in [43.40, 64.00) and
# vibration readings in [21.27, 39.09).
ball_med = np.random.uniform(low=43.40, high=64.00, size=n_med)
vib_med = np.random.uniform(low=21.27, high=39.09, size=n_med)
# Humidity sits just outside the normal band. The side is NOT picked at
# random per row: the first n_med // 2 rows take the band below normal and
# the remaining rows take the band above it.
n_med_below = n_med // 2
n_med_above = n_med - n_med_below
hum_med_lower = np.random.uniform(low=73.5, high=73.91, size=n_med_below)
hum_med_upper = np.random.uniform(low=74.72, high=75.0, size=n_med_above)
hum_med = np.concatenate((hum_med_lower, hum_med_upper))
# One 'Medium' label per generated row.
risk_med = ['Medium' for _ in range(n_med)]

#### High Risk (Severe degradation)

In [6]:
# High-risk samples: ball-bearing readings in [16.93, 43.40) and
# vibration readings in [39.09, 100.00).
ball_high = np.random.uniform(low=16.93, high=43.40, size=n_high)
vib_high = np.random.uniform(low=39.09, high=100.00, size=n_high)
# Humidity comes from the two outermost bands. The side is NOT picked at
# random per row: the first n_high // 2 rows take the low extreme and the
# remaining rows take the high extreme.
n_high_below = n_high // 2
n_high_above = n_high - n_high_below
hum_high_lower = np.random.uniform(low=72.40, high=73.5, size=n_high_below)
hum_high_upper = np.random.uniform(low=75.0, high=75.40, size=n_high_above)
hum_high = np.concatenate((hum_high_lower, hum_high_upper))
# One 'High' label per generated row.
risk_high = ['High' for _ in range(n_high)]

### Combine the data into a single DataFrame

In [7]:
# Per-column pieces, always listed in Low -> Medium -> High order so that
# row i of every column refers to the same sample.
column_chunks = {
    'ball_bearing': (ball_low, ball_med, ball_high),
    'vibration': (vib_low, vib_med, vib_high),
    'humidity': (hum_low, hum_med, hum_high),
    'risk_level': (risk_low, risk_med, risk_high),
}
# Join each column's three pieces end to end and assemble the frame.
synthetic_df = pd.DataFrame(
    {column: np.concatenate(chunks) for column, chunks in column_chunks.items()}
)

### Shuffle the dataset

In [8]:
# Draw every row (frac=1) in a seeded random order, then replace the index
# with a fresh 0..len-1 range and discard the old one.
# NOTE: this rebinds synthetic_df to its own shuffled copy, so re-running this
# cell alone permutes the already-shuffled frame again; run the notebook top
# to bottom for a reproducible row order.
shuffled_rows = synthetic_df.sample(frac=1, random_state=42)
synthetic_df = shuffled_rows.reset_index(drop=True)

### Display sample rows and risk level distribution

In [9]:
# First 15 rows of the frame, followed by the number of rows per risk level.
preview_rows = synthetic_df.head(15)
risk_counts = synthetic_df['risk_level'].value_counts()
# A single print with a newline separator emits the same text as three
# consecutive print calls.
print(preview_rows, "\nRisk Level Distribution:", risk_counts, sep="\n")

    ball_bearing  vibration   humidity risk_level
0      53.287651  31.511211  74.755610     Medium
1      54.138179  23.779652  73.539365     Medium
2      82.554662   8.822824  74.516011        Low
3      53.451727  26.665425  73.719980     Medium
4      60.749528  33.679418  73.780989     Medium
5      50.086320  36.609365  74.773490     Medium
6      68.373965   9.913854  74.458865        Low
7      54.843685  23.920574  73.699882     Medium
8      61.045640  23.979704  74.801270     Medium
9      66.419313   6.637721  74.245280        Low
10     75.544048   7.538247  74.255419        Low
11     44.991798  23.769579  74.919743     Medium
12     46.952508  32.830431  73.743325     Medium
13     54.935261  32.161738  73.585390     Medium
14     68.799065   4.013336  74.166726        Low

Risk Level Distribution:
risk_level
Low       4000
Medium    3500
High      2500
Name: count, dtype: int64


### Save the dataset to CSV

In [10]:
# Write the frame to a CSV file; the relative path resolves against the
# current working directory, and index=False leaves the row index out.
synthetic_df.to_csv(
    path_or_buf='synthetic_combined_dataset_lift_predictive.csv',
    index=False,
)