### Libraries

In [22]:
import random

import numpy as np
import pandas as pd
import ucimlrepo

### Data Acquisition

In [23]:
# Download dataset id=601 from the UCI ML repository (requires network access).
# NOTE(review): presumably the AI4I 2020 Predictive Maintenance dataset — confirm.
full_dataset = ucimlrepo.fetch_ucirepo(id=601)

# Wrap the feature and target tables as DataFrames.
features = pd.DataFrame(full_dataset.data.features) # type: ignore
targets = pd.DataFrame(full_dataset.data.targets) # type: ignore

# Place features and targets side by side (column-wise) in a single frame.
dataset = pd.concat([features, targets], axis=1)

dataset

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF
0,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [24]:
# Column dtypes and non-null counts, used to check for missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Type                 10000 non-null  object 
 1   Air temperature      10000 non-null  float64
 2   Process temperature  10000 non-null  float64
 3   Rotational speed     10000 non-null  int64  
 4   Torque               10000 non-null  float64
 5   Tool wear            10000 non-null  int64  
 6   Machine failure      10000 non-null  int64  
 7   TWF                  10000 non-null  int64  
 8   HDF                  10000 non-null  int64  
 9   PWF                  10000 non-null  int64  
 10  OSF                  10000 non-null  int64  
 11  RNF                  10000 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 937.6+ KB


In [25]:
# Summary statistics for every column; include='all' also covers the non-numeric 'Type' column.
dataset.describe(include='all')

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
unique,3,,,,,,,,,,,
top,L,,,,,,,,,,,
freq,6000,,,,,,,,,,,
mean,,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dataset Cleaning

#### Dataset Update 1: Derive Failure Type and Drop the One-Hot Failure Columns

In [26]:
# Collapse the five one-hot failure-mode flags into a single categorical label.
# The flag columns are named after the labels themselves, so the conditions are
# built from `choices` instead of repeating the column names.
# np.select uses the FIRST matching condition, so a row with several flags set
# is labelled by the earliest mode in this list (TWF, then HDF, PWF, OSF, RNF).
choices = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

conditions = [dataset[mode] == 1 for mode in choices]

# Rows with none of the five flags set get 'No Failure'; the 'Machine failure'
# column is not consulted here.
dataset['Failure Type'] = np.select(conditions, choices, default='No Failure')

dataset

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,No Failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0,No Failure
9996,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0,No Failure
9997,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0,No Failure
9998,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0,No Failure


In [27]:
# The individual failure-mode flags are now redundant with 'Failure Type', so remove them.
dataset = dataset.drop(
    ['TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
    axis="columns",
)

dataset

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,No Failure
...,...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0,No Failure
9996,H,298.9,308.4,1632,31.8,17,0,No Failure
9997,M,299.0,308.6,1645,33.4,22,0,No Failure
9998,H,299.0,308.7,1408,48.5,25,0,No Failure


#### Dataset Update 2: Drop Random Failures

In [28]:
# Keep every row whose label is not 'RNF', then renumber the index from 0.
dataset = (
    dataset
    .loc[dataset['Failure Type'].ne('RNF')]
    .reset_index(drop=True)
)

# Remaining class balance after the filter.
dataset['Failure Type'].value_counts()

Failure Type
No Failure    9652
HDF            115
PWF             91
OSF             78
TWF             46
Name: count, dtype: int64

#### Type-based Dataset Split

In [29]:
# One independent copy of the rows for each value of 'Type' (L, M, H).
l_dataset, m_dataset, h_dataset = (
    dataset[dataset['Type'] == type_code].copy()
    for type_code in ("L", "M", "H")
)

print(f"Size: {len(l_dataset)}, {len(m_dataset)}, {len(h_dataset)}")

Size: 5988, 2995, 999


#### Dataset: Data Addition

In [30]:
def resampling_function(dataset: pd.DataFrame, number_of_sampling: int, weights=None, random_state=42):
    """
    Append randomly drawn "No Failure" rows to a dataset.

    Args:
        dataset: DataFrame containing a 'Failure Type' column.
        number_of_sampling: How many "No Failure" rows to draw and append.
        weights: Optional sampling weights, forwarded to ``DataFrame.sample``.
        random_state: Seed forwarded to ``DataFrame.sample`` for reproducible draws.

    Returns:
        A new DataFrame holding the original rows followed by the drawn rows,
        with the index renumbered from 0. The new length is also printed.
    """
    # Only rows without a failure are eligible to be duplicated.
    healthy_rows = dataset.loc[dataset['Failure Type'] == "No Failure"]
    extra_rows = healthy_rows.sample(
        n=number_of_sampling,
        weights=weights,
        random_state=random_state,
    )

    padded = pd.concat([dataset, extra_rows], ignore_index=True)
    print(len(padded))
    return padded

#### Resampling: All Dataset

In [31]:
# Top each type up to a round row count (6000 / 3000 / 1000) with re-sampled
# "No Failure" rows. The shortfall is derived from the current length (12, 5 and
# 1 rows on the first run) rather than hard-coded, so re-running this cell adds
# nothing once the target has been reached.
l_dataset = resampling_function(l_dataset, max(6000 - len(l_dataset), 0))
m_dataset = resampling_function(m_dataset, max(3000 - len(m_dataset), 0))
h_dataset = resampling_function(h_dataset, max(1000 - len(h_dataset), 0))

6000
3000
1000


#### Dataset Update 3: Add Machine ID 

In [32]:
def add_machine_id(machine_dataset: pd.DataFrame, total_machine: int, machine_type: str) -> pd.DataFrame:
    """
    Assign a synthetic 'Product ID' to every row by cutting the frame into
    consecutive, equally sized chunks, one chunk per machine.

    IDs have the form ``<TYPE>_<NNN>`` (e.g. ``L_001``). Each machine receives
    ``len(machine_dataset) // total_machine`` consecutive rows. When the length
    is not an exact multiple of ``total_machine``, all leftover rows are assigned
    to the last machine. When there are fewer rows than machines, every row is
    assigned to machine 001.

    Args:
        machine_dataset: DataFrame for one product type (e.g., only 'L' rows);
            must contain a 'Tool wear' column, which is used for sorting.
        total_machine: Number of unique machines to create for this type (>= 1).
        machine_type: Prefix for the IDs, e.g. 'L', 'M', or 'H' (upper-cased).

    Returns:
        A new DataFrame (the input is not modified) with a 'Product ID' column,
        sorted by 'Product ID' and then 'Tool wear', with the index renumbered.

    Raises:
        ValueError: If ``total_machine`` is smaller than 1.
    """
    if total_machine < 1:
        raise ValueError(f"total_machine must be at least 1, got {total_machine}")

    # Reset index to ensure clean row numbering (this also yields a new frame)
    machine_dataset = machine_dataset.reset_index(drop=True)

    row_count = len(machine_dataset)
    rows_per_machine = row_count // total_machine
    prefix = machine_type.upper()

    if rows_per_machine > 0:
        # Chunk number of each row; rows past the last full chunk would spill
        # into machine total_machine + 1, so cap them at the last machine.
        machine_numbers = (pd.Series(range(row_count)) // rows_per_machine) + 1
        machine_numbers = machine_numbers.clip(upper=total_machine)
    else:
        # Fewer rows than machines: dividing by rows_per_machine (0) would
        # produce inf/NaN machine numbers, so fall back to a single machine.
        machine_numbers = [1] * row_count

    machine_dataset['Product ID'] = [
        f"{prefix}_{str(num).zfill(3)}" for num in machine_numbers
    ]

    # Sort by Product ID first, then by Tool wear (assumes higher tool wear = later in time)
    machine_dataset = machine_dataset.sort_values(
        by=["Product ID", "Tool wear"],
        ignore_index=True
    )

    return machine_dataset

In [33]:
# Assign machine IDs per type: 12 machines for L, 6 for M and 2 for H.
l_machine, m_machine, h_machine = (
    add_machine_id(type_frame, total_machine=machine_count, machine_type=type_code)
    for type_frame, machine_count, type_code in (
        (l_dataset, 12, "L"),
        (m_dataset, 6, "M"),
        (h_dataset, 2, "H"),
    )
)

In [34]:
# List the distinct machine IDs created for type L.
l_machine['Product ID'].unique()

array(['L_001', 'L_002', 'L_003', 'L_004', 'L_005', 'L_006', 'L_007',
       'L_008', 'L_009', 'L_010', 'L_011', 'L_012'], dtype=object)

In [35]:
# List the distinct machine IDs created for type M.
m_machine['Product ID'].unique()

array(['M_001', 'M_002', 'M_003', 'M_004', 'M_005', 'M_006'], dtype=object)

In [36]:
# List the distinct machine IDs created for type H.
h_machine['Product ID'].unique()

array(['H_001', 'H_002'], dtype=object)

#### Dataset: Combine

In [37]:
# Stack the three per-type frames into one dataset.
update_dataset = pd.concat([l_machine, m_machine, h_machine], ignore_index=True)

# Group rows by machine. Each per-type frame is already ordered by Tool wear
# within a machine; a single-column sort defaults to quicksort, which is not
# stable and may scramble that order, so request a stable sort explicitly.
update_dataset = update_dataset.sort_values(
    by=["Product ID"],
    kind="stable",
    ignore_index=True,
)

update_dataset.to_csv("[Update] Updated Dataset.csv", index=False)

In [39]:
# Summary statistics for the type L frame after resampling and machine-ID assignment.
l_machine.describe(include='all')

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type,Product ID
count,6000,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000,6000
unique,1,,,,,,,5,12
top,L,,,,,,,No Failure,L_001
freq,6000,,,,,,,5769,500
mean,,300.013317,310.008883,1539.641667,39.98635,108.311,0.039167,,
std,,1.98734,1.4746,180.462817,10.005518,64.03724,0.194008,,
min,,295.3,305.7,1181.0,3.8,0.0,0.0,,
25%,,298.4,308.8,1424.0,33.2,53.0,0.0,,
50%,,300.1,310.1,1504.0,40.0,109.0,0.0,,
75%,,301.5,311.0,1612.0,46.8,163.0,0.0,,


In [40]:
# Summary statistics for the type M frame after resampling and machine-ID assignment.
m_machine.describe(include='all')

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type,Product ID
count,3000,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000,3000
unique,1,,,,,,,5,6
top,M,,,,,,,No Failure,M_001
freq,3000,,,,,,,2921,500
mean,,300.031567,310.021367,1537.640667,40.0115,107.318667,0.027667,,
std,,2.016278,1.497908,178.949751,9.986487,63.064328,0.164043,,
min,,295.3,305.7,1168.0,9.7,0.0,0.0,,
25%,,298.3,308.9,1422.0,33.275,53.0,0.0,,
50%,,300.1,310.05,1504.0,40.2,106.0,0.0,,
75%,,301.6,311.1,1611.25,46.8,162.0,0.0,,


In [41]:
# Summary statistics for the type H frame after resampling and machine-ID assignment.
h_machine.describe(include='all')

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type,Product ID
count,1000,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000,1000
unique,1,,,,,,,5,2
top,H,,,,,,,No Failure,H_001
freq,1000,,,,,,,980,500
mean,,299.8617,309.9208,1538.344,39.8237,107.539,0.021,,
std,,2.021769,1.487656,173.248205,9.639571,63.015502,0.143456,,
min,,295.5,305.9,1212.0,12.8,0.0,0.0,,
25%,,298.2,308.8,1425.0,33.0,53.0,0.0,,
50%,,299.8,309.9,1500.0,40.3,107.0,0.0,,
75%,,301.3,311.0,1612.25,46.6,160.0,0.0,,
