### Library

In [3]:
import ucimlrepo
import pandas as pd
import random

### Data Acquisition

In [4]:
# Fetch repository entry id=601 through ucimlrepo.
# NOTE(review): presumably this downloads over the network on every run — confirm / consider caching.
full_dataset = ucimlrepo.fetch_ucirepo(id=601)

# Wrap the feature and target tables of the fetched entry as DataFrames.
features = pd.DataFrame(full_dataset.data.features) # type: ignore
targets = pd.DataFrame(full_dataset.data.targets) # type: ignore

# Put features and targets side by side (column-wise) in a single frame.
dataset = pd.concat([features, targets], axis=1)

# Display the combined frame as the cell output.
dataset

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF
0,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [5]:
# Summary statistics for every column; include='all' also reports
# count/unique/top/freq for the non-numeric 'Type' column.
dataset.describe(include='all')

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
unique,3,,,,,,,,,,,
top,L,,,,,,,,,,,
freq,6000,,,,,,,,,,,
mean,,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dataset Cleaning

#### Dataset Update 1: Add Failure Type and Drop One-hot Encoded Failure Columns

In [6]:
import numpy as np

# Failure-mode flag columns, listed in priority order: np.select takes the
# first condition that is True, so a row with several flags set is labelled
# with the earliest flag in this list.
choices = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

# One boolean mask per flag column (True where that flag equals 1).
conditions = [dataset[flag] == 1 for flag in choices]

# Collapse the flag columns into a single categorical label; rows with no
# flag set fall back to 'No Failure'.
dataset['Failure Type'] = np.select(conditions, choices, default='No Failure')

dataset

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,No Failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0,No Failure
9996,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0,No Failure
9997,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0,No Failure
9998,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0,No Failure


In [7]:
# Remove the five per-mode flag columns from the frame.
# NOTE: this rebinds `dataset`, so re-running the cell on its own raises
# KeyError because the columns are already gone.
dataset = dataset.drop(columns=['TWF', 'HDF', 'PWF', 'OSF', 'RNF'])

dataset

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,No Failure
...,...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0,No Failure
9996,H,298.9,308.4,1632,31.8,17,0,No Failure
9997,M,299.0,308.6,1645,33.4,22,0,No Failure
9998,H,299.0,308.7,1408,48.5,25,0,No Failure


#### Dataset Update 2: Drop Random Failures

In [8]:
# Keep every row whose label is not 'RNF' and renumber the index from 0.
dataset = dataset[dataset['Failure Type'] != 'RNF'].reset_index(drop=True)

# Show how many rows remain per failure label.
dataset['Failure Type'].value_counts()

Failure Type
No Failure    9652
HDF            115
PWF             91
OSF             78
TWF             46
Name: count, dtype: int64

#### Type-based Dataset Split

In [9]:
# Split the frame into one subset per value of the 'Type' column.
# The subsets keep the row labels of `dataset` (no index reset here).
l_dataset = dataset[dataset['Type'] == "L"]
m_dataset = dataset[dataset['Type'] == "M"]
h_dataset = dataset[dataset['Type'] == "H"]

# Report the number of rows in the L, M and H subsets, in that order.
print(f"Size: {len(l_dataset)}, {len(m_dataset)}, {len(h_dataset)}")

Size: 5988, 2995, 999


#### Dataset: Data Addition

In [10]:
def resampling_function(dataset: pd.DataFrame, number_of_sampling: int, weights=None, random_state=42):
    """
    Pad a frame with randomly drawn copies of its own 'No Failure' rows.

    Args:
        dataset: Frame containing a 'Failure Type' column.
        number_of_sampling: How many 'No Failure' rows to draw and append.
        weights: Forwarded to ``DataFrame.sample`` (None means uniform).
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        A new frame holding the original rows followed by the drawn rows,
        re-indexed from 0. The new row count is also printed.
    """
    # Only rows without a failure are eligible to be duplicated.
    healthy_rows = dataset.loc[dataset['Failure Type'] == "No Failure"]
    drawn_rows = healthy_rows.sample(
        n=number_of_sampling,
        weights=weights,
        random_state=random_state,
    )

    padded = pd.concat([dataset, drawn_rows], ignore_index=True)

    print(len(padded))

    return padded

#### Resampling: All Dataset

In [11]:
# Pad each per-type subset up to its round target size (6000 / 3000 / 1000).
# The number of rows to add is derived from the current length rather than
# hard-coded, so re-running this cell adds nothing once the target is reached
# (fixed counts would grow the frames again on every re-run).
l_dataset = resampling_function(l_dataset, max(6000 - len(l_dataset), 0))
m_dataset = resampling_function(m_dataset, max(3000 - len(m_dataset), 0))
h_dataset = resampling_function(h_dataset, max(1000 - len(h_dataset), 0))

6000
3000
1000


#### Dataset Update 3: Add Machine ID 

In [12]:
import pandas as pd

def add_machine_id(machine_dataset: pd.DataFrame, total_machine: int, machine_type: str) -> pd.DataFrame:
    """
    Assign a synthetic 'Product ID' to every row by cutting the frame into
    consecutive chunks, one chunk per machine.

    Rows are taken in their current order. Each machine receives
    ``len(machine_dataset) // total_machine`` consecutive rows; if the row
    count is not an exact multiple, all leftover rows are attached to the
    last machine. If there are fewer rows than machines, each row gets its
    own machine and the remaining machine numbers stay unused.

    Args:
        machine_dataset: DataFrame for one product type (e.g., only 'L' rows).
            Must contain a 'Tool wear' column (used for sorting).
        total_machine: Number of machines to spread the rows over (>= 1).
        machine_type: Prefix for the ID, e.g. 'L', 'M', or 'H' (upper-cased).

    Returns:
        A new DataFrame (the input is not modified) with a 'Product ID'
        column such as 'L_001', sorted by 'Product ID' and then 'Tool wear'.

    Raises:
        ValueError: If ``total_machine`` is smaller than 1.
    """
    if total_machine < 1:
        raise ValueError("total_machine must be at least 1")

    # reset_index returns a new frame with clean 0..n-1 row numbering, so the
    # column added below never touches the caller's frame.
    machine_dataset = machine_dataset.reset_index(drop=True)

    # Chunk size per machine. Clamp to 1 so that having fewer rows than
    # machines cannot produce a zero divisor (which used to yield NaN/inf IDs).
    rows_per_machine = max(len(machine_dataset) // total_machine, 1)

    # 1-based machine number for each row position; remainder rows would
    # overflow past total_machine, so cap them onto the last machine.
    machine_numbers = (
        pd.Series(range(len(machine_dataset))) // rows_per_machine + 1
    ).clip(upper=total_machine)

    prefix = machine_type.upper()
    machine_dataset['Product ID'] = [
        f"{prefix}_{str(num).zfill(3)}" for num in machine_numbers
    ]

    # Sort by Product ID first, then by Tool wear (assumes higher tool wear = later in time)
    machine_dataset = machine_dataset.sort_values(
        by=["Product ID", "Tool wear"],
        ignore_index=True
    )

    return machine_dataset

In [13]:
# Spread each per-type subset over a fixed number of machines:
# 12 for type L, 6 for type M and 2 for type H.
l_machine = add_machine_id(l_dataset, total_machine=12, machine_type="L")
m_machine = add_machine_id(m_dataset, total_machine=6, machine_type="M")
h_machine = add_machine_id(h_dataset, total_machine=2, machine_type="H")

In [14]:
# List the distinct machine IDs assigned to the L-type rows.
l_machine['Product ID'].unique()

array(['L_001', 'L_002', 'L_003', 'L_004', 'L_005', 'L_006', 'L_007',
       'L_008', 'L_009', 'L_010', 'L_011', 'L_012'], dtype=object)

In [15]:
# List the distinct machine IDs assigned to the M-type rows.
m_machine['Product ID'].unique()

array(['M_001', 'M_002', 'M_003', 'M_004', 'M_005', 'M_006'], dtype=object)

In [16]:
# List the distinct machine IDs assigned to the H-type rows.
h_machine['Product ID'].unique()

array(['H_001', 'H_002'], dtype=object)

#### Dataset Update 4: Maintenance Gap

In [17]:
# Preview the first rows of the L-type frame before maintenance rows are added.
l_machine.head()

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type,Product ID
0,L,298.8,308.9,1398,51.5,0,0,No Failure,L_001
1,L,298.3,308.1,1586,35.5,0,0,No Failure,L_001
2,L,298.0,308.3,1662,32.7,0,0,No Failure,L_001
3,L,297.4,308.4,2151,17.7,0,0,No Failure,L_001
4,L,297.5,309.6,1312,50.7,0,0,No Failure,L_001


In [None]:
def add_maintenance_gap(machine_dataset: pd.DataFrame, time_maintenance: int = 20) -> pd.DataFrame:
    """
    Insert a block of placeholder 'Maintenance' rows after every failure row.

    Machines are processed in order of first appearance of their
    'Product ID'; rows inside a machine keep their existing order. After each
    row whose 'Failure Type' is not "No Failure", ``time_maintenance``
    maintenance rows are inserted for the same machine.

    A maintenance row carries the machine's 'Product ID', the 'Type' of the
    machine's first row, 'Failure Type' = 'Maintenance', and 0 in every other
    column. The zero columns are derived from the frame's own columns, so the
    row always matches the frame's schema (hard-coded names that do not match
    the columns would silently become NaN).

    Args:
        machine_dataset: Frame with at least 'Product ID', 'Type' and
            'Failure Type' columns.
        time_maintenance: Number of maintenance rows inserted per failure.

    Returns:
        A new DataFrame with the same columns, indexed 0..n-1. The result is
        built in one step, so column dtypes are inferred from the data.

    Raises:
        ValueError: If ``machine_dataset`` has no rows.
    """
    # Validate before doing any work.
    if machine_dataset.empty:
        raise ValueError("Dataset is Empty")

    columns = list(machine_dataset.columns)
    records = []

    # sort=False keeps machines in order of first appearance.
    for machine_id, machine_data in machine_dataset.groupby('Product ID', sort=False):
        # Template row: zero everywhere except the identifying columns.
        maintenance_row = {column: 0 for column in columns}
        maintenance_row['Product ID'] = machine_id
        maintenance_row['Type'] = machine_data['Type'].iloc[0]
        maintenance_row['Failure Type'] = 'Maintenance'

        for record in machine_data.to_dict('records'):
            records.append(record)

            if record['Failure Type'] != "No Failure":
                # Each inserted row is its own dict so rows never alias.
                records.extend(dict(maintenance_row) for _ in range(time_maintenance))

    # Build the frame once instead of appending row by row (which is
    # quadratic and coerces every column to object dtype).
    return pd.DataFrame(records, columns=columns)

In [23]:
# Insert maintenance rows after each failure row of the L-type machines.
l_machine_full_updated = add_maintenance_gap(l_machine)

# Preview the first rows of the result.
l_machine_full_updated.head()

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type,Product ID
0,L,298.8,308.9,1398,51.5,0,0,No Failure,L_001
1,L,298.3,308.1,1586,35.5,0,0,No Failure,L_001
2,L,298.0,308.3,1662,32.7,0,0,No Failure,L_001
3,L,297.4,308.4,2151,17.7,0,0,No Failure,L_001
4,L,297.5,309.6,1312,50.7,0,0,No Failure,L_001


In [24]:
# Insert maintenance rows after each failure row of the M-type machines.
m_machine_full_updated = add_maintenance_gap(m_machine)

# Preview the first rows of the result.
m_machine_full_updated.head()

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type,Product ID
0,M,298.1,308.6,1551,42.8,0,0,No Failure,M_001
1,M,297.6,308.3,1538,40.2,0,0,No Failure,M_001
2,M,297.6,309.2,1442,48.1,0,0,No Failure,M_001
3,M,296.7,307.8,1286,60.5,0,0,No Failure,M_001
4,M,296.2,307.2,1633,31.1,0,0,No Failure,M_001


In [25]:
# Insert maintenance rows after each failure row of the H-type machines.
h_machine_full_updated = add_maintenance_gap(h_machine)

# Preview the first rows of the result.
h_machine_full_updated.head()

Unnamed: 0,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,Failure Type,Product ID
0,H,296.4,307.4,1586,35.5,0,0,No Failure,H_001
1,H,295.6,306.0,1396,52.4,0,0,No Failure,H_001
2,H,298.7,310.2,1613,36.0,0,0,No Failure,H_001
3,H,298.2,307.6,1318,55.7,0,0,No Failure,H_001
4,H,299.4,309.3,1743,28.2,0,0,No Failure,H_001


#### Dataset Update 5: Time Feature