In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve, precision_recall_curve)

# Global plot styling: select matplotlib's 'fivethirtyeight' style sheet
# and make "Set2" the seaborn colour palette.
plt.style.use('fivethirtyeight')
sns.set_palette("Set2")

In [5]:
# Load the raw dataset; the path is relative to the kernel's working directory.
df = pd.read_csv("ai4i2020.csv")

# Quick structural overview of what was loaded.
print("Data Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()

print("\nColumn list:")
print(df.columns.tolist())

print("\nFailure Type Distribution:")

# Human-readable label for each failure-flag column code.
failure_mapping = {
    'TWF': 'Tool Wear Failure',
    'HDF': 'Heat Dissipation Failure',
    'PWF': 'Power Failure',
    'OSF': 'Overstrain Failure',
    'RNF': 'Random Failures',
}

# The flag columns are exactly the mapping's keys, in the same order.
failure_columns = list(failure_mapping)

# Column-wise sum of the flag columns
# (assumes they are 0/1 indicators, so the sum is a count — TODO confirm).
failure_counts = df[failure_columns].sum()

print("Individual failure type counts:")
for code, label in failure_mapping.items():
    print(f"{label}: {failure_counts[code]} occurrences")

# Grand total across all flag columns.
total_failures = failure_counts.sum()
print(f"\nTotal individual failure occurrences: {total_failures}")


Data Shape: (10000, 14)

First few rows:
   UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0    1     M14860    M                298.1                    308.6   
1    2     L47181    L                298.2                    308.7   
2    3     L47182    L                298.1                    308.5   
3    4     L47183    L                298.2                    308.6   
4    5     L47184    L                298.2                    308.7   

   Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  TWF  \
0                    1551         42.8                0                0    0   
1                    1408         46.3                3                0    0   
2                    1498         49.4                5                0    0   
3                    1433         39.5                7                0    0   
4                    1408         40.0                9                0    0   

   HDF  PWF  OSF  RNF  
0    0    0    

In [8]:
# Build the modelling table as a new frame so the raw `df` is left untouched.
# `le_type` retains the fitted Type -> integer mapping after this cell.
le_type = LabelEncoder()

data = df.assign(
    # Raw rpm x Nm product; no unit conversion is applied.
    Power=df['Rotational speed [rpm]'] * df['Torque [Nm]'],
    # Process temperature relative to air temperature (both in kelvin).
    Temperature_Ratio=df['Process temperature [K]'] / df['Air temperature [K]'],
    # Tool wear divided by air temperature shifted from kelvin by 273.15.
    Tool_Wear_Rate=df['Tool wear [min]'] / (df['Air temperature [K]'] - 273.15),
    # One integer code per distinct `Type` value.
    Type_encoded=le_type.fit_transform(df['Type']),
)

# Model inputs: the original sensor readings followed by the derived columns.
sensor_features = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
engineered_features = ['Power', 'Temperature_Ratio', 'Tool_Wear_Rate', 'Type_encoded']
features = sensor_features + engineered_features

# Feature matrix and target column.
X = data[features]
y = data['Machine failure']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
# Mean of the target column, reported as the failure rate.
print(f"Failure rate: {y.mean():.4f}")


Features shape: (10000, 9)
Target shape: (10000,)
Failure rate: 0.0339
