In [16]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load explored data from Week 2
data = pd.read_csv("explored_data_week2.csv")  # Or your raw dataset
print("Data loaded successfully!")

Data loaded successfully!


In [17]:
# Remove UDI and Product ID (non-predictive)
data_clean = data.drop(['UDI', 'Product ID'], axis=1)
print("Columns dropped. Remaining columns:")
print(data_clean.columns)

Columns dropped. Remaining columns:
Index(['Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target',
       'Failure Type'],
      dtype='object')


In [18]:
# One-hot encode 'Type' and 'Failure Type'
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['Type', 'Failure Type'])
    ],
    remainder='passthrough'
)

# Apply transformation
encoded_data = preprocessor.fit_transform(data_clean)
encoded_cols = (preprocessor.named_transformers_['cat']
                .get_feature_names_out(['Type', 'Failure Type']))
remaining_cols = [col for col in data_clean.columns 
                 if col not in ['Type', 'Failure Type']]
encoded_df = pd.DataFrame(encoded_data, 
                         columns=list(encoded_cols) + remaining_cols)

print("\nEncoded Data Head:")
display(encoded_df.head())


Encoded Data Head:


Unnamed: 0,Type_H,Type_L,Type_M,Failure Type_Heat Dissipation Failure,Failure Type_No Failure,Failure Type_Overstrain Failure,Failure Type_Power Failure,Failure Type_Random Failures,Failure Type_Tool Wear Failure,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,298.1,308.6,1551.0,42.8,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,298.2,308.7,1408.0,46.3,3.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,298.1,308.5,1498.0,49.4,5.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,298.2,308.6,1433.0,39.5,7.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,298.2,308.7,1408.0,40.0,9.0,0.0


In [19]:
# Scale numerical features
scaler = StandardScaler()
num_cols = ['Air temperature [K]', 'Process temperature [K]', 
            'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
encoded_df[num_cols] = scaler.fit_transform(encoded_df[num_cols])

print("\nScaled Numerical Features:")
display(encoded_df[num_cols].describe())


Scaled Numerical Features:


Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,6.242317e-13,3.66299e-14,-2.359002e-16,5.428547e-16,1.051603e-16
std,1.00005,1.00005,1.00005,1.00005,1.00005
min,-2.352278,-2.901986,-2.068196,-3.630149,-1.695984
25%,-0.8523974,-0.8125581,-0.6458012,-0.6808401,-0.8633176
50%,0.04753123,0.0636534,-0.1995597,0.01134481,0.0007698234
75%,0.7474757,0.7376623,0.4084443,0.6834663,0.8491466
max,2.247357,2.557486,7.51484,3.672902,2.278819


In [20]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = encoded_df.drop('Target', axis=1)
y = encoded_df['Target']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (8000, 14), Test shape: (2000, 14)


In [21]:
# Save processed data for Week 4
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

print("Processed data saved for modeling!")

Processed data saved for modeling!
