In [1]:
# Enable autoreload for development: with mode 2, IPython re-imports edited
# modules (such as the project's `src.*` helpers used below) before running
# each cell, so code changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Setup
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
# Make the parent of the working directory importable so that the
# `src.*` project modules used in later cells can be resolved.
# The membership guard keeps this cell idempotent: re-running it does not
# pile duplicate entries onto sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
# Load and clean data
from src.load_data import load_data

In [5]:
# Load the raw dataset through the project loader and display it for a
# first look at the columns and values.
df = load_data()
df

Unnamed: 0,No.,Diameter (mm),Speed (rpm),Feed (mm/rev),Thrust force (N),Torque (Nm),Flank wear (mm),Workpiece
0,1,9.0,500,0.13,1088.1,10.67,0.10,cast iron
1,2,9.0,500,0.18,1435.1,14.66,0.13,cast iron
2,3,9.0,500,0.25,1588.3,16.04,0.06,cast iron
3,4,9.0,500,0.36,1669.8,17.12,0.09,cast iron
4,5,9.0,400,0.13,1150.9,11.22,0.12,cast iron
...,...,...,...,...,...,...,...,...
160,161,10.0,1000,0.50,1960.0,18.13,0.13,copper
161,162,7.5,1000,0.50,784.0,7.35,0.10,copper
162,163,5.0,1000,0.50,651.0,6.17,0.07,copper
163,164,10.0,1000,0.71,2009.0,20.58,0.17,copper


In [6]:
# Cleaning step: run the project's cleaning routine on the raw frame and keep
# the result under a separate name so `df` stays untouched.
# `add_features` is imported here for use in the next cell.
from src.clean_data import clean_dataset, add_features
df_clean = clean_dataset(df)

In [7]:
# Feature engineering: extend the cleaned frame with the project's derived features.
# NOTE(review): `df_clean` is rebound from itself, so re-running only this cell
# feeds an already-augmented frame back into `add_features` — re-run the
# cleaning cell first unless `add_features` is confirmed to be idempotent.
df_clean = add_features(df_clean)  # ⬅️ Feature Engineering : this includes new features

In [8]:
# Collect the one-hot encoded workpiece indicator columns by their shared
# prefix (in this dataset: 'workpiece_copper', 'workpiece_mild steel').
is_workpiece_col = df_clean.columns.str.startswith('workpiece_')
workpiece_cols = df_clean.columns[is_workpiece_col].tolist()

In [9]:
# Rescale thrust force from newtons to kilonewtons (divide by 1000).
# The column keeps its original 'thrust_force_(n)' name so the cells below that
# select it by name keep working — but from here on its values are in kN.
# The attrs flag makes this cell idempotent: without it, every re-run would
# divide the already-scaled column by 1000 again. A freshly rebuilt `df_clean`
# (after re-running the cleaning cell) carries no flag and is rescaled once.
if not df_clean.attrs.get("thrust_force_in_kn", False):
    df_clean["thrust_force_(n)"] = df_clean["thrust_force_(n)"] / 1000
    df_clean.attrs["thrust_force_in_kn"] = True

In [10]:
# Print the column names of the cleaned frame for a visual check that the
# columns selected in the following cells are present (nothing is asserted here).
print("Cleaned Columns:", list(df_clean.columns))

Cleaned Columns: ['no.', 'diameter_(mm)', 'speed_(rpm)', 'feed_(mm/rev)', 'thrust_force_(n)', 'torque_(nm)', 'flank_wear_(mm)', 'workpiece_copper', 'workpiece_mild steel', 'speed_per_dia', 'log_feed', 'torque_feed_ratio', 'material_hardness', 'feed_squared', 'speed_squared']


In [11]:
# Sanity check: (rows, columns) of the cleaned, feature-augmented frame.
df_clean.shape

(164, 15)

In [12]:
import numpy as np

# Feature matrix: raw process parameters, engineered features, and the
# one-hot workpiece indicator columns collected earlier.
# NOTE(review): 'torque_feed_ratio' presumably derives from torque, which is
# also one of the targets below — possible target leakage; confirm against
# `add_features` before trusting the torque scores.
X = df_clean.loc[:, [
    'diameter_(mm)', 'speed_(rpm)', 'feed_(mm/rev)',
    'speed_per_dia', 'log_feed',
    'torque_feed_ratio', 'material_hardness',
    'feed_squared', 'speed_squared',
] + workpiece_cols]

# Targets: all three responses, log-transformed in a single vectorised call to
# handle skewness and stabilise variance. log1p (log(1 + x)) is used so that
# zero values do not cause problems. The result is a new frame; `df_clean`
# itself is left unchanged.
y = np.log1p(df_clean[['thrust_force_(n)', 'torque_(nm)', 'flank_wear_(mm)']])

Interpretation: (a) Torque values appear to follow a roughly normal distribution with no significant outliers.
(b) Flank wear showed outliers or noise at the high end, which were removed during cleaning.

In [16]:
# Model tuning: search random-forest hyperparameters on the full feature and
# (log-scale) target frames, then report the winning configuration.
from src.tune_models import tune_random_forest

best_rf, best_params, cv_results = tune_random_forest(X, y)
print("Best Params:", best_params)

# Tabulate the cross-validation results, best mean test score first, and show
# the top five candidates as the cell output.
cv_df = pd.DataFrame(cv_results).sort_values(by='mean_test_score', ascending=False)
cv_df[['params', 'mean_test_score', 'rank_test_score']].head()

Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}


Unnamed: 0,params,mean_test_score,rank_test_score
2,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.011457,1
56,"{'max_depth': 30, 'max_features': 'sqrt', 'min...",0.011457,1
38,"{'max_depth': 20, 'max_features': 'sqrt', 'min...",0.011457,1
65,"{'max_depth': 30, 'max_features': 'log2', 'min...",0.011457,1
47,"{'max_depth': 20, 'max_features': 'log2', 'min...",0.011457,1


In [17]:
# Model training and evaluation
from src.model_train import train_models

# NOTE(review): only X and y are passed here; the tuned `best_rf` /
# `best_params` from the tuning cell are not handed over — confirm whether
# `train_models` is meant to use them.
results = train_models(X, y)

# Unpack the held-out split, the fitted models and their metrics.
# The test targets are still on the log1p scale at this point, hence the
# `_log` suffix.
X_test, y_test_log, models, metrics = (
    results[key] for key in ('X_test', 'y_test', 'models', 'metrics')
)

# Undo the log1p transform to get targets in their real-world units.
# The log-scale values are clipped to [0, 20] before exponentiating —
# presumably to guard expm1 against overflow; TODO confirm the intended bounds.
y_test_orig = np.expm1(np.clip(y_test_log.values, 0, 20))

In [18]:
# Flatten the per-model metric lists into one row per (model, target) pair.
# Each metric entry is expected to hold one value per target column, in the
# same order as `y.columns`; a metric missing for a model is filled with None.
metric_names = ('R', 'R2', 'MAE', 'MSE', 'RMSE')
n_targets = len(y.columns)

rows = [
    {
        'Model': model_name,
        'Target': target,
        **{
            name: model_metrics.get(name, [None] * n_targets)[idx]
            for name in metric_names
        },
    }
    for model_name, model_metrics in metrics.items()
    for idx, target in enumerate(y.columns)
]

# Group the rows visually by target so the models can be compared side by side.
summary_df = pd.DataFrame(rows)
display(summary_df.sort_values(by='Target'))

Unnamed: 0,Model,Target,R,R2,MAE,MSE,RMSE
2,RandomForest,flank_wear_(mm),0.655633,0.373639,0.023349,0.001014,0.031846
5,AdaBoost,flank_wear_(mm),0.538123,0.213846,0.027667,0.001273,0.035677
8,KNN,flank_wear_(mm),0.669224,0.074072,0.027925,0.001499,0.038719
0,RandomForest,thrust_force_(n),0.98004,0.959958,0.137513,0.028925,0.170072
3,AdaBoost,thrust_force_(n),0.952195,0.905523,0.21225,0.068246,0.261239
6,KNN,thrust_force_(n),0.487379,0.170305,0.569911,0.599337,0.774169
1,RandomForest,torque_(nm),0.958963,0.913695,1.457048,5.304897,2.303236
4,AdaBoost,torque_(nm),0.925562,0.849684,2.27722,9.239424,3.039642
7,KNN,torque_(nm),0.575624,0.329752,5.516239,41.197778,6.41855
