## 1. Machine Learning Model Selection and Evaluation

We begin by importing necessary libraries

In [1]:
# Importing necessary libraries
# General libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

 In the cell below, `cleaned_data.csv` as `model_df`:

In [2]:
# Loading the data
model_df = pd.read_csv('cleaned_data.csv')
model_df

Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity,quarter_Quarter1,quarter_Quarter2,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5,department_finishing,department_sweing,month,day_of_week
0,8,0.80,1.014552,1108.0,0.750589,98,0.0,0,0,59.0,0.940725,1,0,0,0,0,0,1,1,3
1,1,0.75,-1.016778,1039.0,-1.077682,0,0.0,0,0,8.0,0.886500,1,0,0,0,0,1,0,1,3
2,11,0.80,-0.333878,968.0,-0.271092,50,0.0,0,0,30.5,0.800570,1,0,0,0,0,0,1,1,3
3,12,0.80,-0.333878,968.0,-0.271092,50,0.0,0,0,30.5,0.800570,1,0,0,0,0,0,1,1,3
4,6,0.80,0.990783,1170.0,-0.790895,50,0.0,0,0,56.0,0.800382,1,0,0,0,0,0,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,10,0.75,-1.111853,1039.0,-1.077682,0,0.0,0,0,8.0,0.628333,0,1,0,0,0,1,0,3,2
1193,8,0.70,-1.020434,1039.0,-1.077682,0,0.0,0,0,8.0,0.625625,0,1,0,0,0,1,0,3,2
1194,7,0.65,-1.020434,1039.0,-1.077682,0,0.0,0,0,8.0,0.625625,0,1,0,0,0,1,0,3,2
1195,9,0.75,-1.111853,1039.0,-0.826743,0,0.0,0,0,15.0,0.505889,0,1,0,0,0,1,0,3,2


### Data Splitting: 

* Split the data into training and test sets (80% training, 20% testing). 

* Set the random_state to 42

In [3]:
# Split the data into features (X) and target variable (y)
X = model_df.drop(columns='actual_productivity', axis=1)
y = model_df['actual_productivity']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Selection and Training

we train multiple regression models to predict productivity:

#### * Linear Regression

A basic regression model as a base model. 

In [4]:
# Initialize and train the model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = lr.predict(X_test)

In [5]:
def evaluate_model(y_test, y_pred, model_name="Model"):
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5  # Root Mean Squared Error
    r2 = r2_score(y_test, y_pred)
    
    # Display the results
    print(f"--- {model_name} Performance ---")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")
    
    # Return metrics as a dictionary
    #return {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}

In [6]:
# Evaluate on test set
test_metrics = evaluate_model(y_test, y_pred_lr, model_name="Linear Regression")
test_metrics

--- Linear Regression Performance ---
Mean Absolute Error (MAE): 0.0928
Mean Squared Error (MSE): 0.0166
Root Mean Squared Error (RMSE): 0.1288
R² Score: 0.3376


#### * Ridge and Lasso Regression

Regularized linear models to handle multicollinearity and overfitting. 


In [7]:
# Initialize Ridge and Lasso models
ridge_model = Ridge(alpha=1.0)  
lasso_model = Lasso(alpha=0.1)  

# Train the models
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

# Make predictions
y_pred_ridge = ridge_model.predict(X_test)
y_pred_lasso = lasso_model.predict(X_test)

In [8]:
# Evaluate on test set
Ridge_test_metrics = evaluate_model(y_test, y_pred_ridge, model_name="Ridge Regression")
Lasso_test_metrics = evaluate_model(y_test, y_pred_lasso, model_name="Lasso Regression")

--- Ridge Regression Performance ---
Mean Absolute Error (MAE): 0.0931
Mean Squared Error (MSE): 0.0165
Root Mean Squared Error (RMSE): 0.1286
R² Score: 0.3398
--- Lasso Regression Performance ---
Mean Absolute Error (MAE): 0.1052
Mean Squared Error (MSE): 0.0190
Root Mean Squared Error (RMSE): 0.1380
R² Score: 0.2404


#### * Random Forest Regressor

An ensemble model using decision trees for robustness.


In [9]:
# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rfr = rf_model.predict(X_test)

In [10]:
# Evaluate on testing data
evaluate_model(y_test, y_pred_rfr, model_name="Random Forest Regressor")

--- Random Forest Regressor Performance ---
Mean Absolute Error (MAE): 0.0673
Mean Squared Error (MSE): 0.0112
Root Mean Squared Error (RMSE): 0.1057
R² Score: 0.5539


#### * Gradient Boosting Regressor

A boosting model for iterative accuracy improvement. 


In [11]:
# Initialize the Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Make predictions
y_pred_gb = gb_model.predict(X_test)

In [12]:
# Evaluate on testing data
evaluate_model(y_test, y_pred_gb, model_name="Gradient Boosting (Test)")

--- Gradient Boosting (Test) Performance ---
Mean Absolute Error (MAE): 0.0761
Mean Squared Error (MSE): 0.0134
Root Mean Squared Error (RMSE): 0.1159
R² Score: 0.4638


#### * XGBoost Regressor
An advanced boosting algorithm with optimizations for speed and performance. 


In [13]:
# Initialize the XGBoost Regressor
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    objective='reg:squarederror'
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

In [14]:
# Evaluate on testing data
evaluate_model(y_test, y_pred_xgb, model_name="XGBoost Regressor")

--- XGBoost Regressor Performance ---
Mean Absolute Error (MAE): 0.0752
Mean Squared Error (MSE): 0.0128
Root Mean Squared Error (RMSE): 0.1132
R² Score: 0.4886


#### * Support Vector Regressor (SVR)
A kernel-based model to capture complex relationships


In [15]:
# Initialize the SVR model
svr_model = SVR(kernel='rbf', C=10, epsilon=0.1)

# Fit the model
svr_model.fit(X_train, y_train.ravel())

# Predict on the testing sets
y_pred_svr = svr_model.predict(X_test)

In [16]:
# Evaluate on testing data
evaluate_model(y_test, y_pred_svr, model_name=" SVR ")

---  SVR  Performance ---
Mean Absolute Error (MAE): 0.1069
Mean Squared Error (MSE): 0.0192
Root Mean Squared Error (RMSE): 0.1386
R² Score: 0.2335


#### *AdaBoost Regressor

An ensemble learning technique that combines multiple weak learners to create a strong model

In [17]:
# Initialize the base learner (Decision Tree Regressor)
base_learner = DecisionTreeRegressor(max_depth=3)

# Initialize AdaBoost Regressor
ada_boost_model = AdaBoostRegressor(base_estimator=base_learner, n_estimators=50, learning_rate=0.1)

# Fit the model
ada_boost_model.fit(X_train, y_train.ravel())

# Predict on testing sets
y_pred_ada = ada_boost_model.predict(X_test)

In [18]:
# Evaluate on testing data
evaluate_model(y_test, y_pred_ada, model_name="AdaBoost ")

--- AdaBoost  Performance ---
Mean Absolute Error (MAE): 0.0907
Mean Squared Error (MSE): 0.0150
Root Mean Squared Error (RMSE): 0.1223
R² Score: 0.4030
