In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from scripts.custom_models import random_forest_tuning
from scripts.custom_models import light_gbm_tuning
from scripts.custom_models import create_stacked_model
from scripts.custom_models import evaluate_model

# Complexity Average Prediction
### Loading the dataset

In [2]:
# Read the pre-selected feature table (features plus the 'Complexity Average'
# target column) from the Excel workbook into a DataFrame.
dataset_path = '../../../data/Selected_Features_Complexity_Average.xlsx'
df = pd.read_excel(dataset_path)

### Data preparation

In [3]:
# Separate the regression target from the feature columns.
target_column = 'Complexity Average'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so no test information leaks
# into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

### Random Forest
#### Training the model

In [14]:
# Build the Random Forest model from the scaled training features and the
# training targets using the project helper `random_forest_tuning`.
# NOTE(review): the tuning strategy lives in scripts.custom_models and is not
# visible here; the returned object is used through `.predict()` in the next cell.
rf_model = random_forest_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [15]:
# Predict on the scaled hold-out features and score against the true targets.
# NOTE(review): the (MSE, MAE, R2) return order of `evaluate_model` is inferred
# from the names it is unpacked into — confirm against scripts.custom_models.
rf_pred = rf_model.predict(X_test_scaled)
mse_rf, mae_rf, r2_rf = evaluate_model(y_test, rf_pred)

# Single print call, one metric per line.
print(
    f"Mean Squared Error: {mse_rf}",
    f"Mean Absolute Error: {mae_rf}",
    f"R2 Score: {r2_rf}",
    sep="\n",
)

Mean Squared Error: 0.2736138320863835
Mean Absolute Error: 0.3801807307800528
R2 Score: 0.6216695138965629


### LightGBM
#### Training the model

In [16]:
# Build the LightGBM model from the scaled training features and the training
# targets using the project helper `light_gbm_tuning`.
# NOTE(review): the tuning strategy lives in scripts.custom_models and is not
# visible here; the returned object is used through `.predict()` in the next cell.
light_gbm_model = light_gbm_tuning(X_train_scaled, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 397
[LightGBM] [Info] Number of data points in the train set: 16093, number of used features: 21
[LightGBM] [Info] Start training from score 1.987583


#### Evaluating the model

In [17]:
# Predict on the scaled hold-out features and score against the true targets.
# NOTE(review): the (MSE, MAE, R2) return order of `evaluate_model` is inferred
# from the names it is unpacked into — confirm against scripts.custom_models.
light_gbm_pred = light_gbm_model.predict(X_test_scaled)
mse_gbm, mae_gbm, r2_gbm = evaluate_model(y_test, light_gbm_pred)

# Single print call, one metric per line.
print(
    f"Mean Squared Error: {mse_gbm}",
    f"Mean Absolute Error: {mae_gbm}",
    f"R2 Score: {r2_gbm}",
    sep="\n",
)

Mean Squared Error: 0.2612489932901297
Mean Absolute Error: 0.36958685248458034
R2 Score: 0.6387665862072216


### Stacked Model
#### Training the model

In [18]:
# Build the stacked model from the scaled training features and the training
# targets using the project helper `create_stacked_model`.
# NOTE(review): which base learners and meta-learner are stacked is defined in
# scripts.custom_models and is not visible here; the LightGBM log lines in this
# cell's output suggest LightGBM is one of them — confirm.
stacked_model = create_stacked_model(X_train_scaled, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001501 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 397
[LightGBM] [Info] Number of data points in the train set: 16093, number of used features: 21
[LightGBM] [Info] Start training from score 1.987583


#### Evaluating the model

In [19]:
# Predict on the scaled hold-out features and score against the true targets.
# NOTE(review): the (MSE, MAE, R2) return order of `evaluate_model` is inferred
# from the names it is unpacked into — confirm against scripts.custom_models.
stacked_pred = stacked_model.predict(X_test_scaled)
mse_stacked, mae_stacked, r2_stacked = evaluate_model(y_test, stacked_pred)

# Single print call, one metric per line.
print(
    f"Mean Squared Error: {mse_stacked}",
    f"Mean Absolute Error: {mae_stacked}",
    f"R2 Score: {r2_stacked}",
    sep="\n",
)

Mean Squared Error: 0.25814023993570256
Mean Absolute Error: 0.36707762783307846
R2 Score: 0.6430651121946972


### Conclusion
#### Selecting the best model (lowest test-set MSE) along with its evaluation metrics

In [20]:
# Each candidate is a tuple: (display name, MSE, MAE, R2, fitted model).
candidates = [
    ('Random Forest', mse_rf, mae_rf, r2_rf, rf_model),
    ('LightGBM', mse_gbm, mae_gbm, r2_gbm, light_gbm_model),
    ('Stacked Model', mse_stacked, mae_stacked, r2_stacked, stacked_model),
]

# The winner is the candidate with the smallest MSE (tuple position 1).
# NOTE(review): these metrics were computed on the test split, so choosing the
# model by them makes the winner's reported scores optimistically biased.
best_model = min(candidates, key=lambda candidate: candidate[1])

print(f"Best Model: {best_model[0]}")

Best Model: Stacked Model


#### Saving the best model

In [5]:
# Persist the selected estimator (element 4 of the `best_model` tuple) and the
# scaler fitted on the training split.
model_path = '../../../app/saved_models/complexity_average_model.pkl'
scaler_path = '../../../app/saved_models/complexity_average_scaler.pkl'

# joblib.dump does not create missing directories; make sure the target folder
# exists so this cell also succeeds on a fresh checkout.
for artifact_path in (model_path, scaler_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model[4], model_path)
joblib.dump(scaler, scaler_path)

['../../../app/saved_models/complexity_average_scaler.pkl']