In [1]:
import os

import joblib
import pandas as pd
from keras import Sequential
from keras.src.layers import Dense, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from scripts.custom_models import catboost_model_tuning
from scripts.custom_models import create_stacked_model
from scripts.custom_models import elastic_net_model_tuning
from scripts.custom_models import evaluate_model
from scripts.custom_models import knn_model_tuning
from scripts.custom_models import light_gbm_tuning
from scripts.custom_models import random_forest_tuning

# Rating Average Prediction
### Loading the dataset

In [2]:
# Read the pre-selected feature table (predictors plus the 'Rating Average' target).
dataset_path = '../../../data/Selected_Features_Rating_Average.xlsx'
df = pd.read_excel(dataset_path)

### Data preparation

In [3]:
# Separate the regression target from the predictor columns.
TARGET_COLUMN = 'Rating Average'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Learn the standardisation parameters from the training split only, then apply
# that same fitted transform to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Random Forest
#### Training the model

In [12]:
# Fit the random forest on the standardised training split. The tuning procedure
# itself lives in scripts.custom_models.random_forest_tuning and is not shown here.
rf_model = random_forest_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [13]:
# Score the random forest on the held-out test split.
# evaluate_model is expected to return (MSE, MAE, R2) in that order, matching the
# labels printed below -- TODO confirm against scripts.custom_models.
rf_pred = rf_model.predict(X_test_scaled)
mse_rf, mae_rf, r2_rf = evaluate_model(y_test, rf_pred)

print(
    f"Mean Squared Error: {mse_rf}",
    f"Mean Absolute Error: {mae_rf}",
    f"R2 Score: {r2_rf}",
    sep="\n",
)

Mean Squared Error: 0.5947760893517157
Mean Absolute Error: 0.5684910181911146
R2 Score: 0.2578759199541437


### LightGBM
#### Training the model

In [14]:
# Fit LightGBM on the standardised training split. The tuning procedure lives in
# scripts.custom_models.light_gbm_tuning and is not shown here; the LightGBM
# [Info] lines in this cell's output are emitted during that call.
light_gbm_model = light_gbm_tuning(X_train_scaled, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 374
[LightGBM] [Info] Number of data points in the train set: 16116, number of used features: 13
[LightGBM] [Info] Start training from score 6.405470


#### Evaluating the model

In [15]:
# Score LightGBM on the held-out test split.
# evaluate_model is expected to return (MSE, MAE, R2) in that order, matching the
# labels printed below -- TODO confirm against scripts.custom_models.
light_gbm_pred = light_gbm_model.predict(X_test_scaled)
mse_gbm, mae_gbm, r2_gbm = evaluate_model(y_test, light_gbm_pred)

print(
    f"Mean Squared Error: {mse_gbm}",
    f"Mean Absolute Error: {mae_gbm}",
    f"R2 Score: {r2_gbm}",
    sep="\n",
)

Mean Squared Error: 0.5826194540367701
Mean Absolute Error: 0.5650555130421119
R2 Score: 0.27304420254161343


### CatBoost
#### Training the model

In [16]:
# Fit CatBoost on the standardised training split. The tuning procedure lives in
# scripts.custom_models.catboost_model_tuning and is not shown here.
catboost_model = catboost_model_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [17]:
# Score CatBoost on the held-out test split.
# evaluate_model is expected to return (MSE, MAE, R2) in that order, matching the
# labels printed below -- TODO confirm against scripts.custom_models.
catboost_pred = catboost_model.predict(X_test_scaled)
mse_cat, mae_cat, r2_cat = evaluate_model(y_test, catboost_pred)
# Backward-compatible alias: the model-comparison cell still reads the original,
# misspelled name `mace_cat`. Drop this once that cell uses `mae_cat`.
mace_cat = mae_cat

print(f"Mean Squared Error: {mse_cat}")
print(f"Mean Absolute Error: {mae_cat}")
print(f"R2 Score: {r2_cat}")

Mean Squared Error: 0.5806379149300978
Mean Absolute Error: 0.5647620459025978
R2 Score: 0.27551664202420423


### Elastic Net
#### Training the model

In [18]:
# Fit the elastic net on the standardised training split. The tuning procedure
# lives in scripts.custom_models.elastic_net_model_tuning and is not shown here.
elastic_net_model = elastic_net_model_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [19]:
# Score the elastic net on the held-out test split.
# evaluate_model is expected to return (MSE, MAE, R2) in that order, matching the
# labels printed below -- TODO confirm against scripts.custom_models.
elastic_pred = elastic_net_model.predict(X_test_scaled)
mse_elastic, mae_elastic, r2_elastic = evaluate_model(y_test, elastic_pred)

print(
    f"Mean Squared Error: {mse_elastic}",
    f"Mean Absolute Error: {mae_elastic}",
    f"R2 Score: {r2_elastic}",
    sep="\n",
)

Mean Squared Error: 0.6762433675402758
Mean Absolute Error: 0.6227091461874976
R2 Score: 0.1562261899768963


### KNN
#### Training the model

In [20]:
# Fit the k-nearest-neighbours regressor on the standardised training split. The
# tuning procedure lives in scripts.custom_models.knn_model_tuning and is not shown here.
knn_model = knn_model_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [21]:
# Score the KNN model on the held-out test split.
# evaluate_model is expected to return (MSE, MAE, R2) in that order, matching the
# labels printed below -- TODO confirm against scripts.custom_models.
knn_pred = knn_model.predict(X_test_scaled)
mse_knn, mae_knn, r2_knn = evaluate_model(y_test, knn_pred)

print(
    f"Mean Squared Error: {mse_knn}",
    f"Mean Absolute Error: {mae_knn}",
    f"R2 Score: {r2_knn}",
    sep="\n",
)

Mean Squared Error: 0.6268210457980393
Mean Absolute Error: 0.6004626474364472
R2 Score: 0.2178922450072267


### Stacked Model
#### Training the model

In [22]:
# Build and fit the stacked ensemble on the standardised training split. Its
# composition is defined in scripts.custom_models.create_stacked_model and is not
# shown here; the LightGBM [Info] lines in this cell's output are emitted during
# that call.
stacked_model = create_stacked_model(X_train_scaled, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 374
[LightGBM] [Info] Number of data points in the train set: 16116, number of used features: 13
[LightGBM] [Info] Start training from score 6.405470


#### Evaluating the model

In [23]:
# Score the stacked ensemble on the held-out test split.
# evaluate_model is expected to return (MSE, MAE, R2) in that order, matching the
# labels printed below -- TODO confirm against scripts.custom_models.
stacked_pred = stacked_model.predict(X_test_scaled)
mse_stacked, mae_stacked, r2_stacked = evaluate_model(y_test, stacked_pred)

print(
    f"Mean Squared Error: {mse_stacked}",
    f"Mean Absolute Error: {mae_stacked}",
    f"R2 Score: {r2_stacked}",
    sep="\n",
)

Mean Squared Error: 0.5753851226988431
Mean Absolute Error: 0.5624325153766202
R2 Score: 0.28207074477325844


### Conclusion
#### Selecting the best model (lowest test-set Mean Squared Error)

In [24]:
# One record per candidate: (display name, MSE, MAE, R2, fitted estimator).
candidates = [
    ('Random Forest', mse_rf, mae_rf, r2_rf, rf_model),
    ('LightGBM', mse_gbm, mae_gbm, r2_gbm, light_gbm_model),
    ('Catboost', mse_cat, mace_cat, r2_cat, catboost_model),
    ('Elastic Net', mse_elastic, mae_elastic, r2_elastic, elastic_net_model),
    ('Knn', mse_knn, mae_knn, r2_knn, knn_model),
    ('Stacked Model', mse_stacked, mae_stacked, r2_stacked, stacked_model),
]

# The winner is the record with the smallest MSE (tuple index 1); on an exact tie
# min() keeps the first such record in the list.
# NOTE(review): the ranking uses the same test split the metrics above are reported
# on, so the winner's score is optimistically biased -- consider selecting on a
# separate validation split.
best_model = min(candidates, key=lambda candidate: candidate[1])

print(f"Best Model: {best_model[0]}")

Best Model: Stacked Model


#### Saving the best model

In [5]:
# Persist the selected estimator (element 4 of the best_model record) together with
# the scaler that was fitted on the training split, since the model was trained on
# scaled features.
MODEL_DIR = '../../../app/saved_models'

# joblib.dump opens the target file directly and fails if its parent directory is
# missing, so make sure the output directory exists first.
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(best_model[4], f'{MODEL_DIR}/rating_average_model.pkl')
joblib.dump(scaler, f'{MODEL_DIR}/rating_average_scaler.pkl')

['../../../app/saved_models/rating_average_scaler.pkl']