In [5]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scripts.custom_models import catboost_model_tuning
from scripts.custom_models import elastic_net_model_tuning
from scripts.custom_models import random_forest_tuning
from scripts.custom_models import light_gbm_tuning
from scripts.custom_models import knn_model_tuning
from scripts.custom_models import create_stacked_model
from scripts.custom_models import evaluate_model

# Owned Users Prediction
### Loading the dataset

In [6]:
# Load the feature table from the Excel workbook into `df`. The relative path
# is resolved against the kernel's working directory. The data-preparation
# cell expects an 'Owned Users' column, which it uses as the target.
df = pd.read_excel('../../../data/Selected_Features_Owned_Users.xlsx')

### Data preparation

In [7]:
# Separate the regression target from the predictor columns.
y = df['Owned Users']
X = df.drop(labels=['Owned Users'], axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Random Forest
#### Training the model

In [6]:
# Build the random-forest candidate from the scaled training features.
# random_forest_tuning is a project helper (scripts.custom_models); presumably
# it runs a hyperparameter search and returns a fitted estimator — TODO confirm.
# The returned object is used through .predict() in the evaluation cell.
rf_model = random_forest_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [7]:
# Predict on the held-out split and score the predictions. evaluate_model is
# unpacked as (MSE, MAE, R2); that order is assumed from the variable names.
rf_pred = rf_model.predict(X_test_scaled)
mse_rf, mae_rf, r2_rf = evaluate_model(y_test, rf_pred)

# A single newline-separated print emits the same three report lines.
print(
    f"Mean Squared Error: {mse_rf}",
    f"Mean Absolute Error: {mae_rf}",
    f"R2 Score: {r2_rf}",
    sep="\n",
)

Mean Squared Error: 475836.64034838945
Mean Absolute Error: 243.97572828226308
R2 Score: 0.9776419217312802


### LightGBM
#### Training the model

In [8]:
# Build the LightGBM candidate from the scaled training features.
# light_gbm_tuning is a project helper (scripts.custom_models); presumably it
# runs a hyperparameter search and returns a fitted estimator — TODO confirm.
# The returned object is used through .predict() in the evaluation cell.
light_gbm_model = light_gbm_tuning(X_train_scaled, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 586
[LightGBM] [Info] Number of data points in the train set: 16116, number of used features: 21
[LightGBM] [Info] Start training from score 1434.624411


#### Evaluating the model

In [9]:
# Predict on the held-out split and score the predictions. evaluate_model is
# unpacked as (MSE, MAE, R2); that order is assumed from the variable names.
light_gbm_pred = light_gbm_model.predict(X_test_scaled)
mse_gbm, mae_gbm, r2_gbm = evaluate_model(y_test, light_gbm_pred)

# A single newline-separated print emits the same three report lines.
print(
    f"Mean Squared Error: {mse_gbm}",
    f"Mean Absolute Error: {mae_gbm}",
    f"R2 Score: {r2_gbm}",
    sep="\n",
)

Mean Squared Error: 3110995.83728732
Mean Absolute Error: 290.1990815788666
R2 Score: 0.8538240174762389


### CatBoost
#### Training the model

In [10]:
# Build the CatBoost candidate from the scaled training features.
# catboost_model_tuning is a project helper (scripts.custom_models); presumably
# it runs a hyperparameter search and returns a fitted estimator — TODO confirm.
# The returned object is used through .predict() in the evaluation cell.
catboost_model = catboost_model_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [11]:
# Predict on the held-out split and score the predictions. evaluate_model is
# unpacked as (MSE, MAE, R2); that order is assumed from the variable names.
# NOTE(review): `mace_cat` (sic) holds the mean absolute error. The spelling is
# kept because the model-comparison cell reads the value under this name.
catboost_pred = catboost_model.predict(X_test_scaled)
mse_cat, mace_cat, r2_cat = evaluate_model(y_test, catboost_pred)

# A single newline-separated print emits the same three report lines.
print(
    f"Mean Squared Error: {mse_cat}",
    f"Mean Absolute Error: {mace_cat}",
    f"R2 Score: {r2_cat}",
    sep="\n",
)

Mean Squared Error: 2534222.237316818
Mean Absolute Error: 304.3432873625702
R2 Score: 0.8809248083737834


### Elastic Net
#### Training the model

In [12]:
# Build the Elastic Net candidate from the scaled training features.
# elastic_net_model_tuning is a project helper (scripts.custom_models);
# presumably it runs a hyperparameter search and returns a fitted estimator —
# TODO confirm. The returned object is used through .predict() in the
# evaluation cell.
elastic_net_model = elastic_net_model_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [13]:
# Predict on the held-out split and score the predictions. evaluate_model is
# unpacked as (MSE, MAE, R2); that order is assumed from the variable names.
elastic_pred = elastic_net_model.predict(X_test_scaled)
mse_elastic, mae_elastic, r2_elastic = evaluate_model(y_test, elastic_pred)

# A single newline-separated print emits the same three report lines.
print(
    f"Mean Squared Error: {mse_elastic}",
    f"Mean Absolute Error: {mae_elastic}",
    f"R2 Score: {r2_elastic}",
    sep="\n",
)

Mean Squared Error: 482530.7273472649
Mean Absolute Error: 302.1182012678711
R2 Score: 0.9773273874807255


### KNN
#### Training the model

In [14]:
# Build the k-nearest-neighbours candidate from the scaled training features.
# knn_model_tuning is a project helper (scripts.custom_models); presumably it
# runs a hyperparameter search and returns a fitted estimator — TODO confirm.
# The returned object is used through .predict() in the evaluation cell.
knn_model = knn_model_tuning(X_train_scaled, y_train)

#### Evaluating the model

In [15]:
# Predict on the held-out split and score the predictions. evaluate_model is
# unpacked as (MSE, MAE, R2); that order is assumed from the variable names.
knn_pred = knn_model.predict(X_test_scaled)
mse_knn, mae_knn, r2_knn = evaluate_model(y_test, knn_pred)

# A single newline-separated print emits the same three report lines.
print(
    f"Mean Squared Error: {mse_knn}",
    f"Mean Absolute Error: {mae_knn}",
    f"R2 Score: {r2_knn}",
    sep="\n",
)

Mean Squared Error: 2708795.9377108454
Mean Absolute Error: 665.9317341783538
R2 Score: 0.8727221351744014


### Stacked Model
#### Training the model

In [16]:
# Build the stacked-ensemble candidate from the scaled training features.
# create_stacked_model is a project helper (scripts.custom_models); which base
# learners and meta-learner it combines is not visible here — TODO confirm.
# The returned object is used through .predict() in the evaluation cell.
stacked_model = create_stacked_model(X_train_scaled, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 586
[LightGBM] [Info] Number of data points in the train set: 16116, number of used features: 21
[LightGBM] [Info] Start training from score 1434.624411


#### Evaluating the model

In [17]:
# Predict on the held-out split and score the predictions. evaluate_model is
# unpacked as (MSE, MAE, R2); that order is assumed from the variable names.
stacked_pred = stacked_model.predict(X_test_scaled)
mse_stacked, mae_stacked, r2_stacked = evaluate_model(y_test, stacked_pred)

# A single newline-separated print emits the same three report lines.
print(
    f"Mean Squared Error: {mse_stacked}",
    f"Mean Absolute Error: {mae_stacked}",
    f"R2 Score: {r2_stacked}",
    sep="\n",
)

Mean Squared Error: 450486.94298113906
Mean Absolute Error: 275.61134321399396
R2 Score: 0.978833024874179


### Conclusion
#### Selecting the best model (lowest Mean Squared Error)

In [18]:
# One record per candidate: (display name, MSE, MAE, R2, model object).
model_results = [
    ('Random Forest', mse_rf, mae_rf, r2_rf, rf_model),
    ('LightGBM', mse_gbm, mae_gbm, r2_gbm, light_gbm_model),
    ('Catboost', mse_cat, mace_cat, r2_cat, catboost_model),
    ('Elastic Net', mse_elastic, mae_elastic, r2_elastic, elastic_net_model),
    ('Knn', mse_knn, mae_knn, r2_knn, knn_model),
    ('Stacked Model', mse_stacked, mae_stacked, r2_stacked, stacked_model),
]

# The record with the smallest value in position 1 (the MSE slot) wins; on a
# tie the earliest entry in the list is kept.
# NOTE(review): these metrics were computed on the test split, so the winner's
# reported score is optimistically biased — consider selecting on a separate
# validation split or with cross-validation.
best_model = min(model_results, key=lambda result: result[1])

print(f"Best Model: {best_model[0]}")

Best Model: Stacked Model


#### Saving the best model

In [9]:
# Persist the winning model (slot 4 of the best_model record) together with
# the scaler. Every model in this notebook is trained and evaluated on
# scaler-transformed features, so the two artifacts are saved side by side.
joblib.dump(
    value=best_model[4],
    filename='../../../app/saved_models/owned_users_model.pkl',
)
# Kept as the cell's last expression so the list of written paths is displayed.
joblib.dump(
    value=scaler,
    filename='../../../app/saved_models/owned_users_scaler.pkl',
)

['../../../app/saved_models/owned_users_scaler.pkl']