In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import numpy as np

import joblib

from sklearn.metrics import mean_squared_error

In [2]:
# Read the train and test splits from the local data/ directory.
# NOTE(review): the 'Class' dtype override is not referenced by any other
# visible cell (the target here is 'profit') - confirm the CSVs actually
# contain a 'Class' column, otherwise this override can be removed.
csv_dtypes = {'Class': bool}
X_train = pd.read_csv('data/train.csv', dtype=csv_dtypes)
X_test = pd.read_csv('data/test.csv', dtype=csv_dtypes)

In [3]:
# Columns passed to the models as inputs. The order is kept exactly as
# listed (presumably the order the saved models were fitted with - TODO confirm).
features = [
    'ship_mode',
    'segment',
    'region',
    'category',
    'sub_category',
    'sales',
    'quantity',
    'discount',
    'state',
]

# Name of the column the models predict.
target = 'profit'

In [4]:
# Convert every text/categorical column to a pandas Categorical.
#
# The category levels are defined by the TRAINING data (in order of first
# appearance) and reused for the test data, so a given label maps to the same
# integer code in both frames. Labels that only occur in the test set are
# appended after the training levels instead of being silently turned into NaN.
for feature in X_train.select_dtypes(include=['object', 'category']).columns.tolist():
    # NaN cannot be a category level (pd.Categorical raises), so drop it first;
    # missing values simply stay missing in the resulting column.
    train_levels = list(X_train[feature].dropna().unique())
    X_train[feature] = pd.Categorical(X_train[feature], categories=train_levels)

    # Columns that exist only in the training frame have no test counterpart.
    if feature not in X_test.columns:
        continue

    known_levels = set(train_levels)
    test_only_levels = [
        level
        for level in X_test[feature].dropna().unique()
        if level not in known_levels
    ]
    X_test[feature] = pd.Categorical(
        X_test[feature],
        categories=train_levels + test_only_levels,
    )

In [5]:
# Separate the target from the inputs: y_train holds the target column and
# X_train is narrowed to the model features. The target is dropped explicitly
# first, so it is removed even before the feature selection is applied.
y_train = X_train[target]
X_train = X_train.drop(columns=[target])[features]

In [6]:
# Load the previously saved estimators from models/.
# NOTE(review): joblib files are pickle-based, so only load model files that
# come from a trusted source.
model_et, model_gbr, model_knr, model_lgb, model_rf = (
    joblib.load(model_path)
    for model_path in (
        'models/best_model_et.pkl',
        'models/best_model_gbr.pkl',
        'models/best_model_knr.pkl',
        'models/best_model_lightgbm.pkl',
        'models/best_model_randomforest.pkl',
    )
)

In [7]:
# Print RMSE and the model's own score for each loaded estimator.
# Both metrics are computed on the TRAINING data, so they describe the fit to
# data the models have presumably already seen, not generalisation.
fitted_models = {
    'Extra Trees': model_et,
    'GB Regressor': model_gbr,
    'KN Regressor': model_knr,
    'Light GBM Regressor': model_lgb,
    'RF Regressor': model_rf,
}

for model_name, model in fitted_models.items():
    y_pred = model.predict(X_train)

    # Square root of the mean squared error, i.e. RMSE in units of the target.
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    # Reported as "R squared" below; presumably the estimator's default
    # regression score - TODO confirm for the saved model types.
    r_squared = model.score(X_train, y_train)

    print(f"RMSE {model_name}:", rmse)
    print(f"R squared {model_name}:", r_squared)
    print('\n', '-'*60, '\n')


RMSE Extra Trees: 50.743084997961994
R squared Extra Trees: 0.9479061099883674

 ------------------------------------------------------------ 

RMSE GB Regressor: 19.82340761372613
R squared GB Regressor: 0.9920495872379146

 ------------------------------------------------------------ 

RMSE KN Regressor: 1.034017196584682
R squared KN Regressor: 0.9999783684033028

 ------------------------------------------------------------ 

RMSE Light GBM Regressor: 28.41843877059997
R squared Light GBM Regressor: 0.983660698564506

 ------------------------------------------------------------ 

RMSE RF Regressor: 55.572287165679036
R squared RF Regressor: 0.9375187662156954

 ------------------------------------------------------------ 



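All models seem to be overfitted: the scores above are computed on the training data and are very high (the KN Regressor fits it almost perfectly). We should therefore look for models that use penalty coefficients (like Lasso/Ridge regression) to reduce overfitting.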

In [8]:
# Predict the target for the test set with the KN regressor, store it in
# X_test (overwriting any existing 'profit' column) and write the id/profit
# pairs to the submission file.
knr_predictions = model_knr.predict(X_test[features])
X_test['profit'] = knr_predictions
X_test.loc[:, ['id', 'profit']].to_csv('data/submission_knn.csv', index=False)

In [9]:
# Predict the target for the test set with the GB regressor, store it in
# X_test (overwriting any existing 'profit' column) and write the id/profit
# pairs to the submission file.
gbr_predictions = model_gbr.predict(X_test[features])
X_test['profit'] = gbr_predictions
X_test.loc[:, ['id', 'profit']].to_csv('data/submission_gbr.csv', index=False)

In [10]:
# Predict the target for the test set with the LightGBM model, store it in
# X_test (overwriting any existing 'profit' column) and write the id/profit
# pairs to the submission file.
lgb_predictions = model_lgb.predict(X_test[features])
X_test['profit'] = lgb_predictions
X_test.loc[:, ['id', 'profit']].to_csv('data/submission_lgbm.csv', index=False)