In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

from scipy.stats import uniform, randint, loguniform
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostRegressor

from DataTransformer import DataTransformer
from TrainUtils import *


%load_ext autoreload
%autoreload 2

In [3]:
# Load the raw training frame and its target, then fit the feature transformer on it.
train_df, target = load_data()

transformer = DataTransformer()
X = transformer.fit_transform(train_df, obj_to_num=False)

# The hyper-parameter search below runs on a small random subset of rows.
# The draw is seeded so that Restart & Run All searches over the same rows,
# and the size is capped so a frame with fewer rows than requested does not raise.
SEARCH_SAMPLE_SIZE = 300
SEARCH_SAMPLE_SEED = 0
X = X.sample(
    n=min(SEARCH_SAMPLE_SIZE, len(X)),
    replace=False,
    random_state=SEARCH_SAMPLE_SEED,
)
y = target[X.index]  # presumably target is label-aligned with X's index — TODO confirm

# Positional indices of the object-dtype columns, handed to CatBoost as cat_features.
# They are computed before to_categorical(X) is applied.
cat_features = np.where(X.dtypes == "object")[0]
to_categorical(X)
print(f"Is there nan: {X.isnull().values.any()}")

Is there nan: False


In [4]:
# Search space sampled by RandomizedSearchCV.
# Note on the scipy distributions:
#   * uniform(loc, scale) covers [loc, loc + scale], i.e. [0.001, 0.101] here;
#   * randint(low, high) excludes high, so n_estimators is 2000..9999 and depth is 1..4;
#   * loguniform(a, b) covers [a, b] on a log scale.
parameters = {
    "n_estimators": randint(2000, 10000),
    "learning_rate": uniform(1e-3, 1e-1),
    "depth": randint(1, 5),
    "l2_leaf_reg": loguniform(10, 40),
}

model = CatBoostRegressor(
    task_type="CPU",
    logging_level='Silent',
    random_seed=0,
    cat_features=cat_features,
)

# random_state pins the sampled candidates so best_params_ (used by every later
# cell) is the same on each run.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's
# own score method rather than the RMSLE reported later — confirm this is intended.
grid_model = RandomizedSearchCV(
    model,
    parameters,
    cv=5,
    n_jobs=-1,
    random_state=0,
)
grid_model.fit(X, y);

In [5]:
# Show the hyper-parameters selected by the randomized search.
print(grid_model.best_params_)

# Reload the data and rebuild the feature matrix from all rows
# (the search above only saw a subsample).
train_df, target = load_data()
y = target

# NOTE(review): the transformer is fitted on all rows before the split below, so the
# held-out rows may influence anything fit_transform learns — confirm this is acceptable.
X = transformer.fit_transform(train_df, obj_to_num=False)

# Positional indices of object-dtype columns, taken before to_categorical(X) runs.
is_object_column = X.loc[:, X.columns.values].dtypes == "object"
cat_features = np.where(is_object_column)[0]
to_categorical(X)

# Hold out 20% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=98987,
)

# Same fixed CatBoost settings as the search model, plus the tuned parameters.
gs_model = CatBoostRegressor(
    task_type="CPU",
    logging_level='Silent',
    random_seed=0,
    cat_features=cat_features,
    **grid_model.best_params_,
)
gs_model.fit(X_train, y_train);

{'depth': 3, 'l2_leaf_reg': 16.209120761949496, 'learning_rate': 0.03435134427611224, 'n_estimators': 3024}


In [6]:
# Score the fitted model on the training split first, then on the held-out split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    evaluate(gs_model, split_X, split_y)

RMSLE: 0.07797091101853514
RMSLE: 0.14448462063807313


In [8]:
# Refit the tuned model on every training row before producing the submission.
train_df, y = load_data()
X = transformer.fit_transform(train_df, obj_to_num=False)

# NOTE(review): unlike the earlier cells, to_categorical(X) is not applied here and
# cat_features is reused from the previous cell — confirm both are intended.
gs_model = CatBoostRegressor(
    task_type="CPU",
    logging_level='Silent',
    random_seed=0,
    cat_features=cat_features,
    **grid_model.best_params_,
)
gs_model.fit(X, y);

In [9]:
# Pass the fitted transformer and final model to the project's submission helper,
# using the same obj_to_num=False setting as the fit_transform calls above.
submission(
    gs_model=gs_model,
    transformer=transformer,
    obj_to_num=False,
)

RMSLE submission: 0.12439862990394106
