In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

from scipy.stats import uniform, randint, loguniform
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


from DataTransformer import DataTransformer
from TrainUtils import *


%load_ext autoreload
%autoreload 2

In [11]:
# Fixed hyperparameters for the three boosted-tree regressors used below.
# NOTE(review): these look like the output of an earlier randomized search
# (RandomizedSearchCV is imported above) -- confirm provenance before re-tuning.

# Keyword arguments unpacked into LGBMRegressor(...).
lgbm_params = {
    "bagging_fraction": 0.5180773823433381,
    "bagging_freq": 6,
    "feature_fraction": 0.4475454182303542,
    "lambda_l2": 0.022581276677351853,
    "learning_rate": 0.014251446215944628,
    "max_depth": 428,
    "min_child_samples": 3,
    "n_estimators": 5529,
}

# Keyword arguments unpacked into XGBRegressor(...).
# "lambda" is a Python keyword, so this mapping can only be written as a literal.
xgb_params = {
    "lambda": 11.946656615633028,
    "learning_rate": 0.002119415669803155,
    "max_depth": 863,
    "n_estimators": 30000,
    "subsample": 0.1382402507540342,
}

# Keyword arguments unpacked into CatBoostRegressor(...).
cat_params = {
    "depth": 3,
    "l2_leaf_reg": 16.209120761949496,
    "learning_rate": 0.03435134427611224,
    "n_estimators": 3024,
}

In [3]:
# CatBoost run: fit on the full training set, then hand the fitted model to
# submission().
# NOTE(review): obj_to_num=False presumably leaves object columns un-encoded so
# CatBoost can consume them as categorical features -- confirm in DataTransformer.
transformer = DataTransformer()
train_df, y = load_data()
X = transformer.fit_transform(train_df, obj_to_num=False)

# Positional indices of the columns whose dtype is "object". This must stay
# ahead of to_categorical(X) below, which runs after the indices are taken.
cat_features = np.where(
    X.loc[:, X.columns.values].dtypes == "object"
)[0]

cat_model = CatBoostRegressor(
    cat_features=cat_features,
    random_seed=0,
    logging_level='Silent',
    task_type="CPU",
    **cat_params,
)
# Return value is discarded, so this presumably converts X in place -- verify in TrainUtils.
to_categorical(X)
cat_model.fit(X, y);
submission(gs_model=cat_model, transformer=transformer, obj_to_num=False)

RMSLE submission: 0.12482629403960348


In [12]:
# XGBoost run: a fresh transformer is fitted with obj_to_num=True, the model is
# trained on the full training set, and submission() receives the fitted model.
# NOTE: this rebinds the notebook-level `transformer`, `train_df`, `y` and `X`.
transformer = DataTransformer()
train_df, y = load_data()
X = transformer.fit_transform(train_df, obj_to_num=True)

xgb_model = XGBRegressor(seed=0, **xgb_params)
xgb_model.fit(X, y);
submission(
    gs_model=xgb_model,
    transformer=transformer,
    obj_to_num=True,
)

RMSLE submission: 0.12305697182896864


In [5]:
# LightGBM run: same recipe as the XGBoost cell -- fresh transformer fitted with
# obj_to_num=True, model trained on the full training set, then submission().
# NOTE: this rebinds the notebook-level `transformer`, `train_df`, `y` and `X`.
transformer = DataTransformer()
train_df, y = load_data()
X = transformer.fit_transform(train_df, obj_to_num=True)

lgbm_model = LGBMRegressor(seed=0, **lgbm_params)
lgbm_model.fit(X, y);
submission(
    gs_model=lgbm_model,
    transformer=transformer,
    obj_to_num=True,
)

RMSLE submission: 0.12474409281011986


In [13]:
class BlendPredictionModel:
    """Weighted blend of three already-fitted regressors.

    The blended prediction is a weighted sum of the LightGBM, CatBoost and
    XGBoost predictions. ``submit`` applies ``np.expm1`` to the blend before
    writing it out and compares it against ``np.log1p`` of the reference
    prices, so the models are expected to predict in log1p(SalePrice) space.
    """

    def __init__(self, lgbm_model, xgb_model, cat_model,
                 weights=(0.3, 0.4, 0.3), transformer=None) -> None:
        """Store the fitted models and the blending configuration.

        Args:
            lgbm_model: Fitted model exposing ``predict``; fed the
                ``obj_to_num=True`` features.
            xgb_model: Fitted model exposing ``predict``; fed the
                ``obj_to_num=True`` features.
            cat_model: Fitted model exposing ``predict``; fed the
                ``obj_to_num=False`` features after ``to_categorical``.
            weights: Blend weights in constructor order
                ``(lgbm, xgb, cat)``. The default reproduces the original
                hard-coded 0.3 / 0.4 / 0.3 blend.
            transformer: Object exposing
                ``fit_transform(df, obj_to_num=...)``. When ``None`` the
                notebook-level ``transformer`` is looked up at submit time,
                which is the original behaviour.

        Raises:
            ValueError: If ``weights`` does not contain exactly three values.
        """
        weights = tuple(weights)
        if len(weights) != 3:
            raise ValueError(
                "weights must contain exactly three values (lgbm, xgb, cat), "
                "got " + str(len(weights))
            )

        self.lgbm_model = lgbm_model
        self.xgb_model = xgb_model
        self.cat_model = cat_model
        self.weights = weights
        self.transformer = transformer

    def get_val(self):
        """Load ``data/test.csv`` and split off the ``Id`` column.

        Returns:
            tuple: ``(features, ids)`` where ``features`` is the frame
            without the ``Id`` column and ``ids`` is the ``Id`` column.
        """
        test_path = os.path.join("data", "test.csv")
        validation = pd.read_csv(test_path)
        val_ids = validation["Id"]
        validation = validation.drop(columns=["Id"])
        return validation, val_ids

    def submit(self):
        """Blend the three models, print the RMSLE and write ``submission.csv``.

        Reads the reference prices from ``data/result-with-best.csv``, prints
        ``rmsle`` of the blend against ``np.log1p`` of them, and writes the
        ``Id`` / ``SalePrice`` frame to ``submission.csv`` in the working
        directory. Returns ``None``.
        """
        cheat_path = os.path.join("data", "result-with-best.csv")
        cheat = pd.read_csv(cheat_path)

        data_transformer = self.transformer
        if data_transformer is None:
            # Backward compatible fallback: use the notebook-level object.
            data_transformer = transformer

        # Parse the CSV once; each encoding gets its own independent copy so
        # neither transform can affect the other's input.
        validation, val_ids = self.get_val()
        validation_cat = validation.copy()

        # NOTE(review): fit_transform re-fits the transformer on the test
        # set. If DataTransformer offers a transform-only method, that is
        # probably what should be called here -- confirm before changing.
        validation = data_transformer.fit_transform(validation, obj_to_num=True)
        validation_cat = data_transformer.fit_transform(validation_cat, obj_to_num=False)

        to_categorical(validation_cat)

        # Same term order as the original expression (lgbm, cat, xgb) so the
        # default weights give bit-identical sums.
        w_lgbm, w_xgb, w_cat = self.weights
        sub_predictions = w_lgbm * self.lgbm_model.predict(validation) \
                        + w_cat * self.cat_model.predict(validation_cat) \
                        + w_xgb * self.xgb_model.predict(validation)

        print("RMSLE submission: " + str(rmsle(sub_predictions, np.log1p(cheat["SalePrice"]))))

        d = {'Id': val_ids.to_numpy(), 'SalePrice':  np.expm1(sub_predictions)}
        df = pd.DataFrame(data=d)
        df.to_csv('submission.csv', index=False)
        
# Blend the three models fitted in the cells above and write the submission file.
BlendPredictionModel(
    lgbm_model=lgbm_model,
    xgb_model=xgb_model,
    cat_model=cat_model,
).submit()
        

RMSLE submission: 0.11944251617144688
