In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import warninXGBRegressor
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from functools import partial
from tqdm.cli import tqdm
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer 
from sklearn.impute import MissingIndicator
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from src.utils import fprint
from src.utils import percent_of
from src.utils import mape
from src.utils import zip_dataframes
from src.utils import unzip_dataframes
from src.utils import create_submit_df
from src.train import cross_validate
from src.train import full_tain_pred
from src.train import preprocessing
from src.preprocessing import *

In [8]:
data_folder = Path("data")
submissions_folder = Path("submissions")

train_df = pd.read_csv(data_folder / "train.csv", index_col=0)
test_df = pd.read_csv(data_folder / "test_no_target.csv", index_col=0)
zipcodes_df = pd.read_csv(data_folder / "zipcodes.csv", index_col=0)

train_df = pd.merge(train_df.reset_index(), zipcodes_df.drop_duplicates("zipcode"), on="zipcode", how="left")
test_df = pd.merge(test_df.reset_index(), zipcodes_df.drop_duplicates("zipcode"), on="zipcode", how="left")

In [9]:
cat_features = ["type", "gearbox", "model", "fuel", "brand", "city"]
cont_missing_features = ["engine_capacity", "damage", "insurance_price", "latitude", "longitude"]
cat_missing_features = ["type", "gearbox", "model", "fuel", "city"]

In [50]:
train_df, oof_df = train_test_split(train_df, test_size=0.1)

## Simple model stack

In [13]:
print("xgboost")
model = XGBRegressor(random_state=42, n_estimators=500, max_depth=5, objective="reg:gamma")
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
    cat_encode,
]
_, xgb_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=test_df)

print("lgbm")
lgmb_params = {
    "random_state": 42,
    "boosting": "dart",
    "objective": 'regression_l1',
    "num_leaves": 100,
    "feature_fraction": 0.9,
    "max_depth":10,
    "learning_rate": 0.05,
    "num_iterations": 1000,
    "subsample": 0.5
}
model = LGBMRegressor(**lgmb_params)
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
    cat_encode,
]
_, lgbm_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=test_df)

print("catboost")
model = CatBoostRegressor(
    random_state=42,
    depth=10,
    loss_function="MAE",
    cat_features=[1, 3, 5, 7, 8, 12],
    verbose=False,
)
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
]
_, catboost_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=test_df)

xgboost
fold 1 score: 24.673832535385817
fold 2 score: 25.37739112267307
fold 3 score: 24.486894280236303
fold 4 score: 25.486009119389365
mean score: 25.006031764421138
score variance: 0.18703652237525126
lgbm
fold 1 score: 23.226202334681457
fold 2 score: 23.692474610286446
fold 3 score: 22.78935303686965
fold 4 score: 23.81608510307259
mean score: 23.381028771227534
score variance: 0.16508098065677146
catboost
fold 1 score: 24.27824891214072
fold 2 score: 24.967295714730316
fold 3 score: 23.688812733265888
fold 4 score: 24.59646350787276
mean score: 24.38270521700242
score variance: 0.21995913889024285


In [68]:
# blend_test_preds = np.mean(xgb_test_preds + lgbm_test_preds + catboost_test_preds, axis=0)
blend_test_preds = np.sum(np.vstack([
    np.mean(xgb_test_preds, axis=0) * 0.1,
    np.mean(lgbm_test_preds, axis=0) * 0.5,
    np.mean(catboost_test_preds, axis=0) * 0.4,
]), axis=0)


submit_df = create_submit_df(test_df, blend_test_preds)
submit_df.to_csv(submissions_folder / "blend-weighted-v2.csv", index=False)
submit_df

Unnamed: 0,Id,Predicted
0,60314,10632.964866
1,12566,10484.488434
2,17760,7037.014591
3,8876,860.208773
4,80392,7164.828163
...,...,...
49995,93878,1348.041656
49996,99783,3441.993370
49997,57399,1621.436865
49998,97106,10933.860221


## Blending

In [52]:
print("xgboost")
model = XGBRegressor(random_state=42, n_estimators=500, max_depth=5, objective="reg:gamma")
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
    cat_encode,
]
_, xgb_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=oof_df)

print("lgbm")
lgmb_params = {
    "random_state": 42,
    "boosting": "dart",
    "objective": 'regression_l1',
    "num_leaves": 100,
    "feature_fraction": 0.9,
    "max_depth":10,
    "learning_rate": 0.05,
    "num_iterations": 1000,
    "subsample": 0.5
}
model = LGBMRegressor(**lgmb_params)
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
    cat_encode,
]
_, lgbm_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=oof_df)

print("catboost")
model = CatBoostRegressor(
    random_state=42,
    depth=10,
    loss_function="MAE",
    cat_features=[1, 3, 5, 7, 8, 12],
    verbose=False,
)
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
]
_, catboost_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=oof_df)

xgboost
fold 1 score: 24.75755505774779
fold 2 score: 24.77572740367156
fold 3 score: 24.885461584819875
fold 4 score: 25.056334263997854
mean score: 24.86876957755927
score variance: 0.014121162560960349
lgbm
fold 1 score: 22.983216974724126
fold 2 score: 23.723987384155222
fold 3 score: 23.322243259556558
fold 4 score: 23.690887988835254
mean score: 23.43008390181779
score variance: 0.09142942084713285
catboost
fold 1 score: 24.09054395176948
fold 2 score: 24.9566332941939
fold 3 score: 24.156775908718487
fold 4 score: 25.030442190679693
mean score: 24.55859883634039
score variance: 0.19040115520155704


In [53]:
x_oof = np.vstack(xgb_test_preds + lgbm_test_preds + catboost_test_preds).T

In [75]:
meta_model = XGBRegressor(n_estimators=500, max_depth=100, objective="reg:gamma")
# meta_model = LGBMRegressor(random_state=42, max_depth=-1, num_boost_round=1000)
# meta_model = LGBMRegressor(random_state=42, objective="regression_l1", max_depth=10, num_boost_round=500)
# meta_model = RandomForestRegressor()
meta_model.fit(x_oof, oof_df["price"])

mape(oof_df["price"], meta_model.predict(x_oof))

0.03769758581114346

In [76]:
meta_model.feature_importances_

array([0.00612318, 0.04186169, 0.00529715, 0.00637356, 0.00521029,
       0.00460346, 0.00849811, 0.00881098, 0.44561347, 0.24566181,
       0.04495763, 0.17698868], dtype=float32)

In [65]:
train_df = pd.read_csv(data_folder / "train.csv", index_col=0)
test_df = pd.read_csv(data_folder / "test_no_target.csv", index_col=0)
zipcodes_df = pd.read_csv(data_folder / "zipcodes.csv", index_col=0)

train_df = pd.merge(train_df.reset_index(), zipcodes_df.drop_duplicates("zipcode"), on="zipcode", how="left")
test_df = pd.merge(test_df.reset_index(), zipcodes_df.drop_duplicates("zipcode"), on="zipcode", how="left")

In [66]:
print("xgboost")
model = XGBRegressor(random_state=42, n_estimators=500, max_depth=5, objective="reg:gamma")
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
    cat_encode,
]
_, xgb_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=test_df)

print("lgbm")
lgmb_params = {
    "random_state": 42,
    "boosting": "dart",
    "objective": 'regression_l1',
    "num_leaves": 100,
    "feature_fraction": 0.9,
    "max_depth":10,
    "learning_rate": 0.05,
    "num_iterations": 1000,
    "subsample": 0.5
}
model = LGBMRegressor(**lgmb_params)
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
    cat_encode,
]
_, lgbm_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=test_df)

print("catboost")
model = CatBoostRegressor(
    random_state=42,
    depth=10,
    loss_function="MAE",
    cat_features=[1, 3, 5, 7, 8, 12],
    verbose=False,
)
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
preproc_funcs = [
    indicate_missing,
    impute_nan_with_zero,
    drop_columns,
    drop_price_outliers,
]
_, catboost_test_preds = cross_validate(model, train_df, kfold, mape, preproc_funcs, test_df=test_df)

xgboost
fold 1 score: 24.673832535385817
fold 2 score: 25.37739112267307
fold 3 score: 24.486894280236303
fold 4 score: 25.486009119389365
mean score: 25.006031764421138
score variance: 0.18703652237525126
lgbm
fold 1 score: 23.226202334681457
fold 2 score: 23.692474610286446
fold 3 score: 22.78935303686965
fold 4 score: 23.81608510307259
mean score: 23.381028771227534
score variance: 0.16508098065677146
catboost
fold 1 score: 24.27824891214072
fold 2 score: 24.967295714730316
fold 3 score: 23.688812733265888
fold 4 score: 24.59646350787276
mean score: 24.38270521700242
score variance: 0.21995913889024285


In [77]:
test_preds = meta_model.predict(np.vstack(xgb_test_preds + lgbm_test_preds + catboost_test_preds).T)
submit_df = create_submit_df(test_df, test_preds)
submit_df.to_csv(submissions_folder / "true-blend-v5.csv", index=False)
submit_df

Unnamed: 0,Id,Predicted
0,60314,11224.854492
1,12566,9697.104492
2,17760,8442.008789
3,8876,863.562256
4,80392,8032.683105
...,...,...
49995,93878,1247.747314
49996,99783,4140.040039
49997,57399,1604.617188
49998,97106,13630.916992
