In [None]:
%load_ext autoreload
%autoreload 2

import os
import zipfile
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor

# DataFrame Mapper imports
from sklearn_pandas import gen_features
from sklearn_pandas import DataFrameMapper

# Transformers imports: imported for the sake of the example
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from crossval_ensemble.custom_pipeline import CustomTransformedTargetRegressor
from crossval_ensemble.crossval_pipeline import CrossvalClassificationPipeline, CrossvalRegressionPipeline

pd.set_option('display.max_columns', None)

# Load data

Run the following commands in a terminal (see https://github.com/Kaggle/kaggle-api):
```
export KAGGLE_USERNAME=datadinosaur
export KAGGLE_KEY=xxxxxxxxxxxxxx

kaggle competitions download -c house-prices-advanced-regression-techniques
```

In [None]:
# Unpack the downloaded competition archive into ./data/.
with zipfile.ZipFile('./house-prices-advanced-regression-techniques.zip', mode='r') as archive:
    # Create the destination folder if needed, then extract every archive member into it.
    os.makedirs('./data/', exist_ok=True)
    archive.extractall(path='./data/')

In [None]:
# Read the three extracted CSV files: training set, test set and sample submission.
train, test, sample_sub = map(
    pd.read_csv,
    ('./data/train.csv', './data/test.csv', './data/sample_submission.csv'),
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

# Define preprocessing

In [None]:
# Stack train and test so that the feature lists are derived consistently from both,
# leaving out the identifier and the target.
combined_data = pd.concat([train, test]).drop(columns=['Id', 'SalePrice'])

# Numeric columns are the continuous features; object-typed columns are the categorical ones.
CONT_COLS = list(combined_data.select_dtypes(include='number').columns)
CAT_COLS = list(combined_data.select_dtypes(include='object').columns)

# Feature order used everywhere below: categorical columns first, then continuous ones.
cols = [*CAT_COLS, *CONT_COLS]

In [None]:
# Wrap every column name in its own one-element list: one selector per feature.
CONT_COLS_list = [[name] for name in CONT_COLS]
CAT_COLS_list = [[name] for name in CAT_COLS]

# Continuous features: fill missing values with 0.0, then standardise.
numeric_steps = [
    {"class": SimpleImputer, "strategy": "constant", "fill_value": 0.0},
    {"class": StandardScaler},
]
gen_numeric = gen_features(columns=CONT_COLS_list, classes=numeric_steps)

# Categorical features: fill missing values with the string "-10", then encode
# categories as integers; unknown categories are encoded as -10.
categorical_steps = [
    {"class": SimpleImputer, "strategy": "constant", "fill_value": "-10"},
    {
        "class": OrdinalEncoder,
        "handle_unknown": 'use_encoded_value',
        "unknown_value": -10,
        "encoded_missing_value": -10,
        "dtype": int,
    },
]
gen_categories = gen_features(columns=CAT_COLS_list, classes=categorical_steps)

# DataFrameMapper construction: numeric definitions first, then categorical ones;
# takes a DataFrame as input and returns a DataFrame.
preprocess_mapper = DataFrameMapper(
    gen_numeric + gen_categories,
    input_df=True,
    df_out=True,
)

# Train model

In [None]:
# Feature matrices for both splits, using the column order defined by `cols`.
X_train, X_test = train[cols], test[cols]

# Training target.
y_train = train['SalePrice']

# NOTE(review): y_test is read from the sample-submission file; presumably these are
# placeholder prices rather than ground truth — confirm before using it for evaluation.
y_test = sample_sub['SalePrice']

In [None]:
# Base learner: CatBoost optimised and evaluated on RMSE, with a fixed seed.
catboost_regressor = CatBoostRegressor(
    iterations=100,
    loss_function='RMSE',
    eval_metric='RMSE',
    use_best_model=True,
    verbose=False,
    random_seed=0,
)

# Target wrapper around the base learner; no target transformer is supplied.
target_regressor = CustomTransformedTargetRegressor(
    regressor=catboost_regressor,
    transformer=None,  # alternative: FunctionTransformer(func=np.log, inverse_func=np.exp)
)

# Cross-validated pipeline: preprocessing followed by the estimator, over 5 folds.
pipeline_steps = [
    ('prepro', preprocess_mapper),
    ('estimator', target_regressor),
]
model = CrossvalRegressionPipeline(steps=pipeline_steps, n_folds=5)

# NOTE(review): early_stopping_rounds (100) equals iterations (100); if these keyword
# arguments are forwarded to CatBoost as-is, early stopping presumably never triggers — confirm.
model.fit(X_train, y_train, cat_features=CAT_COLS, early_stopping_rounds=100, plot=False)

# Predictions of the fitted cross-validated pipeline on the test features.
preds = model.predict(X_test)

In [None]:
# Out-of-fold predictions: every training row is predicted by the pipeline of the
# fold in which that row was part of the validation split.
#
# The buffer is a float array filled with NaN. An integer buffer (e.g. np.arange)
# would truncate the float predictions on assignment, and any row not covered by a
# fold would silently keep a meaningless value instead of showing up as NaN.
y_oof_pred = np.full(len(y_train), np.nan)

for fold_dict in model.crossval_dict.values():
    # Positions of this fold's validation rows (used positionally via .iloc).
    valid_idx = fold_dict['valid_idx']
    y_oof_pred[valid_idx] = fold_dict['pipeline'].predict(X_train.iloc[valid_idx])

In [None]:
# Root-mean-squared error between the log of the true prices and the log of the
# out-of-fold predictions.
np.sqrt(
    mean_squared_error(
        y_true=np.log(y_train),
        y_pred=np.log(y_oof_pred),
    )
)

# Submit predictions

In [None]:
# Overwrite the SalePrice column of the sample submission with the pipeline predictions.
sample_sub['SalePrice'] = preds

# Write the submission file without the DataFrame index.
sample_sub.to_csv(path_or_buf='./data/submission.csv', index=False)

Run the following command to submit the predictions:
```
kaggle competitions submit -c house-prices-advanced-regression-techniques -f ./data/submission.csv -m "Message"
```

# Comparison simple Catboost

In [None]:
# Baseline for comparison: one CatBoost model trained on a single hold-out split.
# random_state is fixed so that the split, and therefore the early-stopping result
# and the predictions, are reproducible across runs.
X_train_, X_valid_, y_train_, y_valid_ = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# Fit the preprocessing on the training part only, then apply it to the other sets.
# NOTE(review): this refits the same preprocess_mapper instance that was passed to the
# cross-validated pipeline; confirm that the pipeline works on its own copy.
X_train_ = preprocess_mapper.fit_transform(X_train_)
X_valid_ = preprocess_mapper.transform(X_valid_)
X_test_ = preprocess_mapper.transform(X_test)

estimator = CatBoostRegressor(
    iterations=2000,
    loss_function='MAE',
    eval_metric='MAPE',
    use_best_model=True,
    verbose=500,
    random_seed=0
)

# The hold-out part is the evaluation set used for early stopping and best-model selection.
estimator.fit(X_train_, y_train_, eval_set=(X_valid_, y_valid_), cat_features=CAT_COLS, early_stopping_rounds=100)

y_pred = estimator.predict(X_test_)

In [None]:
# Overwrite the SalePrice column of the sample submission with the baseline CatBoost predictions.
sample_sub['SalePrice'] = y_pred

# Write the baseline submission file without the DataFrame index.
sample_sub.to_csv(path_or_buf='./data/submission_catboost.csv', index=False)

Run the following command to submit the predictions:
```
kaggle competitions submit -c house-prices-advanced-regression-techniques -f ./data/submission_catboost.csv -m "Message"
```