## Model
1. data preparation
2. model selection
3. test

In [63]:
import pandas as pd
import numpy as np

### Settings

In [24]:
# CSV file read with pd.read_csv to build the training frame.
TRAIN_DATA_PATH = "./Dataset/train.csv" # TODO: presumably to be pointed at the feature-engineering result once available - confirm
# Name of the target column; used as the default label column by get_X_y_df.
Y_LABEL_COL_NAME = "monthly_rent"

### Get Training Data

In [43]:
def get_X_y_df(data, y_column_name=Y_LABEL_COL_NAME):
    """Split a frame into a feature frame and a single-column label frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the label column.
    y_column_name : str
        Name of the label column (defaults to ``Y_LABEL_COL_NAME``).

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        ``X`` has every column except the first one named ``y_column_name``;
        ``y`` is a DataFrame (not a Series) selected with ``[y_column_name]``.

    Raises
    ------
    ValueError
        If ``y_column_name`` is not among the columns of ``data``.
    """
    all_columns = data.columns.to_list()
    # list.index raises ValueError when the label column is absent.
    label_position = all_columns.index(y_column_name)
    feature_columns = all_columns[:label_position] + all_columns[label_position + 1:]

    features = data.loc[:, feature_columns]
    labels = data.loc[:, [y_column_name]]
    return features, labels


In [44]:

# Read the training CSV, then separate the feature columns from the label column.
data_df = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)

X_df, y_df = get_X_y_df(data=data_df)

### Trainer, Model Selection, Cross Validation

In [64]:
from sklearn.model_selection import cross_val_score, KFold

In [59]:
class Trainer:
    """Pair an estimator with training data for cross-validation, fitting and prediction."""

    def __init__(self, model, X=None, y=None, k_folds=5):
        """
        model : estimator exposing ``fit`` and ``predict``
        X : array-like of shape (n_samples, n_features); defaults to ``X_df.values``
        y : array-like of shape (n_samples,) or (n_samples, n_outputs); defaults to ``y_df.values``
        k_folds : default number of folds used by ``cross_validate``
        """
        # Resolve the notebook-level defaults when the Trainer is created rather
        # than when the class cell is executed, so re-running the data-loading
        # cell is picked up by trainers created afterwards.
        self.X_train = X_df.values if X is None else X
        self.y_train = y_df.values if y is None else y
        self.k_fold = k_folds
        self.model = model
        self.trained = None

    def cross_validate(self, metric="rmse", k_fold=None):
        """Return the per-fold cross-validation score of ``self.model``.

        metric : only ``"rmse"`` (root mean squared error) is implemented
        k_fold : number of folds; falls back to the ``k_folds`` given at construction

        Returns an array with one RMSE value per fold.
        Raises NotImplementedError for any other metric.
        """
        # TODO other metrics?
        if metric != "rmse":
            raise NotImplementedError(
                f"metric {metric!r} is not supported; only 'rmse' is implemented"
            )

        n_splits = self.k_fold if k_fold is None else k_fold
        # Hand the splitter itself to cross_val_score. Calling get_n_splits()
        # would reduce it to a plain int and drop shuffle/random_state.
        splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        neg_mse = cross_val_score(self.model,
                                  self.X_train,
                                  self.y_train,
                                  scoring="neg_mean_squared_error",
                                  cv=splitter)
        # The scorer yields negated MSE, so flip the sign before the square root.
        return np.sqrt(-neg_mse)

    def train(self):
        """Fit ``self.model`` on the stored training data and return the fit result."""
        self.trained = self.model.fit(self.X_train, self.y_train)
        return self.trained

    def predict(self, X_test):
        """Predict with the object stored by ``train()``; call ``train()`` first."""
        return self.trained.predict(X_test)

### Models

In [48]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [62]:
def model_selection(trainer: "Trainer", trainer_name="", k_fold=None, metric="rmse"):
    """Cross-validate ``trainer`` and print the mean and std of its per-fold scores.

    trainer : object exposing ``cross_validate(metric=..., k_fold=...)``
    trainer_name : label used in the printed line
    k_fold : number of folds forwarded to ``cross_validate`` (None keeps the trainer's own setting)
    metric : metric name forwarded to ``cross_validate``
    """
    # Pass both arguments by keyword: the first positional parameter of
    # cross_validate is `metric`, so a positional k_fold would collide with
    # the metric= keyword and raise TypeError.
    cv = trainer.cross_validate(metric=metric, k_fold=k_fold)
    mean_cv = cv.mean()
    std_cv = cv.std()
    print(f"trainer: {trainer_name} with {metric} | mean: {round(mean_cv, 5)}, std: {round(std_cv, 5)}")

# TODO: parameter grid search

In [68]:
# Gradient Boosting: many shallow trees with a small learning rate and Huber loss.
gboost_model = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.01,
    n_estimators=6000,
    max_depth=4,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=15,
    random_state=42,
)
gboost = Trainer(model=gboost_model)

In [69]:
# XGBoost
# - objective: 'reg:linear' is the deprecated name of 'reg:squarederror' (same loss).
# - n_jobs replaces the deprecated scikit-learn-wrapper alias `nthread`.
# - Only random_state is passed: the original also passed seed=27, which left the
#   effective seed dependent on the XGBoost version. 42 matches the other models.
xgboost_model = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       n_jobs=-1,
                       scale_pos_weight=1,
                       reg_alpha=0.00006,
                       random_state=42)
xgboost = Trainer(xgboost_model)

In [55]:
# Random Forest: depth-limited trees, all features considered at each split,
# with out-of-bag scoring enabled.
rf_model = RandomForestRegressor(
    n_estimators=1200,
    max_depth=15,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=5,
    oob_score=True,
    random_state=42,
)
rf = Trainer(model=rf_model)

In [65]:
# Kernel Ridge with a degree-2 polynomial kernel.
krr_model = KernelRidge(
    kernel='polynomial',
    degree=2,
    coef0=2.5,
    alpha=0.6,
)
krr = Trainer(model=krr_model)

In [66]:
# Lasso, with features passed through RobustScaler first.
lasso_model = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=1),
)
lasso = Trainer(model=lasso_model)

In [67]:
# Support Vector Regressor, with features passed through RobustScaler first.
svr_model = make_pipeline(
    RobustScaler(),
    SVR(C=20, epsilon=0.008, gamma=0.0003),
)
svr = Trainer(model=svr_model)