## Model
1. data preparation
2. model selection
3. test

In [1]:
import pandas as pd
import numpy as np

### Settings

In [2]:
TRAIN_DATA_PATH = "./preprocessed/train.csv" # TODO ===> Feature Engineering Result
Y_LABEL_COL_NAME = "monthly_rent"

### Get Training Data

In [3]:
def get_X_y_df(data, y_column_name=Y_LABEL_COL_NAME):
    feature_names = data.columns.to_list()
    feature_names.remove(y_column_name)
    X = data.loc[:, feature_names]
    y = data.loc[:, [y_column_name]]
    return X, y


In [4]:
df_list = []
for data_path in [f"./preprocessed/train_{i}.csv" for i in range(1, 4)]:
    df_list.append(pd.read_csv(data_path))

df = pd.DataFrame({"index": [i for i in range(0, 60000)]})
for _df in df_list:
    df = df.merge(_df, on='index')

In [5]:
df = df.drop(columns=['index', 'flat_model_x']).rename(columns={"flat_model_y": "flat_model"})

In [6]:
df.to_csv(TRAIN_DATA_PATH, index=False)

In [7]:

data_df = pd.read_csv(TRAIN_DATA_PATH)

X_df, y_df = get_X_y_df(data_df)

In [8]:
X_df

Unnamed: 0,rent_approval_date,town,block,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,region
0,8,2595.146199,257,3,10,67.0,17,1.344518,103.738630,149,4
1,16,2438.227223,119,4,10,92.0,12,1.330186,103.938717,8,1
2,21,2516.680515,157,3,5,67.0,5,1.332242,103.845643,128,0
3,7,2686.857477,250,6,3,149.0,27,1.370239,103.962894,91,1
4,22,2665.537634,34,3,5,68.0,6,1.320502,103.863341,12,0
...,...,...,...,...,...,...,...,...,...,...,...
59995,8,2416.700057,441,3,10,67.0,13,1.366050,103.854168,31,3
59996,27,2904.113924,95,4,7,83.0,53,1.286493,103.821434,52,0
59997,17,2638.489123,862,5,5,122.0,22,1.355064,103.936507,118,1
59998,24,2438.227223,67,5,15,123.0,11,1.318974,103.944076,10,1


### Trainer、Model Selection、Cross Validation

In [9]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

In [10]:
class Trainer:
    def __init__(self, model, X=X_df.values, y=y_df.values, k_folds=5):
        """
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        """
        self.X_train = X
        self.y_train = y
        self.k_fold = k_folds
        self.model = model
        self.trained = None

    def cross_validate(self, metric="rmse", k_fold=None):
        """Root Mean Square Error of Cross Validation"""
        if metric == "rmse":
            kf = KFold(self.k_fold if k_fold is None else k_fold, 
                       random_state=42, 
                       shuffle=True).get_n_splits(self.X_train)
            # TODO other metrics?
            rmse = np.sqrt(-cross_val_score(self.model, self.X_train, self.y_train, scoring="neg_mean_squared_error", cv = kf))
            return (rmse)
        else:
            raise NotImplementedError

    def train(self):
        self.trained = self.model.fit(self.X_train, self.y_train)
        return self.trained
    
    def predict(self, X_test):
        if self.trained is None:
            self.model.fit(self.X_train, self.y_train)
        return self.model.predict(X_test)
    
    def rmse_on_train(self):
        y_pred = self.predict(self.X_train)
        return np.sqrt(mean_squared_error(y_pred, self.y_train))

### Models

In [11]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [12]:
def model_selection(trainer: Trainer, trainer_name="", k_fold=None, metric="rmse"):
    cv = trainer.cross_validate(k_fold, metric=metric)
    mean_cv = cv.mean()
    std_cv = cv.std()
    print(f"trainer: {trainer_name} with {metric} | mean: {round(mean_cv, 5)}, std: {round(std_cv, 5)}")

# TODO: parameter grid search

In [13]:
# Gradient Boosting
gboost_model = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)
gboost = Trainer(gboost_model)

In [95]:
gboost.cross_validate()

  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

In [109]:
# XGBoost
xgboost_model = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)
xgboost = Trainer(xgboost_model)

In [97]:
xgboost.cross_validate()



array([485.30099108, 479.90683504, 492.47248925, 479.1823819 ,
       484.1739538 ])

In [111]:
xgboost.rmse_on_train()



460.3878115296174

In [112]:
# Random Forest
rf_model = RandomForestRegressor(n_estimators=1200,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)
rf = Trainer(rf_model)

In [113]:
rf.rmse_on_train()

  return fit_method(estimator, *args, **kwargs)


412.95734832291424

In [14]:
# Lasso
lasso_model = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso = Trainer(lasso_model)

In [15]:
lasso.rmse_on_train()

519.8426195922993

In [68]:
# Support Vector Regressor
svr_model = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))
svr = Trainer(svr_model)