### Merge Data (Only Necessary When EDA changed)

In [20]:
import pandas as pd
import numpy as np

In [186]:
max_num = 6
df_list = []
for data_path in [f"./preprocessed/train_{i}.csv" for i in range(1, max_num + 1)]:
    df_list.append(pd.read_csv(data_path))

df = pd.DataFrame({"index": [i for i in range(0, 60000)]})
for _df in df_list:
    df = df.merge(_df, on='index')

In [187]:
df = df.drop(columns=['index', 'flat_model_y']).rename(columns={"flat_model_x": "flat_model"})

In [188]:
df.to_csv(TRAIN_DATA_PATH, index=False)

In [189]:
test_df_list = []
for data_path in [f"./preprocessed/test_{i}.csv" for i in range(1, max_num + 1)]:
    test_df_list.append(pd.read_csv(data_path))

test_df = pd.DataFrame({"index": [i for i in range(0, 60000)]})
for _df in test_df_list:
    test_df = test_df.merge(_df, on='index')

test_df = test_df.drop(columns=['index', 'flat_model_y']).rename(columns={"flat_model_x": "flat_model"})

In [190]:
test_df.to_csv(TEST_DATA_PATH, index=False)

### Settings

In [191]:
TRAIN_DATA_PATH = "./preprocessed/train.csv"
TEST_DATA_PATH = "./preprocessed/test.csv"
Y_LABEL_COL_NAME = "monthly_rent"
RAW_DATA_PATH = "./Dataset/train.csv"

## Model
1. data preparation
2. model selection
3. test

In [192]:
import pandas as pd
import numpy as np

def get_X_y_df(data, y_column_name=Y_LABEL_COL_NAME):
    feature_names = data.columns.to_list()
    feature_names.remove(y_column_name)
    X = data.loc[:, feature_names]
    y = data.loc[:, [y_column_name]]
    return X, y


### Get Training Data

In [193]:
data_df = pd.read_csv(TRAIN_DATA_PATH)

_X_df, _y_df = get_X_y_df(data_df)

In [194]:
_X_df = _X_df.drop(columns=['planned_mrt_opening_year'])

### Feature Selection

In [195]:
_X_df.head()

Unnamed: 0,rent_approval_date,rent_approval_year,rent_approval_month,town,street_name,block,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,region,num_school_500m,nearest_school_dist,num_shopping_mall_500m,nearest_shopping_mall_dist,nearest_mrt_distance,planned_mrt_distance
0,8,2021,9,2595.146199,2550,257,3,2369.965462,67.0,1983,1.344518,103.73863,149,4,1,334.85,0,1202.67,0.006289,0.006071
1,16,2022,5,2438.227223,2347,119,4,2369.965462,92.0,1978,1.330186,103.938717,8,1,0,607.72,0,1114.34,0.008087,0.008135
2,21,2022,10,2516.680515,2498,157,3,2636.211052,67.0,1971,1.332242,103.845643,128,0,1,425.76,1,468.3,0.001966,0.010649
3,7,2021,8,2686.857477,2682,250,6,2878.725962,149.0,1993,1.370239,103.962894,91,1,0,564.97,1,402.36,0.013908,0.004643
4,22,2022,11,2665.537634,2396,34,3,2636.211052,68.0,1972,1.320502,103.863341,12,0,1,271.72,0,1073.35,0.00169,0.025345


In [196]:
# X_df = _X_df[["town", "flat_type", "lease_commence_date", "flat_model", "floor_area_sqm", "subzone", "region", "block", "rent_approval_date"]]
# X_df = _X_df.drop(columns=["rent_approval_year", "rent_approval_month"])
X_df = _X_df.drop(columns=[
    # 'planned_mrt_distance', 
    # 'nearest_mrt_distance', 
    # 'num_shopping_mall_500m', 
    # 'nearest_shopping_mall_dist'
])
y_df = _y_df

In [197]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   rent_approval_date          60000 non-null  int64  
 1   rent_approval_year          60000 non-null  int64  
 2   rent_approval_month         60000 non-null  int64  
 3   town                        60000 non-null  float64
 4   street_name                 60000 non-null  int64  
 5   block                       60000 non-null  int64  
 6   flat_type                   60000 non-null  int64  
 7   flat_model                  60000 non-null  float64
 8   floor_area_sqm              60000 non-null  float64
 9   lease_commence_date         60000 non-null  int64  
 10  latitude                    60000 non-null  float64
 11  longitude                   60000 non-null  float64
 12  subzone                     60000 non-null  int64  
 13  region                      600

### Trainer、Model Selection、Cross Validation

In [198]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

In [199]:
class Trainer:
    def __init__(self, model, X=X_df.values, y=y_df.values, k_folds=5):
        """
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        """
        self.X_train = X
        self.y_train = y
        self.k_fold = k_folds
        self.model = model
        self.trained = None

    def cross_validate(self, metric="rmse", k_fold=None):
        """Root Mean Square Error of Cross Validation"""
        if metric == "rmse":
            kf = KFold(self.k_fold if k_fold is None else k_fold, 
                       random_state=42, 
                       shuffle=True).get_n_splits(self.X_train)
            # TODO other metrics?
            rmse = np.sqrt(-cross_val_score(self.model, self.X_train, self.y_train, scoring="neg_mean_squared_error", cv = kf))
            return (rmse)
        else:
            raise NotImplementedError

    def train(self):
        self.trained = self.model.fit(self.X_train, self.y_train)
        return self.trained
    
    def predict(self, X_test):
        if self.trained is None:
            self.model.fit(self.X_train, self.y_train)
        return self.model.predict(X_test)
    
    def rmse_on_train(self):
        y_pred = self.predict(self.X_train)
        return np.sqrt(mean_squared_error(y_pred, self.y_train))

### Models

In [200]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [201]:
def model_selection(trainer: Trainer, trainer_name="", k_fold=None, metric="rmse"):
    cv = trainer.cross_validate(k_fold=k_fold, metric=metric)
    mean_cv = cv.mean()
    std_cv = cv.std()
    print(f"trainer: {trainer_name} with {metric} | mean: {round(mean_cv, 5)}, std: {round(std_cv, 5)} | all: {cv}")

# TODO: parameter grid search

In [202]:
# XGBoost
xgboost_model = XGBRegressor(learning_rate=0.05,
                       n_estimators=300,
                       max_depth=4,
                       min_child_weight=0.5,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

xgboost = Trainer(xgboost_model, X=X_df.values)

In [203]:
X_df

Unnamed: 0,rent_approval_date,rent_approval_year,rent_approval_month,town,street_name,block,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,region,num_school_500m,nearest_school_dist,num_shopping_mall_500m,nearest_shopping_mall_dist,nearest_mrt_distance,planned_mrt_distance
0,8,2021,9,2595.146199,2550,257,3,2369.965462,67.0,1983,1.344518,103.738630,149,4,1,334.85,0,1202.67,0.006289,0.006071
1,16,2022,5,2438.227223,2347,119,4,2369.965462,92.0,1978,1.330186,103.938717,8,1,0,607.72,0,1114.34,0.008087,0.008135
2,21,2022,10,2516.680515,2498,157,3,2636.211052,67.0,1971,1.332242,103.845643,128,0,1,425.76,1,468.30,0.001966,0.010649
3,7,2021,8,2686.857477,2682,250,6,2878.725962,149.0,1993,1.370239,103.962894,91,1,0,564.97,1,402.36,0.013908,0.004643
4,22,2022,11,2665.537634,2396,34,3,2636.211052,68.0,1972,1.320502,103.863341,12,0,1,271.72,0,1073.35,0.001690,0.025345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,8,2021,9,2416.700057,2417,441,3,2369.965462,67.0,1979,1.366050,103.854168,31,3,2,360.65,0,949.72,0.006027,0.009471
59996,27,2023,4,2904.113924,2715,95,4,2612.031305,83.0,2019,1.286493,103.821434,52,0,0,562.69,0,740.67,0.005573,0.018980
59997,17,2022,6,2638.489123,2602,862,5,2636.211052,122.0,1988,1.355064,103.936507,118,1,1,320.03,2,439.80,0.006503,0.014191
59998,24,2023,1,2438.227223,2487,67,5,2444.223986,123.0,1977,1.318974,103.944076,10,1,1,219.55,0,1775.20,0.008306,0.004277


In [204]:
# X_df = _X_df[["rent_approval_date", "town", "flat_type", "flat_model", "floor_area_sqm", "subzone", "region"]]
# xgboost.cross_validate()
X_df = _X_df.drop(columns=[
    # 'planned_mrt_distance', 
    # 'nearest_mrt_distance', 
    # 'num_shopping_mall_500m', 
    # 'nearest_shopping_mall_dist'
])
y_df = _y_df
model_selection(xgboost)

trainer:  with rmse | mean: 482.47477, std: 5.18635 | all: [483.73821008 477.33416794 491.64377973 477.77988549 481.87781324]


In [208]:
# X_df = _X_df[["rent_approval_date", "town", "flat_type", "flat_model", "floor_area_sqm", "subzone", "region"]]
# xgboost.cross_validate()
X_df = _X_df.drop(columns=[
    # 'planned_mrt_distance', 
    # 'nearest_mrt_distance', 
    # 'num_shopping_mall_500m', 
    # 'nearest_shopping_mall_dist'
    # 'nearest_school_dist',
    # 'num_school_500m'
])
y_df = _y_df

# XGBoost
xgboost_model = XGBRegressor(learning_rate=0.05,
                       n_estimators=300,
                       max_depth=4,
                       min_child_weight=1,
                       subsample=0.7,
                       gamma=0.6,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00005,
                       random_state=42)

xgboost = Trainer(xgboost_model, X=X_df.values)
model_selection(xgboost)

trainer:  with rmse | mean: 482.47477, std: 5.18635 | all: [483.73821011 477.33416794 491.64377973 477.77988549 481.87781324]


In [210]:
# X_df = _X_df
xgboost.cross_validate()

array([483.73821011, 477.33416794, 491.64377973, 477.77988549,
       481.87781324])

In [211]:
xgboost.rmse_on_train()

471.4222076126094

In [214]:
prd = xgboost.predict(test_df.drop(columns=['planned_mrt_opening_year']))

In [215]:
pd.DataFrame({
    "Id": [i for i in range(len(prd))],
    "Predicted": np.round(prd, 0)
}).to_csv("./preprocessed/submission.csv", index=False)

In [30]:
# Random Forest
rf_model = RandomForestRegressor(
                        n_estimators=100,
                        max_depth=8, # 7 underfitting, 9~10 overfitting
                        min_samples_split=5,
                        min_samples_leaf=5,
                        max_features=4,
                        criterion="squared_error",
                        # oob_score=True,
                        random_state=42,
                        n_jobs=4
                        )
rf = Trainer(rf_model)

In [31]:
model_selection(rf)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


trainer:  with rmse | mean: 493.97232, std: 5.06886 | all: [495.67128597 489.43610487 502.65337451 488.54372229 493.55712967]


In [32]:
rf.rmse_on_train()

  return fit_method(estimator, *args, **kwargs)


485.22555742392336

In [72]:
# Lasso
lasso_model = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso = Trainer(lasso_model)

In [73]:
lasso.rmse_on_train()

  model = cd_fast.enet_coordinate_descent(


513.2450414065169

In [15]:
# Support Vector Regressor
svr_model = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))
svr = Trainer(svr_model)

In [16]:
model_selection(svr)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


trainer:  with rmse | mean: 603.18412, std: 3.81415 | all: [602.39420835 601.02505091 608.10423393 597.65416751 606.74293299]
