### Merge Data (Only Necessary When EDA changed)

In [1]:
import pandas as pd
import numpy as np

### Settings

In [2]:
file_columns_mapping = {
    # xiangjie
    "xiangjie": ["index","rent_approval_date","rent_approval_year","rent_approval_month","town","street_name","block","flat_type", "flat_model"],

    # weihong
    "weihong": ["index","floor_area_sqm","lease_commence_date"],
    "mrt": ["index", "nearest_mrt_distance" ,"planned_mrt_distance", "planned_mrt_opening_year"],
    
    # jinfeng
    "jinfeng": ["index","latitude","longitude","subzone","region","monthly_rent"],
    "school": [],

    # honglin
    "mall": [],
}

TRAIN_DATA_PATH = "./preprocessed/train.csv"
TEST_DATA_PATH = "./preprocessed/test.csv"
Y_LABEL_COL_NAME = "monthly_rent"
RAW_DATA_PATH = "./Dataset/train.csv"

pd.set_option("display.max_columns", 30)


In [3]:
def load_training_data(folder="./preprocessed/train_"):
    train_df = load_csvs(folder=folder, mapping=file_columns_mapping)
    # train_df.info()
    return train_df

def load_testing_data(folder="./preprocessed/test_", lable_col_name=Y_LABEL_COL_NAME):
    test_mapping = {}
    for k, v in file_columns_mapping.items():
        if lable_col_name in v:
            c_v = v.copy()
            c_v.remove(lable_col_name)
            test_mapping[k] = c_v
        else:
            test_mapping[k] = v
    return load_csvs(folder=folder, mapping=test_mapping)
    

def load_csvs(folder, mapping):
    df_list = []

    for filename, columns in mapping.items():
        df = pd.read_csv(folder+filename+".csv")
        if len(columns) == 0:
            df_list.append(df)
        else:
            df_list.append(df[columns])
    
    df = pd.DataFrame({"index": [i for i in range(0, 60000)]})
    for _df in df_list:
        df = df.merge(_df, on='index')

    return df

In [4]:
train_df = load_training_data()
train_df.head()

Unnamed: 0,index,rent_approval_date,rent_approval_year,rent_approval_month,town,street_name,block,flat_type,flat_model,floor_area_sqm,lease_commence_date,nearest_mrt_distance,planned_mrt_distance,planned_mrt_opening_year,latitude,longitude,subzone,region,monthly_rent,num_school_500m,nearest_school_dist,num_shopping_mall_500m,nearest_shopping_mall_dist
0,0,8,2021,9,2595.146199,2550,257,3,2369.965462,67.0,1983,0.006289,0.006071,2028,1.344518,103.73863,2542.158516,2569.167537,1600,1,334.85,0,1202.67
1,1,16,2022,5,2438.227223,2347,119,4,2369.965462,92.0,1978,0.008087,0.008135,2040,1.330186,103.938717,2360.371046,2570.667785,2250,0,607.72,0,1114.34
2,2,21,2022,10,2516.680515,2498,157,3,2636.211052,67.0,1971,0.001966,0.010649,TBA,1.332242,103.845643,2808.893871,2737.201353,1900,1,425.76,1,468.3
3,3,7,2021,8,2686.857477,2682,250,6,2878.725962,149.0,1993,0.013908,0.004643,2030,1.370239,103.962894,2610.338573,2570.667785,2850,0,564.97,1,402.36
4,4,22,2022,11,2665.537634,2396,34,3,2636.211052,68.0,1972,0.00169,0.025345,2024,1.320502,103.863341,2793.52518,2737.201353,2100,1,271.72,0,1073.35


In [5]:
test_df = load_testing_data()
test_df.head()

Unnamed: 0,index,rent_approval_date,rent_approval_year,rent_approval_month,town,street_name,block,flat_type,flat_model,floor_area_sqm,lease_commence_date,nearest_mrt_distance,planned_mrt_distance,planned_mrt_opening_year,latitude,longitude,subzone,region,num_school_500m,nearest_school_dist,num_shopping_mall_500m,nearest_shopping_mall_dist
0,0,24,2023,1,2503.252886,2480.0,245,5,2636.211052,121.0,1984,0.007379,0.008114,2030,1.358411,103.891722,2427.604167,2558.82271,1,149.67,0,739.4
1,1,20,2022,9,2540.49101,2658.0,316,4,2612.031305,100.0,1999,0.002768,0.079235,2030,1.446343,103.820817,2592.33279,2450.623806,1,143.33,1,254.92
2,2,30,2023,7,2646.808979,2513.0,708,4,2369.965462,91.0,1980,0.009872,0.006675,2032,1.305719,103.762168,2395.588235,2569.167537,0,1040.84,1,497.67
3,3,7,2021,8,2595.146199,2550.0,351,3,2612.031305,74.0,1986,0.003263,0.009626,2028,1.344832,103.730778,2400.15015,2569.167537,0,1075.53,0,1596.4
4,4,14,2022,3,2595.146199,2550.0,305,5,2636.211052,121.0,1983,0.004038,0.007085,2028,1.345437,103.735241,2400.15015,2569.167537,0,714.78,0,1461.49


In [6]:
train_df.to_csv(TRAIN_DATA_PATH, index=False)
test_df.to_csv(TEST_DATA_PATH, index=False)

## Model
1. data preparation
2. model selection
3. test

In [7]:
import pandas as pd
import numpy as np

def get_X_y_df(data, y_column_name=Y_LABEL_COL_NAME):
    feature_names = data.columns.to_list()
    feature_names.remove(y_column_name)
    X = data.loc[:, feature_names]
    y = data.loc[:, [y_column_name]]
    return X, y


### Get Training Data

In [8]:
data_df = pd.read_csv(TRAIN_DATA_PATH)

_X_df, _y_df = get_X_y_df(data_df)

In [9]:
_X_df['planned_mrt_opening_year'] = _X_df['planned_mrt_opening_year'].map(lambda x: 2026 if x =="TBA" else x)
# _X_df = _X_df.drop(columns=['planned_mrt_opening_year'])

### Feature Selection

In [10]:
_X_df.head()

Unnamed: 0,index,rent_approval_date,rent_approval_year,rent_approval_month,town,street_name,block,flat_type,flat_model,floor_area_sqm,lease_commence_date,nearest_mrt_distance,planned_mrt_distance,planned_mrt_opening_year,latitude,longitude,subzone,region,num_school_500m,nearest_school_dist,num_shopping_mall_500m,nearest_shopping_mall_dist
0,0,8,2021,9,2595.146199,2550,257,3,2369.965462,67.0,1983,0.006289,0.006071,2028,1.344518,103.73863,2542.158516,2569.167537,1,334.85,0,1202.67
1,1,16,2022,5,2438.227223,2347,119,4,2369.965462,92.0,1978,0.008087,0.008135,2040,1.330186,103.938717,2360.371046,2570.667785,0,607.72,0,1114.34
2,2,21,2022,10,2516.680515,2498,157,3,2636.211052,67.0,1971,0.001966,0.010649,2026,1.332242,103.845643,2808.893871,2737.201353,1,425.76,1,468.3
3,3,7,2021,8,2686.857477,2682,250,6,2878.725962,149.0,1993,0.013908,0.004643,2030,1.370239,103.962894,2610.338573,2570.667785,0,564.97,1,402.36
4,4,22,2022,11,2665.537634,2396,34,3,2636.211052,68.0,1972,0.00169,0.025345,2024,1.320502,103.863341,2793.52518,2737.201353,1,271.72,0,1073.35


In [14]:
X_df = _X_df.drop(columns=[
    # 'planned_mrt_distance', 
    # 'nearest_mrt_distance', 
    # 'num_shopping_mall_500m', 
    # 'nearest_shopping_mall_dist'
])
y_df = _y_df

In [15]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       60000 non-null  int64  
 1   rent_approval_date          60000 non-null  int64  
 2   rent_approval_year          60000 non-null  int64  
 3   rent_approval_month         60000 non-null  int64  
 4   town                        60000 non-null  float64
 5   street_name                 60000 non-null  int64  
 6   block                       60000 non-null  int64  
 7   flat_type                   60000 non-null  int64  
 8   flat_model                  60000 non-null  float64
 9   floor_area_sqm              60000 non-null  float64
 10  lease_commence_date         60000 non-null  int64  
 11  nearest_mrt_distance        60000 non-null  float64
 12  planned_mrt_distance        60000 non-null  float64
 13  planned_mrt_opening_year    600

### Trainer、Model Selection、Cross Validation

In [16]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

In [17]:
class Trainer:
    def __init__(self, model, X=X_df.values, y=y_df.values, k_folds=5):
        """
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        """
        self.X_train = X
        self.y_train = y
        self.k_fold = k_folds
        self.model = model
        self.trained = None

    def cross_validate(self, metric="rmse", k_fold=None):
        """Root Mean Square Error of Cross Validation"""
        if metric == "rmse":
            kf = KFold(self.k_fold if k_fold is None else k_fold, 
                       random_state=42, 
                       shuffle=True).get_n_splits(self.X_train)
            # TODO other metrics?
            rmse = np.sqrt(-cross_val_score(self.model, self.X_train, self.y_train, scoring="neg_mean_squared_error", cv = kf))
            return (rmse)
        else:
            raise NotImplementedError

    def train(self):
        self.trained = self.model.fit(self.X_train, self.y_train)
        return self.trained
    
    def predict(self, X_test):
        if self.trained is None:
            self.model.fit(self.X_train, self.y_train)
        return self.model.predict(X_test)
    
    def rmse_on_train(self):
        y_pred = self.predict(self.X_train)
        return np.sqrt(mean_squared_error(y_pred, self.y_train))

### Models

In [18]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [19]:
def model_selection(trainer: Trainer, trainer_name="", k_fold=None, metric="rmse"):
    cv = trainer.cross_validate(k_fold=k_fold, metric=metric)
    mean_cv = cv.mean()
    std_cv = cv.std()
    print(f"trainer: {trainer_name} with {metric} | mean: {round(mean_cv, 5)}, std: {round(std_cv, 5)} | all: {cv}")

# TODO: parameter grid search

In [20]:
# X_df = _X_df[["rent_approval_date", "town", "flat_type", "flat_model", "floor_area_sqm", "subzone", "region"]]
# xgboost.cross_validate()
X_df = _X_df.drop(columns=[
    # 'planned_mrt_distance', 
    # 'nearest_mrt_distance', 
    # 'num_shopping_mall_500m', 
    # 'nearest_shopping_mall_dist'
    'planned_mrt_opening_year'
])
y_df = _y_df

# XGBoost
xgboost_model = XGBRegressor(learning_rate=0.05,
                       n_estimators=450,
                       max_depth=4,
                       min_child_weight=0.5,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

xgboost = Trainer(xgboost_model, X=X_df.values)
model_selection(xgboost)

trainer:  with rmse | mean: 482.39784, std: 4.9345 | all: [483.20091217 477.47694176 490.92932228 477.51145867 482.87056526]


In [106]:
# X_df = _X_df[["rent_approval_date", "town", "flat_type", "flat_model", "floor_area_sqm", "subzone", "region"]]
# xgboost.cross_validate()
X_df = _X_df.drop(columns=[
    # 'planned_mrt_distance', 
    # 'nearest_mrt_distance', 
    # 'num_shopping_mall_500m', 
    # 'nearest_shopping_mall_dist'
    # 'nearest_school_dist',
    # 'num_school_500m'
])
y_df = _y_df

# XGBoost
xgboost_model = XGBRegressor(learning_rate=0.05,
                       n_estimators=300,
                       max_depth=4,
                       min_child_weight=1,
                       subsample=0.7,
                       gamma=0.6,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00005,
                       random_state=42)

xgboost = Trainer(xgboost_model, X=X_df.values)
model_selection(xgboost)

trainer:  with rmse | mean: 482.61666, std: 4.98168 | all: [483.68887417 477.58736509 491.29664318 477.89435835 482.61604976]


In [210]:
# X_df = _X_df
xgboost.cross_validate()

array([483.73821011, 477.33416794, 491.64377973, 477.77988549,
       481.87781324])

In [211]:
xgboost.rmse_on_train()

471.4222076126094

In [214]:
prd = xgboost.predict(test_df.drop(columns=['planned_mrt_opening_year']))

In [215]:
pd.DataFrame({
    "Id": [i for i in range(len(prd))],
    "Predicted": np.round(prd, 0)
}).to_csv("./preprocessed/submission.csv", index=False)

In [30]:
# Random Forest
rf_model = RandomForestRegressor(
                        n_estimators=100,
                        max_depth=8, # 7 underfitting, 9~10 overfitting
                        min_samples_split=5,
                        min_samples_leaf=5,
                        max_features=4,
                        criterion="squared_error",
                        # oob_score=True,
                        random_state=42,
                        n_jobs=4
                        )
rf = Trainer(rf_model)

In [31]:
model_selection(rf)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


trainer:  with rmse | mean: 493.97232, std: 5.06886 | all: [495.67128597 489.43610487 502.65337451 488.54372229 493.55712967]


In [32]:
rf.rmse_on_train()

  return fit_method(estimator, *args, **kwargs)


485.22555742392336

In [72]:
# Lasso
lasso_model = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso = Trainer(lasso_model)

In [73]:
lasso.rmse_on_train()

  model = cd_fast.enet_coordinate_descent(


513.2450414065169

In [15]:
# Support Vector Regressor
svr_model = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))
svr = Trainer(svr_model)

In [16]:
model_selection(svr)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


trainer:  with rmse | mean: 603.18412, std: 3.81415 | all: [602.39420835 601.02505091 608.10423393 597.65416751 606.74293299]
