# Kernel Ridge Regression

In [1]:
import pandas as pd

## Data Import

In [2]:
# Read the raw train and test splits (in that order) from the raw_data directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('raw_data/train.csv', 'raw_data/test.csv')
)

Import auxiliary data and select useful columns

In [3]:
# Auxiliary columns that are carried into the modelling dataset.
useful_columns = [
    'nearest_mrt_line',
    'dist_meters_to_nearest_mrt',
    'dist_meters_to_nearest_mall',
    'dist_meters_to_nearest_commercial_centre',
    'dist_meters_to_nearest_primary_school',
    'dist_meters_to_nearest_secondary_school',
]

# The first CSV column becomes the index of each auxiliary frame; only the
# columns listed above are kept.
df_train_aux = pd.read_csv('raw_data/train_auxiliary_data.csv', index_col=0)[useful_columns]
df_test_aux = pd.read_csv('raw_data/test_auxiliary_data.csv', index_col=0)[useful_columns]

Join the auxiliary data to the dataset

In [4]:
# Index-on-index left join: every row of the main frame is kept and gains the
# auxiliary columns of the row with the same index label.
df_train = df_train.join(df_train_aux, how='left')
df_test = df_test.join(df_test_aux, how='left')

## Data pre-processing

Clean dirty values and drop outliers identified in EDA.

In [5]:
from src.preprocessing import clean_property_type, fill_missing_values, update_data, drop_outliers

def data_cleaning(df):
    """Run the shared cleaning steps on a copy of ``df`` and return the result.

    The input frame is copied first, then passed through
    ``clean_property_type``, ``fill_missing_values`` and ``update_data``
    (all from ``src.preprocessing``) in that order, each step receiving the
    previous step's return value.

    Args:
        df: DataFrame to clean.

    Returns:
        Whatever the last cleaning step returns.
    """
    cleaned = df.copy()
    for step in (clean_property_type, fill_missing_values, update_data):
        cleaned = step(cleaned)
    return cleaned

In [6]:
# Training split: clean first, then remove outliers.
df_train = drop_outliers(data_cleaning(df_train))

# Test split: cleaned only; drop_outliers is not applied to it.
df_test = data_cleaning(df_test)

Drop unused columns from the original dataset.

In [7]:
from src.preprocessing import drop_columns
# Apply the same column drop to both splits. Which columns are removed is
# decided inside src.preprocessing.drop_columns (not visible in this notebook).
df_train, df_test = map(drop_columns, (df_train, df_test))

### Separate into X and y DataFrames

In [8]:
# Features are every training column except the target 'price'.
X_train = df_train.drop(columns='price')
y_train = df_train['price']

# The test frame is used as-is (same object, no copy).
# NOTE(review): presumably the test split has no 'price' column — confirm.
X_test = df_test

## Encoding

Check the number of unique values for each categorical feature.

In [9]:
# Summarise only the object-dtype columns: count, number of unique values,
# most frequent value (top) and its frequency (freq).
X_train.describe(include='object')

Unnamed: 0,property_type,tenure,planning_area,nearest_mrt_line
count,20242,20242,20242,20242
unique,15,11,43,8
top,condo,99-year leasehold,bukit timah,dt
freq,9642,12842,1323,4494


The numerical variables span very different orders of magnitude (compare `lat` with `size_sqft` in the summary below), so they are normalized with min-max scaling before modelling.

In [10]:
# Summary statistics of the numeric columns; the min/max rows show how much
# the value ranges differ between features.
X_train.describe()

Unnamed: 0,built_year,num_beds,num_baths,size_sqft,lat,lng,dist_meters_to_nearest_mrt,dist_meters_to_nearest_mall,dist_meters_to_nearest_commercial_centre,dist_meters_to_nearest_primary_school,dist_meters_to_nearest_secondary_school
count,20242.0,20242.0,20242.0,20242.0,20242.0,20242.0,20242.0,20242.0,20242.0,20242.0,20242.0
mean,2010.78678,3.118615,2.631311,1711.291374,1.339994,103.843159,787.372721,794.325956,1916.811853,739.733503,821.856021
std,15.66283,1.280566,1.461078,1850.382364,0.046302,0.054085,554.400468,529.386317,1059.405893,535.123264,615.858802
min,1963.0,1.0,1.0,65.0,1.239621,103.685206,5.257545,0.0,89.629657,0.0,0.0
25%,2000.0,2.0,2.0,807.0,1.307313,103.806671,401.576118,401.713836,1168.87972,371.143964,430.393277
50%,2017.0,3.0,2.0,1119.0,1.328225,103.841572,658.704159,678.778647,1681.479925,605.14608,685.280667
75%,2023.0,4.0,3.0,1528.0,1.371107,103.880155,999.991303,1039.547208,2423.052923,997.606995,1029.960484
max,2028.0,10.0,10.0,30000.0,1.461582,103.99751,4113.783288,4028.204443,5797.965651,4009.378093,4442.112669


In [11]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, minmax_scale
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import make_column_transformer

# All numeric columns are rescaled with MinMaxScaler.
num_vars = list(X_train.select_dtypes('number'))

# Per-column preprocessing:
#   * numeric columns        -> min-max scaling
#   * planning_area          -> target encoding (43 distinct values, so one-hot
#                               encoding would add many sparse columns)
#   * tenure                 -> one-hot, capped at 3 output categories
#   * property_type,
#     nearest_mrt_line       -> plain one-hot
# The one-hot encoders tolerate categories that only occur at transform time
# (e.g. in the test split) instead of raising: for `tenure` they fall into the
# infrequent bucket, for the other two columns they become an all-zero row.
# The encoding of the training data itself is unaffected by these options.
ct = make_column_transformer(
    (MinMaxScaler(), num_vars),
    (TargetEncoder(), ['planning_area']),
    (OneHotEncoder(max_categories=3, handle_unknown='infrequent_if_exist'), ['tenure']),
    (OneHotEncoder(handle_unknown='ignore'), ['property_type', 'nearest_mrt_line']),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Fit the column transformer.
# The target is min-max scaled for this fit only (y_train itself is not
# modified), so the target-encoded planning_area values fall in the same 0-1
# range as the scaled numeric features.
# NOTE(review): the target encoder is fitted on the full training set before
# any cross-validation split is made, so cross-validated scores computed on
# the transformed features may be optimistic. Wrapping `ct` and the regressor
# in a single Pipeline would re-fit the encoder inside each fold.
ct.fit(X_train, minmax_scale(y_train))



In [12]:
# Encode / scale both splits with the transformer fitted on the training data.
# The names are rebound to the transformed output, so this cell is meant to be
# run once per kernel session.
X_train, X_test = ct.transform(X_train), ct.transform(X_test)

## Fitting model with Cross Validation

In [13]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

# Each hyper-parameter has a single candidate value, so the grid search
# evaluates exactly one combination.
param_grid = {'alpha': [0.1], 'gamma': [0.1]}

# RBF kernel ridge regression, scored by negated RMSE (higher is better).
model = GridSearchCV(
    estimator=KernelRidge(kernel='rbf'),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
)
model.fit(X_train, y_train)

In [14]:
# Report the hyper-parameter combination selected by the grid search.
print(f'The best parameters are: {model.best_params_}')

The best parameters are: {'alpha': 0.1, 'gamma': 0.1}


In [15]:
# Transposed so that each CV metric is a row and each parameter combination a column.
cv_results = pd.DataFrame(model.cv_results_).transpose()
cv_results

Unnamed: 0,0
mean_fit_time,24.525093
std_fit_time,1.147311
mean_score_time,0.991567
std_score_time,0.053283
param_alpha,0.1
param_gamma,0.1
params,"{'alpha': 0.1, 'gamma': 0.1}"
split0_test_score,-2302014.978408
split1_test_score,-3282720.683281
split2_test_score,-1756500.400308


## Model Results

In [16]:
import os

import joblib

# Make sure the output directory exists: joblib.dump does not create it, and a
# missing directory would only surface here, after the model has been fitted.
os.makedirs('models', exist_ok=True)

# Save the fitted grid-search object.
# NOTE(review): the fitted column transformer `ct` is not saved in the cells
# shown here. The saved model expects already-transformed features, so `ct`
# would need to be persisted as well to score raw data in a new session.
joblib.dump(model, 'models/krr_model_with_aux_data.joblib')

['models/krr_model_with_aux_data.joblib']