In [41]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
import matplotlib.pyplot as plt

In [43]:
def print_full(x):
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', None)
    print(x)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')

In [44]:
df_train = pd.read_csv("./datasets/train-data-cleaned-v2.csv")
df_test = pd.read_csv("./datasets/test-data-cleaned-v2.csv")

df_train = df_train.loc[:, ~df_train.columns.str.contains('^Unnamed')]
df_test = df_test.loc[:, ~df_test.columns.str.contains('^Unnamed')]

# df_train = df_train.iloc[:,0:29]
# df_test = df_test.iloc[:,0:28]

df_train = df_train.reindex(sorted(df_train.columns), axis=1)
df_test = df_test.reindex(sorted(df_test.columns), axis=1)



In [45]:
print(df_train.shape)
print(df_test.shape)

df_X, df_y = df_train.loc[:, df_train.columns != 'monthly_rent'], df_train['monthly_rent']

X, y = df_X.to_numpy(), df_y.to_numpy()

print_full(df_train.head(1))
print()
print_full(df_test.head(1))

(60000, 79)
(60000, 78)
                0799hk               1792hk               2588hk               a17usi                ap4si                 asln                bn4si               buousi                c07si                c09si               c38usi                c52si                c6lsi               cjlusi                d05si                e5hsi                f34si  flat_model                 flex       floor_area_sqm                g07si                g13si                 grab                 grin                hmnsi               k71usi                  ken                 klic             latitude            longitude               m44usi                 maxn               me8usi  monthly_avg_rent_by_flat_model  monthly_avg_rent_by_planning_area  monthly_avg_rent_by_region  monthly_avg_rent_by_subzone  monthly_avg_rent_per_sqm_by_flat_model  monthly_avg_rent_per_sqm_by_planning_area  monthly_avg_rent_per_sqm_by_region  monthly_avg_rent_per_sqm_by_subzone  monthly_r

In [46]:
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.neighbors  import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_validate

In [47]:
%%time

print(X.shape)
print(y.shape)
# Only considered hyperparameter: max depth of trees
param_choices = [1, 2, 3, 5, 8, 10, 12, 15, 20, 25, 50,100]
# param_choices = [100,150,200]

# Keep track of results for visualization
param_to_scores = {}

for param in param_choices:

    # Train regressor with the current parameter setting
    # regressor = DecisionTreeRegressor(max_depth=param)
    # regressor = RandomForestRegressor(max_depth=param)
    regressor = KNeighborsRegressor(n_neighbors=param)
    
    # Perform 10-fold cross_validations
    scores = cross_validate(regressor, X, y, cv=10, scoring='neg_root_mean_squared_error', return_train_score=True)
    
    # Extract the 10 RSME scores (training scores and validation scores) for each run/fold
    # The (-1) is only needed since we get the negative root mean squared errors (it's a sklearn thing)
    rsme_train = scores['train_score'] * (-1)
    rsme_valid = scores['test_score'] * (-1)
    
    ## Keep track of all num_folds f1 scores for current param (for plotting)
    param_to_scores[param] = (rsme_train, rsme_valid)
    
    ## Print statement for some immediate feedback (values in parenthesis represent the Standard Deviation)
    print('param = {}, RSME training = {:.1f} ({:.1f}), RSME validation = {:.1f} ({:.1f})'
          .format(param, np.mean(rsme_train), np.std(rsme_train), np.mean(rsme_valid), np.std(rsme_valid)))

(60000, 78)
(60000,)
param = 1, RSME training = 173.3 (1.1), RSME validation = 682.2 (10.6)
param = 2, RSME training = 343.8 (0.8), RSME validation = 593.5 (8.2)
param = 3, RSME training = 395.1 (0.6), RSME validation = 560.5 (6.0)
param = 5, RSME training = 435.1 (0.8), RSME validation = 534.2 (6.3)
param = 8, RSME training = 458.3 (0.7), RSME validation = 520.6 (6.0)
param = 10, RSME training = 466.4 (0.6), RSME validation = 517.2 (6.3)
param = 12, RSME training = 472.7 (0.6), RSME validation = 515.2 (6.1)
param = 15, RSME training = 479.8 (0.8), RSME validation = 513.9 (6.2)
param = 20, RSME training = 487.6 (0.7), RSME validation = 513.3 (5.9)
param = 25, RSME training = 492.8 (0.7), RSME validation = 513.6 (5.6)
param = 50, RSME training = 508.6 (0.7), RSME validation = 519.3 (5.4)
param = 100, RSME training = 525.0 (0.7), RSME validation = 530.6 (6.0)
Wall time: 1h 51min 28s


In [48]:
regressor = KNeighborsRegressor(n_neighbors=15)
pred = regressor.fit(X,y).predict(df_test.to_numpy())

In [49]:
df = pd.DataFrame({'Id': np.arange(len(pred)), 'Predicted': pred})
df.to_csv("./pred/cx_knn.csv", index=False)