In [45]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
import joblib

## Reading data

In [46]:
s1raw     = pd.read_csv("../data/s1.csv")
s1_2raw   = pd.read_csv("../data/s1_2u.csv")
df2017raw = pd.read_csv("../data/df2017.csv")

## Droping cols and OneHotEncoding aka set dummy variable

In [47]:
def preprocess(csv_file):
  csv_file = csv_file.drop([
    'Unnamed: 0',
    'month', 
    'flat_type', 
    'address',
    'lease_commence_date'
    # these 5, try removing to see the effects on data
    ,'year'
    ,'years'
    ,'quarter'
    #,'Lat'
    #,'Lon'
    ],axis = 1)

  onehot = OneHotEncoder()

  town_df = pd.DataFrame(onehot.fit_transform(csv_file[['town']]).toarray())
  town_df.columns = onehot.get_feature_names_out(['town'])

  flat_model_df = pd.DataFrame(onehot.fit_transform(csv_file[['flat_model']]).toarray())
  flat_model_df.columns = onehot.get_feature_names_out(['flat_model'])

  csv_file = csv_file.join([town_df, flat_model_df])
  csv_file = csv_file.drop(['town','flat_model'], axis = 1)

  return csv_file


In [48]:
s1_2 = preprocess(s1_2raw)

add_flat = np.zeros(1091)
## since flat_model_Adjoined flat is not in the data 
s1_2["flat_model_Adjoined flat"] = add_flat
cols = list(s1_2.columns.values)
## reorder back to the original col order
s1_2 = s1_2[cols]

df2017 = preprocess(df2017raw)

savefile = open("savefile.txt", "w")

print(cols)

['floor_area_sqm', 'resale_price', 'remaining_lease', 'index', 'Lat', 'Lon', 'dist_presch', 'dist_gym', 'dist_park', 'dist_pharm', 'dist_mrt', 'dist_hawker', 'storey_index', 'k_fold', 'town_BUKIT BATOK', 'town_BUKIT MERAH', 'town_CLEMENTI', 'town_JURONG WEST', 'town_SENGKANG', 'town_TAMPINES', 'town_TOA PAYOH', 'town_WOODLANDS', 'flat_model_2-room', 'flat_model_Apartment', 'flat_model_DBSS', 'flat_model_Improved', 'flat_model_Maisonette', 'flat_model_Model A', 'flat_model_Model A-Maisonette', 'flat_model_Model A2', 'flat_model_Multi Generation', 'flat_model_New Generation', 'flat_model_Premium Apartment', 'flat_model_Premium Maisonette', 'flat_model_Simplified', 'flat_model_Standard', 'flat_model_Adjoined flat']


## Spliting data to training and validation set 

In [49]:
# Test set S1.2.2
s1_2 = s1_2.loc[s1_2['k_fold'] == 2]
s1_2 = s1_2.drop(['k_fold'],axis = 1)

# "Classes" / actual values
y_test = s1_2['resale_price'].tolist()

# Values to test on
s1_2 = s1_2.drop(['resale_price'], axis=1)
x_test = s1_2.to_numpy()


In [50]:
def k_fold(dataset, k: int):
  test_num_of_rows = s1_2raw[s1_2raw["k_fold"] == 1].count()["k_fold"]
  # Hyper params
  max_feat = "auto"
  min_sam_split = 2
  min_sam_leaf = 1
  min_weight = 0
  crit = "squared_error"
  #for max_feat in ['auto', 'sqrt', 'log2']:
  #for min_sam_split in range(2, 110, 10):
  #for min_sam_leaf in range(1, 110, 10):
  #for min_weight in [i * 0.1 for i in range(0, 6)]:
  #for crit in ['squared_error', 'poisson', 'absolute_error']:
  p_diff_test_saved = 1
  p_diff_valid_saved = 1

  print(f'starting...', end = "")

  # load from saved model.
  # Uncomment EITHER this or the one below (MODEL)
  model = joblib.load("savedModel.joblib")

  for fold_num in range(1, k + 1):
    print(f'{fold_num}', end = "")

    # Validation set
    validation    = dataset.loc[dataset['k_fold'] == fold_num]
    validation    = validation.drop(['k_fold'], axis = 1)
    y_validation  = validation['resale_price'].tolist()
    validation    = validation.drop(['resale_price'], axis = 1)
    x_validation  = validation.to_numpy()

    # Training set
    train   = dataset.loc[dataset['k_fold'] != fold_num]
    train   = train.drop(['k_fold'], axis = 1)
    y_train = train['resale_price'].tolist()
    train   = train.drop(['resale_price'], axis = 1)
    x_train = train.to_numpy()

    # Random Forest
    # Uncomment EITHER this or the one above (MODEL)
    # model = RandomForestRegressor(
    #   n_estimators = 90, 
    #   max_depth = 18, 
    #   random_state = 0, 
    #   n_jobs = -1,
    #   criterion = crit,
    #   max_features = max_feat,
    #   min_samples_split = min_sam_split,
    #   min_samples_leaf = min_sam_leaf,
    #   min_weight_fraction_leaf = min_weight      
    #   )
    # model.fit(x_train, y_train)
    
    y_test_pred = model.predict(x_test)
    y_validation_pred = model.predict(x_validation)

    # avg % diff in each fold
    num_of_rows = df2017raw[df2017raw["k_fold"] == fold_num].count()["k_fold"]
    p_diff_valid = sum(abs(y_validation - y_validation_pred) / y_validation) / num_of_rows
    p_diff_test = sum(abs(y_test - y_test_pred) / y_test) / test_num_of_rows

    if (p_diff_test < p_diff_test_saved):
      p_diff_test_saved = p_diff_test
      p_diff_valid_saved = p_diff_valid
      # to save:
      #joblib.dump(model, "savedModel4.joblib", compress=3)
      # to load:
      # model = joblib.load("savedModel.joblib")
    
    savefile.write(f'{fold_num}, {p_diff_valid}, {p_diff_test}\n')
    savefile.flush()
    print(f', {p_diff_valid}, {p_diff_test}')

  print("\n", end = "")
  print(f'Best - valid: {p_diff_valid_saved}, test: {p_diff_test_saved}')
  #savedModel  - valid: 0.03970219427728985, test: 0.0640816996101494
  #savedModel2 - valid: 0.03970219427728984, test: 0.06408169961014941
  #savedmodel3 - valid: 0.039702194277289854, test: 0.06408169961014941

In [51]:
k_fold(df2017, 10)

starting...1, 0.021959855967716366, 0.0640816996101494
2, 0.021650838792550517, 0.06408169961014938
3, 0.021220784676266046, 0.0640816996101494
4, 0.021142484170544307, 0.0640816996101494
5, 0.022021697685312615, 0.0640816996101494
6, 0.021360590694391262, 0.06408169961014941
7, 0.021394681034410157, 0.06408169961014938
8, 0.039702194277289854, 0.0640816996101494
9, 0.021549174093342904, 0.0640816996101494
10, 0.02210983643009377, 0.0640816996101494

Best - valid: 0.021650838792550517, test: 0.06408169961014938
