In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm

## Reading data

In [2]:
# Held-out data set; a later cell takes its fold-1 rows as the test set.
s1 = pd.read_csv("../s1.csv")
# The loads below are disabled (presumably one file per starting year — confirm).
# NOTE(review): a later cell still uses df2021; re-enable its load here and its
# preprocess() call before running that cell, otherwise it raises NameError.
# df2001 = pd.read_csv("../df2001.csv")
# df2003 = pd.read_csv("../df2003.csv")
# df2005 = pd.read_csv("../df2005.csv")
# df2007 = pd.read_csv("../df2007.csv")
# df2009 = pd.read_csv("../df2009.csv")
# df2011 = pd.read_csv("../df2011.csv")
# df2013 = pd.read_csv("../df2013.csv")
# df2015 = pd.read_csv("../df2015.csv")
# df2017 = pd.read_csv("../df2017.csv")
# df2019 = pd.read_csv("../df2019.csv")
# df2021 = pd.read_csv("../df2021.csv")
# Training frame that is actually used by the active k_fold() run.
df2020 = pd.read_csv("../df2020.csv")

## Dropping columns and one-hot encoding (i.e. creating dummy variables)

In [3]:
def preprocess(csv_file):
  """Drop unused columns and one-hot encode the categorical columns.

  Removes ``lease_commence_date``, ``Unnamed: 0`` and ``address``, then
  replaces ``town``, ``flat_model`` and ``flat_type`` with 0.0/1.0 indicator
  columns named ``<column>_<category>``. The indicator columns are appended
  after the remaining columns, grouped in that order (town, flat_model,
  flat_type), with categories sorted within each group.

  The indicators are built on the frame's own index, so a frame that has been
  filtered or re-indexed is encoded row-for-row. Joining against a freshly
  numbered 0..n-1 frame, as was done before, silently misaligned such inputs.

  Args:
    csv_file: DataFrame containing all six columns named above.

  Returns:
    A new DataFrame; the input frame is not modified.

  Raises:
    KeyError: if any of the six expected columns is missing.

  Note:
    Categories come from the frame being encoded, so two frames with
    different category sets produce different columns. A missing (NaN)
    category yields all-zero indicators; no ``*_nan`` column is created.
  """
  dropped_cols = ['lease_commence_date', 'Unnamed: 0', 'address']
  categorical_cols = ['town', 'flat_model', 'flat_type']

  trimmed = csv_file.drop(dropped_cols, axis = 1)

  # dtype=float keeps the indicators as 0.0/1.0, matching the dense float
  # matrix the columns were previously built from.
  return pd.get_dummies(trimmed, columns = categorical_cols, dtype = float)


In [4]:
# Rebind each loaded frame to its preprocessed form.
# NOTE(review): this cell is not idempotent — preprocess() drops columns that
# are already gone on a second run, so re-running it raises KeyError.
s1     = preprocess(s1)
# Disabled, matching the commented-out loads of these frames.
# df2001 = preprocess(df2001)
# df2003 = preprocess(df2003)
# df2005 = preprocess(df2005)
# df2007 = preprocess(df2007)
# df2009 = preprocess(df2009)
# df2011 = preprocess(df2011)
# df2013 = preprocess(df2013)
# df2015 = preprocess(df2015)
# df2017 = preprocess(df2017)
# df2019 = preprocess(df2019)
# df2021 = preprocess(df2021)
df2020 = preprocess(df2020)

# Results log written by k_fold() and by the final summary cell. Mode "w"
# truncates any existing savefile.txt; the handle stays open until the
# summary cell closes it.
savefile = open("savefile.txt", "w")

## Splitting data into training and validation sets

In [5]:
# Test set S1.1: the fold-1 rows of s1, with the fold marker removed
test_rows = s1.loc[s1['k_fold'] == 1].drop(['k_fold'], axis = 1)

# Actual resale prices the predictions are scored against
y_test = test_rows['resale_price'].tolist()

# Feature matrix for the test rows; s1 ends up as the features-only frame
s1 = test_rows.drop(['resale_price'], axis = 1)
x_test = s1.to_numpy()


In [9]:
def k_fold(dataset, k: int) -> list:
  """Cross-validate a default SVR over folds 1..k and report the average SSE.

  For each fold number, rows whose ``k_fold`` value equals that number form
  the validation set and all remaining rows form the training set. A fresh
  ``svm.SVR()`` is fitted per fold and scored by sum of squared errors on the
  validation rows and on the notebook-level ``x_test`` / ``y_test``.

  The averaged results are written to the notebook-level ``savefile`` (and
  flushed) and then printed.

  Args:
    dataset: DataFrame with ``k_fold`` and ``resale_price`` columns; every
      other column is used as a feature.
    k: number of folds to iterate over (fold labels 1..k).

  Returns:
    ``[average validation SSE, average test SSE]``.
  """
  def split_xy(rows):
    """Return (feature matrix, target list) for the given rows."""
    rows = rows.drop(['k_fold'], axis = 1)
    targets = rows['resale_price'].tolist()
    features = rows.drop(['resale_price'], axis = 1).to_numpy()
    return features, targets

  valid_sse_total = 0
  test_sse_total = 0

  for fold_num in range(1, k + 1):
    print(f'Fold number {fold_num}...')

    # Current fold is held out for validation; everything else trains the model.
    x_validation, y_validation = split_xy(dataset.loc[dataset['k_fold'] == fold_num])
    x_train, y_train = split_xy(dataset.loc[dataset['k_fold'] != fold_num])

    # =========== modeling ==========
    # Support Vector Regression with default settings.
    # Alternative kept for reference:
    #   RandomForestRegressor(n_estimators = 110, max_depth = 14, random_state = 0)
    regressor = svm.SVR().fit(x_train, y_train)

    valid_residuals = y_validation - regressor.predict(x_validation)
    valid_sse_total += sum(valid_residuals**2)

    test_residuals = y_test - regressor.predict(x_test)
    test_sse_total += sum(test_residuals**2)

  valid_sse_avg = valid_sse_total / k
  test_sse_avg = test_sse_total / k

  savefile.writelines([
    f'K-fold Cross validation. Selected K = {k}\n',
    f'Average validation SSE = {valid_sse_avg}\n',
    f'Average testing SSE = {test_sse_avg}\n',
  ])
  savefile.flush()

  print(f'K-fold Cross Validation. Selected K = {k}')
  print(f'Average validation Sum of Squared Errors = {valid_sse_avg}')
  print(f'Average testing Sum of Squared Errors = {test_sse_avg}')

  return [valid_sse_avg, test_sse_avg]

In [44]:
def k_fold2(dataset) -> list:
  """Fit a default SVR on one hold-out split and report its SSE.

  Rows with ``k_fold == 1`` are used for validation and all other rows for
  training. The fitted ``svm.SVR()`` is scored by sum of squared errors on
  the validation rows and on the notebook-level ``x_test`` / ``y_test``.
  Both figures are printed; nothing is written to the results log.

  Args:
    dataset: DataFrame with ``k_fold`` and ``resale_price`` columns; every
      other column is used as a feature.

  Returns:
    ``[validation SSE, test SSE]``.
  """
  print("Training...")

  # Validation rows: fold 1, without the fold marker
  held_out = dataset.loc[dataset['k_fold'] == 1].drop(['k_fold'], axis = 1)
  y_validation = held_out['resale_price'].tolist()
  x_validation = held_out.drop(['resale_price'], axis = 1).to_numpy()

  # Training rows: every other fold, without the fold marker
  fitting = dataset.loc[dataset['k_fold'] != 1].drop(['k_fold'], axis = 1)
  y_train = fitting['resale_price'].tolist()
  x_train = fitting.drop(['resale_price'], axis = 1).to_numpy()

  # =========== modeling ==========
  # Support Vector Regression with default settings
  regressor = svm.SVR().fit(x_train, y_train)

  valid_residuals = y_validation - regressor.predict(x_validation)
  valid_sse = sum(valid_residuals**2)

  test_residuals = y_test - regressor.predict(x_test)
  test_sse = sum(test_residuals**2)

  print(f'Average validation Sum of Squared Errors = {valid_sse}')
  print(f'Average testing Sum of Squared Errors = {test_sse}')

  return [valid_sse, test_sse]

In [None]:
# One slot per candidate starting year. Each slot starts out holding its own
# index as a placeholder until a k_fold() result list is stored in it.
avg_results = list(range(11))

# Slot index -> starting-year label: every second year from 2001 to 2021.
year_range = {slot: str(2001 + 2 * slot) for slot in range(11)}


In [10]:
# Runs over the other year frames, kept for reference. They are disabled
# because those frames are not loaded in this notebook state.
# avg_results[0]= k_fold(df2001, 1)
# validation sse = 508794190916643.0
# test sse = 73370395116532.02
# k_fold2(df2003)
# k_fold2(df2005)
# avg_results[1]= k_fold(df2003)
# avg_results[2]= k_fold(df2005)
# avg_results[3]= k_fold(df2007, 1)
# avg_results[4]= k_fold(df2009, 1)
# avg_results[5]= k_fold(df2011, 1)
# avg_results[6]= k_fold(df2013, 1)
# avg_results[7]= k_fold(df2015, 1)
# avg_results[8]= k_fold(df2017, 1)
# avg_results[9]= k_fold(df2019, 1)
# Active run: 10-fold cross-validation on the 2020 frame.
# NOTE(review): the returned [validation, test] pair is only displayed, not
# stored in avg_results, so the smallest-error cell never sees this run.
k_fold(df2020, 10)


Fold number 1...
Fold number 2...
Fold number 3...
Fold number 4...
Fold number 5...
Fold number 6...
Fold number 7...
Fold number 8...
Fold number 9...
Fold number 10...
K-fold Cross Validation. Selected K = 10
Average validation Sum of Squared Errors = 52683558166932.65
Average testing Sum of Squared Errors = 34236858039764.113


[52683558166932.65, 34236858039764.113]

In [None]:
# Add an all-zero indicator column for the "Premium Maisonette" flat model
# (presumably absent from df2021, so its encoding lacks the column — confirm).
# The column length follows df2021 itself; a hard-coded row count raises
# ValueError as soon as the frame has a different number of rows.
# NOTE(review): df2021 only exists if its load and preprocess() lines, which
# are currently commented out, are re-enabled. The column is appended last, so
# confirm the feature order still matches the test features.
add_flat = np.zeros(len(df2021))
df2021["flat_model_Premium Maisonette"] = add_flat
avg_results[10]= k_fold(df2021, 10)

## Finding the smallest error

In [None]:
# Lowest error found, as [validation, test]; -1 means nothing was recorded.
min_error = [-1, -1]

# Slot (index into avg_results / year_range) that gave the lowest error,
# as [validation, test]; -1 means nothing was recorded.
min_error_index = [-1, -1]

# Only slots that actually hold a [validation, test] result are compared.
# Slots that were never filled still contain their integer placeholder, and
# indexing one of those aborts the cell with a TypeError.
recorded = [
    (slot, result)
    for slot, result in enumerate(avg_results)
    if isinstance(result, (list, tuple)) and len(result) >= 2
]

for col in range(2):
    if not recorded:
        break
    # min() returns the first of several equal minima, i.e. the earliest slot.
    best_slot, best_result = min(recorded, key=lambda entry: entry[1][col])
    min_error[col] = best_result[col]
    min_error_index[col] = best_slot

# Use a readable placeholder instead of a KeyError when nothing was recorded.
best_years = [year_range.get(slot, "n/a") for slot in min_error_index]

print(f'Best starting year for validation: {best_years[0]}')
print(f'Best starting year for testing: {best_years[1]}')

# Close the log even if one of the writes fails, so the handle is not leaked.
try:
    savefile.write("Finished!\n")
    savefile.write(f'Best starting year for validation: {best_years[0]}\n')
    savefile.write(f'Best starting year for testing: {best_years[1]}\n')
    savefile.flush()
finally:
    savefile.close()

In [None]:
# Show every slot, including the integer placeholders of runs that were skipped.
print(avg_results)