<a href="https://colab.research.google.com/github/Mak-Wei-Zheng/stgat_thesis/blob/main/RF_no_date.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Tue Apr 23 12:37:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   37C    P8              11W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py


In [3]:
# prompt: import libraries for linear regression using cuml
import numpy as np
import cupy as cp

import pandas as pd
import cudf as cd

import numba
import numba.cuda

from cuml import Ridge as cumlRidge

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
raw_data = pd.read_csv("/content/drive/MyDrive/raw_all_data.csv")

In [5]:
raw_data.shape

(162575, 24)

In [6]:
raw_data.dtypes

Date                                               object
SORA Index                                        float64
Compound SORA - 1 month                           float64
Compound SORA - 3 month                           float64
Quarterly Composite Leading Index                 float64
Services Inflation Measure (Index)                float64
Retail & Other Goods Inflation Measure (Index)    float64
Electricity & Gas Inflation Measure (Index)       float64
Week                                              float64
Year                                              float64
Transacted Price ($)                              float64
Area (SQFT)                                       float64
Type of Sale                                       object
Type of Area                                       object
Property Type                                      object
Postal District                                   float64
Planning Region                                    object
Planning Area 

In [146]:
raw_data["Transacted Price ($)"].head()

0    2160605.75
1    1186731.00
2    1979959.12
3    2820688.12
4    1299973.08
Name: Transacted Price ($), dtype: float64

### This portion of the code is for grid search over RF variables with my custom train-val-test split

In [156]:
# trim data
columns_to_drop = ["SORA Index", "Compound SORA - 1 month", "Sale Date", "latitude", "longitude"]
rf_total_df = raw_data.drop(columns_to_drop, axis=1)
rf_total_df.head()

Unnamed: 0,Date,Compound SORA - 3 month,Quarterly Composite Leading Index,Services Inflation Measure (Index),Retail & Other Goods Inflation Measure (Index),Electricity & Gas Inflation Measure (Index),Week,Year,Transacted Price ($),Area (SQFT),Type of Sale,Type of Area,Property Type,Postal District,Planning Region,Planning Area,is_freehold,tenure_remaining,region classification
0,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2160605.75,2335.79,Resale,Strata,Condominium,11.0,Central Region,Bukit Timah,True,,CCR
1,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1186731.0,968.76,New Sale,Strata,Condominium,13.0,Central Region,Toa Payoh,False,35452.0,RCR
2,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1979959.12,1065.64,Resale,Strata,Condominium,10.0,Central Region,Tanglin,True,,CCR
3,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2820688.12,1636.13,Resale,Strata,Condominium,4.0,Central Region,Bukit Merah,False,30825.0,RCR
4,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1299973.08,871.88,Sub Sale,Strata,Condominium,20.0,North East Region,Ang Mo Kio,False,33864.0,OCR


In [8]:
# categorical encoding
one_hot_cols = ["region classification", "Type of Area", "Type of Sale"]
binary_cols = ["Planning Area", "Postal District", "Property Type", "Planning Region"]

In [9]:
!pip install category-encoders

Collecting category-encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category-encoders
Successfully installed category-encoders-2.6.3


In [11]:
import category_encoders as ce

# Create a binary encoder
encoder = ce.BinaryEncoder(cols=binary_cols)

# Fit and transform to generate binary encoded DataFrame
binary_encoded_df = encoder.fit_transform(rf_total_df)

# one-hot encode
total_encoded_df = pd.get_dummies(binary_encoded_df, columns=one_hot_cols, prefix='cat')

In [12]:
total_encoded_df.dtypes

Date                                               object
Compound SORA - 3 month                           float64
Quarterly Composite Leading Index                 float64
Services Inflation Measure (Index)                float64
Retail & Other Goods Inflation Measure (Index)    float64
Electricity & Gas Inflation Measure (Index)       float64
Week                                              float64
Year                                              float64
Transacted Price ($)                              float64
Area (SQFT)                                       float64
Property Type_0                                     int64
Property Type_1                                     int64
Property Type_2                                     int64
Postal District_0                                   int64
Postal District_1                                   int64
Postal District_2                                   int64
Postal District_3                                   int64
Postal Distric

In [13]:
# convert date to datetime object
from datetime import datetime

total_encoded_df["Date"] = total_encoded_df["Date"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))

In [157]:
total_encoded_df["Transacted Price ($)"] = rf_total_df["Transacted Price ($)"]

In [158]:
total_encoded_df["tenure_remaining"].fillna(0, inplace=True)
total_encoded_df.dropna(axis=0, inplace=True)
total_encoded_df.head()

Unnamed: 0,Date,Compound SORA - 3 month,Quarterly Composite Leading Index,Services Inflation Measure (Index),Retail & Other Goods Inflation Measure (Index),Electricity & Gas Inflation Measure (Index),Week,Year,Transacted Price ($),Area (SQFT),...,is_freehold,tenure_remaining,cat_CCR,cat_OCR,cat_RCR,cat_Land,cat_Strata,cat_New Sale,cat_Resale,cat_Sub Sale
0,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2160605.75,2335.79,...,True,0.0,True,False,False,False,True,False,True,False
1,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1186731.0,968.76,...,False,35452.0,False,False,True,False,True,True,False,False
2,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1979959.12,1065.64,...,True,0.0,True,False,False,False,True,False,True,False
3,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2820688.12,1636.13,...,False,30825.0,False,False,True,False,True,False,True,False
4,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1299973.08,871.88,...,False,33864.0,False,True,False,False,True,False,False,True


In [159]:
import sklearn
print(sklearn.__version__)

1.2.2


In [160]:
encoded_cudf = cd.from_pandas(total_encoded_df)

In [161]:
encoded_cudf.head()

Unnamed: 0,Date,Compound SORA - 3 month,Quarterly Composite Leading Index,Services Inflation Measure (Index),Retail & Other Goods Inflation Measure (Index),Electricity & Gas Inflation Measure (Index),Week,Year,Transacted Price ($),Area (SQFT),...,is_freehold,tenure_remaining,cat_CCR,cat_OCR,cat_RCR,cat_Land,cat_Strata,cat_New Sale,cat_Resale,cat_Sub Sale
0,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2160605.75,2335.79,...,True,0.0,True,False,False,False,True,False,True,False
1,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1186731.0,968.76,...,False,35452.0,False,False,True,False,True,True,False,False
2,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1979959.12,1065.64,...,True,0.0,True,False,False,False,True,False,True,False
3,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2820688.12,1636.13,...,False,30825.0,False,False,True,False,True,False,True,False
4,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1299973.08,871.88,...,False,33864.0,False,True,False,False,True,False,False,True


In [162]:
# convert week and year to int
encoded_cudf["Year"] = encoded_cudf["Year"].astype(int)
encoded_cudf["Week"] = encoded_cudf["Week"].astype(int)

In [163]:
def filter_years(df, start_year, end_year=None):
    if end_year is None:
        end_year = start_year
    return df[(df['Year'] >= start_year) & (df['Year'] <= end_year)]

def get_latest_year_indices(df):
    max_year = df['Year'].max()
    return df[df['Year'] == max_year].index

search_set = filter_years(encoded_cudf, 2014, 2022) # representing ~9 years of data

final_train_set = filter_years(encoded_cudf, 2019, 2022)
final_test_set = filter_years(encoded_cudf, 2023)

print(search_set.shape, final_train_set.shape, final_test_set.shape)

(146243, 37) (76887, 37) (16229, 37)


In [164]:
# import dependencies
from cuml.ensemble import RandomForestRegressor
from cuml.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
import numpy as np


In [165]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("regressor", RandomForestRegressor())
])

In [166]:
param_grid = {
    'regressor__n_estimators': [20, 50, 100, 200],
    'regressor__max_depth': [3, 5, 10, 20],
    'regressor__bootstrap': [True, False],
    'regressor__split_criterion': [2,3,4,5]
}

In [167]:
# going for a 80-20 training split
tscv = TimeSeriesSplit(n_splits=9, max_train_size=4)

In [168]:
grid_search = GridSearchCV(pipeline, param_grid, cv=tscv, verbose=2)

In [169]:
features_to_drop = ["Year", "Week", "Transacted Price ($)", "Date"]
features_to_keep = [feature for feature in encoded_cudf.columns if feature not in features_to_drop]

In [170]:
# prepare data
rf_variables = search_set.drop(features_to_drop, axis=1).values
rf_target = search_set["Transacted Price ($)"].values

In [None]:
# Fit grid search (make sure to pass all columns except 'year' if it's not a feature)
grid_search.fit(rf_variables, rf_target)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [172]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'regressor__bootstrap': True, 'regressor__max_depth': 3, 'regressor__n_estimators': 100, 'regressor__split_criterion': 4}
Best score: -0.07636038680270536


In [None]:
results_df = pd.DataFrame(grid_search.cv_results_)
results_df

In [188]:
final_model = pipeline.set_params(**grid_search.best_params_)

In [190]:
# prepare data
train_variables = final_train_set.drop(features_to_drop, axis=1).values
train_target = final_train_set["Transacted Price ($)"].values

In [191]:
final_model.fit(train_variables,train_target)

  ret = func(*args, **kwargs)


In [192]:
X_test = final_test_set.drop(features_to_drop, axis=1).values
y_test = final_test_set["Transacted Price ($)"].values

In [194]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [195]:
# Assume X_test and y_test are your test features and target variable
# Make predictions
y_pred = pipeline.predict(X_test)
print(y_pred)
print(y_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

rmse = np.sqrt(mean_squared_error(y_test.get(), y_pred.get()))
mae = mean_absolute_error(y_test.get(), y_pred.get())
mape = mean_absolute_percentage_error(y_test.get(), y_pred.get())
r2 = r2_score(y_test.get(), y_pred.get())

# Print the performance metrics
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"MAPE: {mape}")
print(f"R^2 Score: {r2}")


[1434124.04815845 1429295.7874423  1429295.7874423  ... 1429295.7874423
 1429295.7874423  1429295.7874423 ]
[1338630.03  1823554.9   1060213.68  ... 2190680.8   1994234.588
 2691080.43 ]
Root Mean Squared Error: 1280325.3374180172
Mean Absolute Error: 601259.2494045702
MAPE: 23.761981238757173
R^2 Score: 0.6538809359207225


#### Exploring using log(price) instead

In [174]:
total_encoded_log_df = total_encoded_df
total_encoded_log_df["Transacted Price ($)"] = total_encoded_log_df["Transacted Price ($)"].apply(np.log)
encoded_log_cudf = cd.from_pandas(total_encoded_log_df)

In [175]:
search_log_set = filter_years(encoded_log_cudf, 2014, 2022) # representing ~9 years of data

final_log_train_set = filter_years(encoded_log_cudf, 2019, 2022)
final_log_test_set = filter_years(encoded_log_cudf, 2023)

print(search_set.shape, final_train_set.shape, final_test_set.shape)

(146243, 37) (76887, 37) (16229, 37)


In [205]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler())
    ("regressor", RandomForestRegressor())
])

  ("scaler", MinMaxScaler())


TypeError: 'tuple' object is not callable

In [206]:
log_grid_search = GridSearchCV(pipeline, param_grid, cv=tscv, verbose=2)

In [207]:
# prepare data
rf_log_variables = search_log_set.drop(features_to_drop, axis=1).values
rf_log_target = search_log_set["Transacted Price ($)"].values

In [208]:
# Fit grid search (make sure to pass all columns except 'year' if it's not a feature)
log_grid_search.fit(rf_log_variables, rf_log_target)
print("Best parameters:", log_grid_search.best_params_)
print("Best score:", log_grid_search.best_score_)

Fitting 9 folds for each of 128 candidates, totalling 1152 fits
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=2; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=2; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=2; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=2; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=2; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=2; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion

  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)


[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=3; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=3; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=3; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=3; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=3; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=3; total time=   0.0s
[CV] END regressor__bootstrap=True, regressor__max_depth=3, regressor__n_estimators=20, regressor__split_criterion=4; total time=   0.0s
[CV] END regressor__bootstrap=True, regre

288 fits failed out of a total of 1152.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
288 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/nvtx/nvtx.py", line 116, in inner
    result = func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/cuml/internals/api_decorators.py", line 188, in wrapper
    ret = func(*args, **kwargs)
  File "random

Best parameters: {'regressor__bootstrap': True, 'regressor__max_depth': 3, 'regressor__n_estimators': 100, 'regressor__split_criterion': 2}
Best score: -0.3387350150040047


In [209]:
print("Best parameters:", log_grid_search.best_params_)
print("Best score:", log_grid_search.best_score_)

Best parameters: {'regressor__bootstrap': True, 'regressor__max_depth': 3, 'regressor__n_estimators': 100, 'regressor__split_criterion': 2}
Best score: -0.3387350150040047


In [210]:
final_log_model = pipeline.set_params(**log_grid_search.best_params_)

In [211]:
# prepare data
train_log_variables = final_log_train_set.drop(features_to_drop, axis=1).values
train_log_target = final_log_train_set["Transacted Price ($)"].values

In [212]:
final_log_model.fit(train_log_variables,train_log_target)

In [213]:
final_log_test_set.head()

Unnamed: 0,Date,Compound SORA - 3 month,Quarterly Composite Leading Index,Services Inflation Measure (Index),Retail & Other Goods Inflation Measure (Index),Electricity & Gas Inflation Measure (Index),Week,Year,Transacted Price ($),Area (SQFT),...,is_freehold,tenure_remaining,cat_CCR,cat_OCR,cat_RCR,cat_Land,cat_Strata,cat_New Sale,cat_Resale,cat_Sub Sale
146346,2023-01-02,3.0966,110.5,105.128,100.221,115.38,1.0,2023.0,14.107157,1054.87,...,False,35449.0,False,True,False,False,True,True,False,False
146347,2023-01-02,3.0966,110.5,105.128,100.221,115.38,1.0,2023.0,14.416298,882.65,...,True,0.0,False,True,False,False,True,True,False,False
146348,2023-01-03,3.0294,110.5,105.128,100.221,115.38,1.0,2023.0,13.873981,775.01,...,False,31817.0,False,True,False,False,True,False,True,False
146349,2023-01-03,3.0294,110.5,105.128,100.221,115.38,1.0,2023.0,14.450983,678.13,...,True,0.0,False,False,True,False,True,True,False,False
146350,2023-01-03,3.0294,110.5,105.128,100.221,115.38,1.0,2023.0,14.371982,1334.74,...,False,32720.0,False,True,False,False,True,False,True,False


In [202]:
X_test = final_log_test_set.drop(features_to_drop, axis=1).values
y_test = final_test_set["Transacted Price ($)"].values

In [204]:
# Assume X_test and y_test are your test features and target variable
# Make predictions
y_log_pred = pipeline.predict(X_test)
y_pred = cp.exp(y_log_pred)
print(y_pred)
print(y_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

rmse = np.sqrt(mean_squared_error(y_test.get(), y_pred.get()))
mae = mean_absolute_error(y_test.get(), y_pred.get())
mape = mean_absolute_percentage_error(y_test.get(), y_pred.get())
r2 = r2_score(y_test.get(), y_pred.get())

# Print the performance metrics
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"MAPE: {mape}")
print(f"R^2 Score: {r2}")


[1205195.77657142 1205195.77657142 1205195.77657142 ... 1205195.77657142
 1760454.30947889 1760454.30947889]
[1338630.03  1823554.9   1060213.68  ... 2190680.8   1994234.588
 2691080.43 ]
Root Mean Squared Error: 1420698.4199479262
Mean Absolute Error: 623067.5841317202
MAPE: 22.879344720112485
R^2 Score: 0.5738243470411044
