<a href="https://colab.research.google.com/github/Mak-Wei-Zheng/stgat_thesis/blob/main/Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [70]:
!nvidia-smi

Tue Apr 23 16:26:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0              28W /  70W |    349MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [71]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

fatal: destination path 'rapidsai-csp-utils' already exists and is not an empty directory.
Traceback (most recent call last):
  File "/content/rapidsai-csp-utils/colab/pip-install.py", line 30, in <module>
    if ('P' not in gpu_name):
TypeError: a bytes-like object is required, not 'str'


In [72]:
# prompt: import libraries for linear regression using cuml
import numpy as np
import cupy as cp

import pandas as pd
import cudf as cd

import numba
import numba.cuda

from cuml import Ridge as cumlRidge

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, GridSearchCV

In [73]:
raw_data = pd.read_csv("/content/drive/MyDrive/raw_all_data.csv")

In [74]:
raw_data.shape

(162575, 24)

In [75]:
raw_data.dtypes

Date                                               object
SORA Index                                        float64
Compound SORA - 1 month                           float64
Compound SORA - 3 month                           float64
Quarterly Composite Leading Index                 float64
Services Inflation Measure (Index)                float64
Retail & Other Goods Inflation Measure (Index)    float64
Electricity & Gas Inflation Measure (Index)       float64
Week                                              float64
Year                                              float64
Transacted Price ($)                              float64
Area (SQFT)                                       float64
Type of Sale                                       object
Type of Area                                       object
Property Type                                      object
Postal District                                   float64
Planning Region                                    object
Planning Area 

In [76]:
raw_data["Transacted Price ($)"].head()

0    2160605.75
1    1186731.00
2    1979959.12
3    2820688.12
4    1299973.08
Name: Transacted Price ($), dtype: float64

### This portion of the code is for grid search over LR variables with my custom train-val-test split

In [77]:
# trim data
columns_to_drop = ["SORA Index", "Compound SORA - 1 month", "Sale Date"]
lr_total_df = raw_data.drop(columns_to_drop, axis=1)
lr_total_df.head()

Unnamed: 0,Date,Compound SORA - 3 month,Quarterly Composite Leading Index,Services Inflation Measure (Index),Retail & Other Goods Inflation Measure (Index),Electricity & Gas Inflation Measure (Index),Week,Year,Transacted Price ($),Area (SQFT),...,Type of Area,Property Type,Postal District,Planning Region,Planning Area,latitude,longitude,is_freehold,tenure_remaining,region classification
0,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2160605.75,2335.79,...,Strata,Condominium,11.0,Central Region,Bukit Timah,35.41774,-78.461267,True,,CCR
1,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1186731.0,968.76,...,Strata,Condominium,13.0,Central Region,Toa Payoh,55.915648,-3.156864,False,35452.0,RCR
2,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1979959.12,1065.64,...,Strata,Condominium,10.0,Central Region,Tanglin,52.90605,-1.375444,True,,CCR
3,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2820688.12,1636.13,...,Strata,Condominium,4.0,Central Region,Bukit Merah,1.265778,103.814563,False,30825.0,RCR
4,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1299973.08,871.88,...,Strata,Condominium,20.0,North East Region,Ang Mo Kio,1.370357,103.848536,False,33864.0,OCR


In [78]:
# categorical encoding
one_hot_cols = ["region classification", "Type of Area", "Type of Sale"]
binary_cols = ["Planning Area", "Postal District", "Property Type", "Planning Region"]

In [79]:
!pip install category-encoders



In [80]:
import category_encoders as ce

# Create a binary encoder
encoder = ce.BinaryEncoder(cols=binary_cols)

# Fit and transform to generate binary encoded DataFrame
binary_encoded_df = encoder.fit_transform(lr_total_df)

# one-hot encode
total_encoded_df = pd.get_dummies(binary_encoded_df, columns=one_hot_cols, prefix='cat')

In [81]:
total_encoded_df.dtypes

Date                                               object
Compound SORA - 3 month                           float64
Quarterly Composite Leading Index                 float64
Services Inflation Measure (Index)                float64
Retail & Other Goods Inflation Measure (Index)    float64
Electricity & Gas Inflation Measure (Index)       float64
Week                                              float64
Year                                              float64
Transacted Price ($)                              float64
Area (SQFT)                                       float64
Property Type_0                                     int64
Property Type_1                                     int64
Property Type_2                                     int64
Postal District_0                                   int64
Postal District_1                                   int64
Postal District_2                                   int64
Postal District_3                                   int64
Postal Distric

In [82]:
# convert date to datetime object
from datetime import datetime

total_encoded_df["Date"] = total_encoded_df["Date"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))

In [83]:
total_encoded_df["Transacted Price ($)"] = lr_total_df["Transacted Price ($)"]

In [84]:
total_encoded_df["tenure_remaining"].fillna(0, inplace=True)
total_encoded_df.dropna(axis=0, inplace=True)
total_encoded_df.head()

Unnamed: 0,Date,Compound SORA - 3 month,Quarterly Composite Leading Index,Services Inflation Measure (Index),Retail & Other Goods Inflation Measure (Index),Electricity & Gas Inflation Measure (Index),Week,Year,Transacted Price ($),Area (SQFT),...,is_freehold,tenure_remaining,cat_CCR,cat_OCR,cat_RCR,cat_Land,cat_Strata,cat_New Sale,cat_Resale,cat_Sub Sale
0,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2160605.75,2335.79,...,True,0.0,True,False,False,False,True,False,True,False
1,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1186731.0,968.76,...,False,35452.0,False,False,True,False,True,True,False,False
2,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1979959.12,1065.64,...,True,0.0,True,False,False,False,True,False,True,False
3,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2820688.12,1636.13,...,False,30825.0,False,False,True,False,True,False,True,False
4,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1299973.08,871.88,...,False,33864.0,False,True,False,False,True,False,False,True


In [85]:
import sklearn
print(sklearn.__version__)

1.2.2


In [86]:
encoded_cudf = cd.from_pandas(total_encoded_df)

In [87]:
encoded_cudf.head()

Unnamed: 0,Date,Compound SORA - 3 month,Quarterly Composite Leading Index,Services Inflation Measure (Index),Retail & Other Goods Inflation Measure (Index),Electricity & Gas Inflation Measure (Index),Week,Year,Transacted Price ($),Area (SQFT),...,is_freehold,tenure_remaining,cat_CCR,cat_OCR,cat_RCR,cat_Land,cat_Strata,cat_New Sale,cat_Resale,cat_Sub Sale
0,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2160605.75,2335.79,...,True,0.0,True,False,False,False,True,False,True,False
1,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1186731.0,968.76,...,False,35452.0,False,False,True,False,True,True,False,False
2,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1979959.12,1065.64,...,True,0.0,True,False,False,False,True,False,True,False
3,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,2820688.12,1636.13,...,False,30825.0,False,False,True,False,True,False,True,False
4,2014-02-28,0.0444,99.8,93.063,97.714,117.098,9.0,2014.0,1299973.08,871.88,...,False,33864.0,False,True,False,False,True,False,False,True


In [88]:
# convert week and year to int
encoded_cudf["Year"] = encoded_cudf["Year"].astype(int)
encoded_cudf["Week"] = encoded_cudf["Week"].astype(int)

In [89]:
def filter_years(df, start_year, end_year=None):
    if end_year is None:
        end_year = start_year
    return df[(df['Year'] >= start_year) & (df['Year'] <= end_year)]

def get_latest_year_indices(df):
    max_year = df['Year'].max()
    return df[df['Year'] == max_year].index

search_set = filter_years(encoded_cudf, 2014, 2022) # representing ~9 years of data

final_train_set = filter_years(encoded_cudf, 2019, 2022)
final_test_set = filter_years(encoded_cudf, 2023)

print(search_set.shape, final_train_set.shape, final_test_set.shape)

(146243, 39) (76887, 39) (16229, 39)


In [90]:
# import dependencies
from cuml.linear_model import LinearRegression
from cuml.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
import numpy as np

In [91]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("regressor", LinearRegression())
])

  return init_func(self, *args, **filtered_kwargs)


In [98]:
param_grid = {
    # 'regressor__alpha': [100, 10, 1.0, 0.1, 0.01],
    'regressor__fit_intercept': [True, False],
    'regressor__normalize': [True, False]
}

In [99]:
# going for a 80-20 training split
tscv = TimeSeriesSplit(n_splits=9, max_train_size=4)

In [100]:
grid_search = GridSearchCV(pipeline, param_grid, cv=tscv, verbose=2)

In [101]:
features_to_drop = ["Transacted Price ($)", "Date"]
features_to_keep = [feature for feature in encoded_cudf.columns if feature not in features_to_drop]

In [102]:
# prepare data
lr_variables = search_set.drop(features_to_drop, axis=1).values
lr_target = search_set["Transacted Price ($)"].values

In [103]:
# Fit grid search (make sure to pass all columns except 'year' if it's not a feature)
grid_search.fit(lr_variables, lr_target)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 9 folds for each of 4 candidates, totalling 36 fits


  jitify._init_module()


[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=  55.4s
[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=False; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__normalize=False; total time=   0.0s
[CV] END regressor__fit_intercept=True, regressor__n

In [104]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'regressor__fit_intercept': True, 'regressor__normalize': False}
Best score: -174.66860444054365


In [105]:
results_df = pd.DataFrame(grid_search.cv_results_)
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__fit_intercept,param_regressor__normalize,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,mean_test_score,std_test_score,rank_test_score
0,5.980374,16.885625,0.188828,0.518847,True,True,"{'regressor__fit_intercept': True, 'regressor_...",-3.376984,-3367.488014,0.659963,0.535645,-0.26081,-484.599128,-136.113763,-18.103744,-1.572523,-445.59104,1043.856037,2
1,0.009834,0.000589,0.005608,0.000278,True,False,"{'regressor__fit_intercept': True, 'regressor_...",-2.264967,-1221.269177,0.672615,0.544861,-0.220539,-267.306759,-67.345816,-13.264519,-1.563139,-174.668604,379.046903,1
2,0.010579,0.001489,0.00749,0.004697,False,True,"{'regressor__fit_intercept': False, 'regressor...",-19.992186,-68.195615,0.633908,0.560842,-0.207685,-3608.201688,-914.121403,-139.18329,-1.869565,-527.841854,1124.060632,3
3,0.010585,0.001788,0.005871,0.000704,False,False,"{'regressor__fit_intercept': False, 'regressor...",-19.992186,-68.195615,0.633908,0.560842,-0.207685,-3608.201688,-914.121403,-139.18329,-1.869565,-527.841854,1124.060632,3


In [106]:
final_model = pipeline.set_params(**grid_search.best_params_)

In [107]:
# prepare data
train_variables = final_train_set.drop(features_to_drop, axis=1).values
train_target = final_train_set["Transacted Price ($)"].values

In [108]:
final_model.fit(train_variables,train_target)

In [109]:
X_test = final_test_set.drop(features_to_drop, axis=1).values
y_test = final_test_set["Transacted Price ($)"].values

In [110]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [111]:
# Assume X_test and y_test are your test features and target variable
# Make predictions
y_pred = pipeline.predict(X_test)
print(y_pred)
print(y_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

rmse = np.sqrt(mean_squared_error(y_test.get(), y_pred.get()))
mae = mean_absolute_error(y_test.get(), y_pred.get())
mape = mean_absolute_percentage_error(y_test.get(), y_pred.get())
r2 = r2_score(y_test.get(), y_pred.get())

# Print the performance metrics
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"MAPE: {mape}")
print(f"R^2 Score: {r2}")


[1649344.86408709 1906479.44756301  991121.18546833 ... 2475427.04288905
 2063187.42993116 2420241.02764117]
[1338630.03  1823554.9   1060213.68  ... 2190680.8   1994234.588
 2691080.43 ]
Root Mean Squared Error: 1012982.0526714184
Mean Absolute Error: 495141.7616994858
MAPE: 23.289058219314615
R^2 Score: 0.7833352090746664


In [112]:
from cuml.linear_model import Ridge

In [113]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("regressor", Ridge())
])

In [128]:
param_grid = {
    'regressor__alpha': [100, 10, 1.0, 0.1, 0.01],
    'regressor__fit_intercept': [True, False],
    'regressor__normalize': [True, False]
}

In [129]:
# going for a 80-20 training split
tscv = TimeSeriesSplit(n_splits=9, max_train_size=4)

In [130]:
grid_search = GridSearchCV(pipeline, param_grid, cv=tscv, verbose=2)

In [131]:
features_to_drop = ["Transacted Price ($)", "Date"]
features_to_keep = [feature for feature in encoded_cudf.columns if feature not in features_to_drop]

In [132]:
# prepare data
lr_variables = search_set.drop(features_to_drop, axis=1).values
lr_target = search_set["Transacted Price ($)"].values

In [None]:
# Fit grid search (make sure to pass all columns except 'year' if it's not a feature)
grid_search.fit(lr_variables, lr_target)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [134]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'regressor__alpha': 100, 'regressor__fit_intercept': True, 'regressor__normalize': False}
Best score: -0.8112051395817362


In [135]:
results_df = pd.DataFrame(grid_search.cv_results_)
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__alpha,param_regressor__fit_intercept,param_regressor__normalize,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011405,0.001413,0.005649,0.000829,100.0,True,True,"{'regressor__alpha': 100, 'regressor__fit_inte...",-0.694083,-179.583777,-0.319888,-0.141433,-0.173821,-44.581753,-10.487557,-0.696237,0.045379,-26.292574,55.903966,4
1,0.0112,0.001902,0.005076,0.000125,100.0,True,False,"{'regressor__alpha': 100, 'regressor__fit_inte...",-0.061458,-4.019951,-0.29289,-0.189152,-0.150611,-1.989421,-0.560852,-0.044678,0.008167,-0.811205,1.276278,1
2,0.010883,0.001408,0.004899,0.000133,100.0,False,True,"{'regressor__alpha': 100, 'regressor__fit_inte...",-0.737739,-29.50682,-0.500955,-0.2415,-0.651778,-50.009563,-20.916604,-2.664156,-0.617514,-11.760736,16.908368,2
3,0.011199,0.001676,0.004988,0.000138,100.0,False,False,"{'regressor__alpha': 100, 'regressor__fit_inte...",-0.737739,-29.50682,-0.500955,-0.2415,-0.651778,-50.009563,-20.916604,-2.664156,-0.617514,-11.760736,16.908368,2
4,0.011037,0.001504,0.004902,0.000102,10.0,True,True,"{'regressor__alpha': 10, 'regressor__fit_inter...",-4.369081,-1908.049753,-0.383831,-0.069111,-0.231055,-346.340978,-84.585052,-9.719309,0.146722,-261.511272,591.83153,11
5,0.01166,0.002236,0.005282,0.000312,10.0,True,False,"{'regressor__alpha': 10, 'regressor__fit_inter...",-1.291051,-182.671205,-0.337015,-0.120758,-0.173686,-59.327945,-12.901451,-1.735559,0.064709,-28.721551,57.38323,5
6,0.011716,0.001494,0.005363,0.000224,10.0,False,True,"{'regressor__alpha': 10, 'regressor__fit_inter...",-14.54834,-92.870947,-0.175582,0.169761,-0.318222,-974.997328,-336.771676,-63.932026,0.120575,-164.813754,304.13574,7
7,0.011316,0.00142,0.005189,8.2e-05,10.0,False,False,"{'regressor__alpha': 10, 'regressor__fit_inter...",-14.54834,-92.870947,-0.175582,0.169761,-0.318222,-974.997328,-336.771676,-63.932026,0.120575,-164.813754,304.13574,7
8,0.011603,0.001534,0.005245,0.000183,1.0,True,True,"{'regressor__alpha': 1.0, 'regressor__fit_inte...",-3.937781,-3157.427821,-0.313584,-0.023753,-0.256718,-472.914885,-129.0998,-16.844554,0.185411,-420.070387,978.765263,12
9,0.011976,0.001927,0.005579,0.000306,1.0,True,False,"{'regressor__alpha': 1.0, 'regressor__fit_inte...",-3.294565,-912.946407,-0.375477,-0.074015,-0.208915,-228.552827,-52.53593,-9.742644,0.150765,-134.175557,284.125525,6


In [136]:
final_model = pipeline.set_params(**grid_search.best_params_)

In [137]:
# prepare data
train_variables = final_train_set.drop(features_to_drop, axis=1).values
train_target = final_train_set["Transacted Price ($)"].values

In [138]:
final_model.fit(train_variables,train_target)

In [139]:
X_test = final_test_set.drop(features_to_drop, axis=1).values
y_test = final_test_set["Transacted Price ($)"].values

In [140]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [141]:
# Assume X_test and y_test are your test features and target variable
# Make predictions
y_pred = pipeline.predict(X_test)
print(y_pred)
print(y_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

rmse = np.sqrt(mean_squared_error(y_test.get(), y_pred.get()))
mae = mean_absolute_error(y_test.get(), y_pred.get())
mape = mean_absolute_percentage_error(y_test.get(), y_pred.get())
r2 = r2_score(y_test.get(), y_pred.get())

# Print the performance metrics
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"MAPE: {mape}")
print(f"R^2 Score: {r2}")


[1730214.96131903 2014978.96115544 1202949.20010329 ... 2204466.66596672
 2277449.50689247 2361805.1722722 ]
[1338630.03  1823554.9   1060213.68  ... 2190680.8   1994234.588
 2691080.43 ]
Root Mean Squared Error: 1412734.7279772304
Mean Absolute Error: 688636.9277099101
MAPE: 34.36257299132565
R^2 Score: 0.5785887914518739


In [142]:
from cuml.linear_model import Lasso

In [147]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("regressor", Lasso())
])

In [148]:
param_grid = {
    'regressor__alpha': [100, 10, 1.0, 0.1, 0.01],
    'regressor__fit_intercept': [True, False],
    'regressor__normalize': [True, False]
}

In [149]:
# going for a 80-20 training split
tscv = TimeSeriesSplit(n_splits=9, max_train_size=4)

In [150]:
grid_search = GridSearchCV(pipeline, param_grid, cv=tscv, verbose=2)

In [151]:
features_to_drop = ["Transacted Price ($)", "Date"]
features_to_keep = [feature for feature in encoded_cudf.columns if feature not in features_to_drop]

In [152]:
# prepare data
lr_variables = search_set.drop(features_to_drop, axis=1).values
lr_target = search_set["Transacted Price ($)"].values

In [153]:
# Fit grid search (make sure to pass all columns except 'year' if it's not a feature)
grid_search.fit(lr_variables, lr_target)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 9 folds for each of 20 candidates, totalling 180 fits
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__normalize=True; total time=   0.1s
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__normalize=True; total time=   0.2s
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__normalize=True; total time=   0.1s
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__normalize=True; total time=   0.0s
[CV] END regressor__alpha=100, regressor__fit_intercept=True, regressor__n

In [154]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'regressor__alpha': 100, 'regressor__fit_intercept': True, 'regressor__normalize': False}
Best score: -59.727028636084434


In [155]:
results_df = pd.DataFrame(grid_search.cv_results_)
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__alpha,param_regressor__fit_intercept,param_regressor__normalize,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,mean_test_score,std_test_score,rank_test_score
0,0.090783,0.112928,0.006929,0.001587,100.0,True,True,"{'regressor__alpha': 100, 'regressor__fit_inte...",-302.730263,0.606147,-4.300343,0.230136,-1.196224,0.681023,0.034657,-1098.761944,-0.048984,-156.165088,346.322974,2
1,0.212725,0.247868,0.007489,0.001552,100.0,True,False,"{'regressor__alpha': 100, 'regressor__fit_inte...",-300.825214,0.611021,-4.492368,0.220998,-0.113834,0.727306,0.340837,-233.965361,-0.046642,-59.727029,112.126418,1
2,0.031434,0.010421,0.005673,0.000417,100.0,False,True,"{'regressor__alpha': 100, 'regressor__fit_inte...",-70.827485,-0.961922,-0.207065,0.104518,-0.259055,-2711.628683,-327.07275,-274.488233,0.35383,-376.10965,834.475349,11
3,0.044413,0.06586,0.005539,0.000721,100.0,False,False,"{'regressor__alpha': 100, 'regressor__fit_inte...",-1308.860545,-1.865527,-53.728269,-0.274286,0.333956,-1.38658,-42.058571,-6994.171415,-0.126512,-933.570861,2180.525421,16
4,0.084273,0.143276,0.006229,0.001337,10.0,True,True,"{'regressor__alpha': 10, 'regressor__fit_inter...",-304.089348,0.40338,-4.715281,0.232898,-1.206235,0.582699,-1.558265,-1101.392516,-0.051821,-156.866054,347.067053,4
5,0.074813,0.121391,0.005993,0.000762,10.0,True,False,"{'regressor__alpha': 10, 'regressor__fit_inter...",-303.90934,0.463037,-4.713655,0.23247,-1.205528,0.596577,-1.358065,-1101.227007,-0.074343,-156.799539,347.021148,3
6,0.031547,0.009843,0.005557,0.000222,10.0,False,True,"{'regressor__alpha': 10, 'regressor__fit_inter...",-70.801485,-1.032689,-0.207447,0.10446,-0.258938,-2712.202277,-327.246629,-274.610032,0.353866,-376.211241,834.648449,12
7,0.044836,0.06747,0.005483,0.000637,10.0,False,False,"{'regressor__alpha': 10, 'regressor__fit_inter...",-1311.92635,-3.273245,-60.627944,-0.290369,0.333003,-2.292112,-42.382986,-6999.601965,-0.127473,-935.576605,2181.827328,17
8,0.073471,0.120354,0.005639,0.000988,1.0,True,True,"{'regressor__alpha': 1.0, 'regressor__fit_inte...",-304.225774,0.362391,-4.716561,0.233516,-1.206616,0.574832,-1.685504,-1101.495973,-0.020712,-156.908933,347.097495,6
9,0.077856,0.126896,0.00572,0.001023,1.0,True,False,"{'regressor__alpha': 1.0, 'regressor__fit_inte...",-304.207349,0.369116,-4.715929,0.233475,-1.206545,0.576287,-1.664323,-1101.479421,-0.056187,-156.905653,347.091336,5


In [156]:
final_model = pipeline.set_params(**grid_search.best_params_)

In [157]:
# prepare data
train_variables = final_train_set.drop(features_to_drop, axis=1).values
train_target = final_train_set["Transacted Price ($)"].values

In [158]:
final_model.fit(train_variables,train_target)

In [159]:
X_test = final_test_set.drop(features_to_drop, axis=1).values
y_test = final_test_set["Transacted Price ($)"].values

In [160]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [161]:
# Assume X_test and y_test are your test features and target variable
# Make predictions
y_pred = pipeline.predict(X_test)
print(y_pred)
print(y_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

rmse = np.sqrt(mean_squared_error(y_test.get(), y_pred.get()))
mae = mean_absolute_error(y_test.get(), y_pred.get())
mape = mean_absolute_percentage_error(y_test.get(), y_pred.get())
r2 = r2_score(y_test.get(), y_pred.get())

# Print the performance metrics
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"MAPE: {mape}")
print(f"R^2 Score: {r2}")


[1624828.74535507 1978224.87162138  985491.24709066 ... 2442929.7075525
 2246774.08005425 2600668.08192821]
[1338630.03  1823554.9   1060213.68  ... 2190680.8   1994234.588
 2691080.43 ]
Root Mean Squared Error: 1024562.206042669
Mean Absolute Error: 515211.5885262565
MAPE: 24.737137966073284
R^2 Score: 0.778353180630318


In [None]:
df = pd.DataFrame.from_dict(grid_search.cv_results_)
df