In [1]:
import numpy as np
import pandas as pd

# Load data

In [2]:
# Load the preprocessed Singapore rental listings from disk into a dataframe
df = pd.read_csv(filepath_or_buffer="data/rental_prices_singapore_preprocessed.csv")

In [3]:
# Show dataframe info: row count, column names, non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1576 entries, 0 to 1575
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             1576 non-null   int64  
 1   size              1576 non-null   int64  
 2   bedrooms          1576 non-null   object 
 3   bathrooms         1576 non-null   int64  
 4   latitude          1576 non-null   float64
 5   longitude         1576 non-null   float64
 6   meters_to_school  1576 non-null   int64  
 7   property_type     1576 non-null   object 
 8   furnishing        1576 non-null   object 
 9   year              1576 non-null   int64  
 10  meters_to_mrt     1576 non-null   int64  
 11  high_floor        1576 non-null   bool   
 12  new               1576 non-null   bool   
 13  renovated         1576 non-null   bool   
 14  view              1576 non-null   bool   
 15  penthouse         1576 non-null   bool   
dtypes: bool(5), float64(2), int64(6), object(3)

In [4]:
# Preview the first five rows of the dataframe
df.head(n=5)

Unnamed: 0,price,size,bedrooms,bathrooms,latitude,longitude,meters_to_school,property_type,furnishing,year,meters_to_mrt,high_floor,new,renovated,view,penthouse
0,3000,400,1,1,1.312952,103.887868,422,Apartment,Fully Furnished,2012,450,False,False,False,False,False
1,2000,1130,Room,1,1.32882,103.912904,3573,Apartment,Fully Furnished,2012,810,False,False,False,False,False
2,7400,3800,5,4,1.389444,103.857002,568,Apartment,Fully Furnished,2012,450,False,False,False,False,False
3,1000,120,Room,1,1.429261,103.828917,1090,Apartment,Fully Furnished,2012,700,False,False,False,False,False
4,4300,689,1,1,1.297356,103.836707,1262,Apartment,Fully Furnished,2012,420,False,False,False,False,False


# Train-validation-test split

In [6]:
from sklearn.model_selection import train_test_split

In [5]:
# Separate the target column (price) from the predictor columns
y = df["price"]
X = df.drop(columns="price")

In [34]:
# First cut: keep 70% of the rows for training and hold out 30% as a temporary pool
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Second cut: divide the temporary pool evenly into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

Note: This yields a 70% training, 15% validation, and 15% test split.

# Feature scaling

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Create the scaler used for z-score normalization (centering and unit-variance
# scaling are both enabled, which are the defaults)
scaler = StandardScaler(with_mean=True, with_std=True)

In [10]:
# Names of the numeric feature columns that will be standardized
numerical_columns = [
    "size",
    "bathrooms",
    "latitude",
    "longitude",
    "meters_to_mrt",
    "meters_to_school",
    "year",
]

In [11]:
# Fit the scaler on the training split only, then apply the same z-score
# transformation to the training and test splits
# NOTE(review): X_val is not transformed in this cell — confirm the validation
# features are scaled before they are used
scaler.fit(X_train[numerical_columns])
X_train_scaled = scaler.transform(X_train[numerical_columns])
X_test_scaled = scaler.transform(X_test[numerical_columns])

In [12]:
# Wrap the scaled numpy arrays in dataframes so the feature names are kept
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=numerical_columns)
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=numerical_columns)

In [13]:
# Give the scaled dataframes the same row index as the original splits so that
# they line up row-by-row when combined with other columns later
X_train_scaled = X_train_scaled.set_axis(X_train.index, axis=0)
X_test_scaled = X_test_scaled.set_axis(X_test.index, axis=0)

# Feature encoding

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# Names of the categorical feature columns that will be one-hot encoded
categorical_columns = [
    "bedrooms",
    "property_type",
    "furnishing",
]

In [16]:
# Initialize a OneHotEncoder that keeps every category (drop=None) and returns
# a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# all zeros instead of raising when held-out data is transformed.
try:
    encoder = OneHotEncoder(drop=None, sparse_output=False, handle_unknown="ignore")
except TypeError:
    encoder = OneHotEncoder(drop=None, sparse=False, handle_unknown="ignore")

In [17]:
# Learn the categories from the training split only, then one-hot encode the
# training and test splits with the fitted encoder
# NOTE(review): X_val is not encoded in this cell — confirm the validation
# features are encoded before they are used
encoder.fit(X_train[categorical_columns])
X_train_encoded = encoder.transform(X_train[categorical_columns])
X_test_encoded = encoder.transform(X_test[categorical_columns])

In [18]:
# Ask the fitted encoder for the names of the columns it generated
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Shorter snake_case replacements for the generated column names; a generated
# name without an entry here is kept unchanged
column_name_mapping = {
    "bedrooms_Room": "bedrooms_room",
    "bedrooms_Studio": "bedrooms_studio",
    "property_type_Apartment": "type_apartment",
    "property_type_Cluster House": "type_cluster_house",
    "property_type_Condominium": "type_condominium",
    "property_type_Corner Terrace": "type_corner_terrace",
    "property_type_Detached House": "type_detached_house",
    "property_type_Good Class Bungalow": "type_good_class_bungalow",
    "property_type_HDB Flat": "type_hdb_flat",
    "property_type_Semi-Detached House": "type_semi_detached_house",
    "property_type_Terraced House": "type_terraced_house",
    "furnishing_Fully Furnished": "furnishing_full",
    "furnishing_Partially Furnished": "furnishing_partial",
    "furnishing_Unfurnished": "furnishing_none",
}

# Apply the mapping name by name, falling back to the original name
encoded_feature_names = np.array(
    [column_name_mapping.get(name, name) for name in encoded_feature_names]
)

In [19]:
# Wrap the encoded numpy arrays in dataframes labelled with the renamed features
X_train_encoded = pd.DataFrame(data=X_train_encoded, columns=encoded_feature_names)
X_test_encoded = pd.DataFrame(data=X_test_encoded, columns=encoded_feature_names)

In [20]:
# Give the encoded dataframes the same row index as the original splits so that
# they line up row-by-row when combined with other columns later
X_train_encoded = X_train_encoded.set_axis(X_train.index, axis=0)
X_test_encoded = X_test_encoded.set_axis(X_test.index, axis=0)

# Create final data

In [21]:
# Names of the boolean flag columns, which are passed through unchanged
boolean_columns = [
    "high_floor",
    "new",
    "renovated",
    "view",
    "penthouse",
]

In [22]:
# Assemble the final feature matrices column-wise:
# scaled numeric features + one-hot encoded features + boolean flags
X_train_final = pd.concat(
    [X_train_scaled, X_train_encoded, X_train[boolean_columns]],
    axis="columns",
)
X_test_final = pd.concat(
    [X_test_scaled, X_test_encoded, X_test[boolean_columns]],
    axis="columns",
)

# Make sure the flag columns carry the bool dtype after concatenation
X_train_final = X_train_final.astype({column: bool for column in boolean_columns})
X_test_final = X_test_final.astype({column: bool for column in boolean_columns})

# Model training: Default parameters

## Linear regression

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
# Linear regression estimator with default parameters; only instantiated here, not fitted
reg = LinearRegression()

## Support vector machine

In [26]:
from sklearn.svm import SVR

In [27]:
# Support vector regressor with default parameters; only instantiated here, not fitted
svm = SVR()

## Random forest

In [28]:
from sklearn.ensemble import RandomForestRegressor

In [31]:
# Random forest regressor with default hyperparameters; only instantiated here.
# The seed is fixed because the forest is built with randomness (bootstrap
# sampling), so results would otherwise change between runs. 42 matches the
# random_state used for the train/validation/test split.
rf = RandomForestRegressor(random_state=42)

## XGBoost

## Neural network

In [32]:
from sklearn.neural_network import MLPRegressor

In [33]:
# Multi-layer perceptron regressor with default hyperparameters; only
# instantiated here. The seed is fixed because training involves randomness
# (e.g. weight initialization), so results would otherwise change between
# runs. 42 matches the random_state used for the train/validation/test split.
mlp = MLPRegressor(random_state=42)