In [None]:
Kevin Cheng
Yuting Huang

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the Excel file using Pandas.
alldata = pd.read_excel('Hemnet_data.xlsx')

# # Convert the timestamp string to an integer representing the year.
alldata['year'] = pd.DatetimeIndex(alldata['Sold Date']).year

# Convert 'yes' to 1 and 'no' to 0
alldata['Balcony'] = alldata['Balcony'].map({'Yes': 1, 'No': 0})
alldata['Patio'] = alldata['Patio'].map({'Yes': 1, 'No': 0})
alldata['Lift'] = alldata['Lift'].map({'Yes': 1, 'No': 0})

# Select the 12 input columns and the output column.
selected_columns = ['Final Price (kr)', 'year',  'Num of Room', 'Living Area (m²)', 'Balcony', 'Patio','Current Floor', 'Total Floor', 'Lift', 'Built Year', 'Fee (kr/month)', 'Operating Fee (kr/year)']
alldata = alldata[selected_columns]
alldata = alldata.dropna()

# Shuffle.
alldata_shuffled = alldata.sample(frac=1.0, random_state=0)
# Separate the input and output columns.
X = alldata_shuffled.drop('Final Price (kr)', axis=1)
X["Fee (kr/month)"] = X["Fee (kr/month)"].astype(str).str.contains('kr')
# For the output, we'll use the log of the sales price.
Y = alldata_shuffled['Final Price (kr)'].apply(np.  log)

# Split into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)

In [34]:
X

Unnamed: 0,year,Num of Room,Living Area (m²),Balcony,Patio,Current Floor,Total Floor,Lift,Built Year,Fee (kr/month),Operating Fee (kr/year)
11321,2024.0,3.0,72.0,1,0,4.0,4.0,1,2016.0,False,4800.0
2336,2023.0,2.0,77.0,1,0,4.0,4.0,1,1983.0,False,3600.0
19825,2024.0,2.0,52.2,0,0,1.0,5.0,1,1998.0,False,6655.0
13446,2024.0,1.0,33.0,0,0,9.0,10.0,1,2019.0,False,4860.0
4609,2023.0,3.0,72.0,0,0,1.0,3.0,0,1957.0,False,7200.0
...,...,...,...,...,...,...,...,...,...,...,...
14024,2023.0,4.0,96.0,1,0,3.0,4.0,0,1961.0,False,2775.0
8499,2024.0,2.0,48.0,1,0,2.0,6.0,1,2012.0,False,5900.0
26609,2022.0,2.0,51.5,0,0,2.0,2.0,0,1989.0,False,3732.0
29190,2024.0,2.0,65.0,1,0,1.5,6.0,1,1945.0,False,5520.0


In [21]:
Y

11321    14.316286
2336     13.721200
19825    14.914123
13446    14.417091
4609     14.253765
           ...    
14024    14.808762
8499     14.557448
26609    12.765688
29190    14.669926
6956     14.331324
Name: Final Price (kr), Length: 11064, dtype: float64

In [25]:
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_validate
m1 = LinearRegression()
cross_validate(m1, Xtrain, Ytrain, scoring='neg_mean_squared_error')

{'fit_time': array([0.00521302, 0.00632787, 0.00269175, 0.00280094, 0.00405598]),
 'score_time': array([0.00298095, 0.00359392, 0.00093532, 0.00185013, 0.00207901]),
 'test_score': array([-0.22838203, -0.22975599, -0.20002046, -0.22530708, -0.24852397])}

In [26]:
from sklearn.metrics import mean_squared_error
  
m1.fit(Xtrain, Ytrain)
mean_squared_error(Ytest, m1.predict(Xtest))


0.21562494511368396

In [32]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# create the models
m2 = DecisionTreeRegressor()
m3 = RandomForestRegressor()

# train models
m2.fit(Xtrain, Ytrain)
m3.fit(Xtrain, Ytrain)  

# output cross validation   
print(cross_validate(m2, Xtrain, Ytrain, scoring='neg_mean_squared_error'))
print(cross_validate(m3, Xtrain, Ytrain, scoring='neg_mean_squared_error'))

# get mse
print(mean_squared_error(Ytest, m2.predict(Xtest)))
print(mean_squared_error(Ytest, m3.predict(Xtest)))


{'fit_time': array([0.02278399, 0.02256298, 0.02280807, 0.02232289, 0.02257919]), 'score_time': array([0.0010643 , 0.0011251 , 0.00100899, 0.00088906, 0.00091887]), 'test_score': array([-0.2894317 , -0.31712543, -0.27106418, -0.29047338, -0.28125532])}
{'fit_time': array([1.44579005, 1.40766597, 1.40849805, 1.4069519 , 1.43436289]), 'score_time': array([0.03231692, 0.02969432, 0.02927899, 0.02990603, 0.02979612]), 'test_score': array([-0.15542245, -0.16771856, -0.14297892, -0.15616003, -0.16362399])}
0.26498035242300066
0.14613357905535235


In [33]:
from sklearn.ensemble import GradientBoostingRegressor

m4 = GradientBoostingRegressor(random_state=0)
m4.fit(Xtrain, Ytrain)
print(cross_validate(m4, Xtrain, Ytrain, scoring='neg_mean_squared_error'))
print(mean_squared_error(Ytest, m4.predict(Xtest)))

{'fit_time': array([0.42157602, 0.41513705, 0.41573977, 0.41513801, 0.41581798]), 'score_time': array([0.00296712, 0.00264192, 0.00279021, 0.00259614, 0.0036931 ]), 'test_score': array([-0.15979796, -0.16709733, -0.14164804, -0.16352724, -0.17566993])}
0.14968303166518607


In [31]:
from sklearn.neural_network import MLPRegressor
m5 = MLPRegressor(hidden_layer_sizes=(100,), random_state=1, max_iter=2000, tol=0.1)
m5.fit(Xtrain, Ytrain)
print(cross_validate(m5, Xtrain, Ytrain, scoring='neg_mean_squared_error'))
print(mean_squared_error(Ytest, m5.predict(Xtest)))

{'fit_time': array([0.83442283, 0.66174889, 1.00841713, 1.11366796, 0.8749671 ]), 'score_time': array([0.00379801, 0.00370622, 0.00414419, 0.00326514, 0.00443578]), 'test_score': array([-5.75576109, -1.94364892, -1.86139575, -1.58452043, -2.8645643 ])}
3.089509250610125


LinearRegression
{'fit_time': array([0.00521302, 0.00632787, 0.00269175, 0.00280094, 0.00405598]),
 'score_time': array([0.00298095, 0.00359392, 0.00093532, 0.00185013, 0.00207901]),
 'test_score': array([-0.22838203, -0.22975599, -0.20002046, -0.22530708, -0.24852397])}

DecisionTreeRegresson
{'fit_time': array([0.02278399, 0.02256298, 0.02280807, 0.02232289, 0.02257919]), 'score_time': array([0.0010643 , 0.0011251 , 0.00100899, 0.00088906, 0.00091887]), 'test_score': array([-0.2894317 , -0.31712543, -0.27106418, -0.29047338, -0.28125532])}

RandomForestRegression

{'fit_time': array([1.44579005, 1.40766597, 1.40849805, 1.4069519 , 1.43436289]), 'score_time': array([0.03231692, 0.02969432, 0.02927899, 0.02990603, 0.02979612]), 'test_score': array([-0.15542245, -0.16771856, -0.14297892, -0.15616003, -0.16362399])}

GradientBoostingRegression

{'fit_time': array([0.42157602, 0.41513705, 0.41573977, 0.41513801, 0.41581798]), 'score_time': array([0.00296712, 0.00264192, 0.00279021, 0.00259614, 0.0036931 ]), 'test_score': array([-0.15979796, -0.16709733, -0.14164804, -0.16352724, -0.17566993])}

MLPRegression

{'fit_time': array([0.83442283, 0.66174889, 1.00841713, 1.11366796, 0.8749671 ]), 'score_time': array([0.00379801, 0.00370622, 0.00414419, 0.00326514, 0.00443578]), 'test_score': array([-5.75576109, -1.94364892, -1.86139575, -1.58452043, -2.8645643 ])}

