# 0.0 Data Prep

## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import time
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# import torch
# import torch.nn as nn
# import torch.utils.data as data


import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import plotly.express as px
import plotly.graph_objects as go

## Load the Data

In [5]:
df = pd.read_csv("coursework_fintech.csv")
print(df.shape)
df.head()

(11782, 18)


Unnamed: 0,SalePrice,YearBuilt,YrSold,MonthSold,Size(sqf),Floor,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),N_manager,N_elevators,SubwayStation,N_FacilitiesNearBy(PublicOffice),N_FacilitiesNearBy(Hospital),N_FacilitiesNearBy(Dpartmentstore),N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park)
0,,,,,,,,,,,,,,,,,,
1,141592.0,2006.0,2007.0,8.0,814.0,3.0,management_in_trust,111.0,184.0,3.0,0.0,Kyungbuk_uni_hospital,2.0,1.0,1.0,1.0,1.0,0.0
2,,,,,,,,,,,,,,,,,,
3,51327.0,1985.0,2007.0,8.0,587.0,8.0,self_management,80.0,76.0,2.0,2.0,Daegu,5.0,1.0,2.0,1.0,2.0,1.0
4,,,,,,,,,,,,,,,,,,


In [12]:
# Removing blank rows from the df
df = df.dropna()

# Sorting the df by the year of sale
df = df.sort_values("YrSold", ascending=True)

df.head()

Unnamed: 0,SalePrice,YearBuilt,YrSold,MonthSold,Size(sqf),Floor,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),N_manager,N_elevators,SubwayStation,N_FacilitiesNearBy(PublicOffice),N_FacilitiesNearBy(Hospital),N_FacilitiesNearBy(Dpartmentstore),N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park)
1,141592.0,2006.0,2007.0,8.0,814.0,3.0,management_in_trust,111.0,184.0,3.0,0.0,Kyungbuk_uni_hospital,2.0,1.0,1.0,1.0,1.0,0.0
227,185840.0,1986.0,2007.0,12.0,1131.0,5.0,management_in_trust,713.0,0.0,8.0,27.0,Kyungbuk_uni_hospital,5.0,1.0,1.0,0.0,1.0,1.0
213,265486.0,2007.0,2007.0,12.0,1394.0,24.0,management_in_trust,554.0,524.0,5.0,10.0,Banwoldang,2.0,2.0,2.0,2.0,0.0,1.0
171,36283.0,1992.0,2007.0,11.0,355.0,4.0,management_in_trust,200.0,0.0,5.0,10.0,Myung-duk,7.0,1.0,1.0,1.0,5.0,1.0
169,194690.0,1993.0,2007.0,11.0,2337.0,18.0,management_in_trust,523.0,536.0,8.0,20.0,Myung-duk,6.0,2.0,0.0,1.0,5.0,0.0


In [13]:
# Defining the target
target = df["SalePrice"]

# Defining the features
features = df.drop("SalePrice", axis=1)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

In [14]:
target.head()

1      141592.0
227    185840.0
213    265486.0
171     36283.0
169    194690.0
Name: SalePrice, dtype: float64

In [15]:
features.head()

Unnamed: 0,YearBuilt,YrSold,MonthSold,Size(sqf),Floor,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),N_manager,N_elevators,SubwayStation,N_FacilitiesNearBy(PublicOffice),N_FacilitiesNearBy(Hospital),N_FacilitiesNearBy(Dpartmentstore),N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park)
1,2006.0,2007.0,8.0,814.0,3.0,management_in_trust,111.0,184.0,3.0,0.0,Kyungbuk_uni_hospital,2.0,1.0,1.0,1.0,1.0,0.0
227,1986.0,2007.0,12.0,1131.0,5.0,management_in_trust,713.0,0.0,8.0,27.0,Kyungbuk_uni_hospital,5.0,1.0,1.0,0.0,1.0,1.0
213,2007.0,2007.0,12.0,1394.0,24.0,management_in_trust,554.0,524.0,5.0,10.0,Banwoldang,2.0,2.0,2.0,2.0,0.0,1.0
171,1992.0,2007.0,11.0,355.0,4.0,management_in_trust,200.0,0.0,5.0,10.0,Myung-duk,7.0,1.0,1.0,1.0,5.0,1.0
169,1993.0,2007.0,11.0,2337.0,18.0,management_in_trust,523.0,536.0,8.0,20.0,Myung-duk,6.0,2.0,0.0,1.0,5.0,0.0


# 1.0 Dummy Regressor

In [16]:
from sklearn.dummy import DummyRegressor

# Create a dummy regressor object
dummy_regr = DummyRegressor(strategy="mean")

# Fit the dummy regressor to the training data
dummy_regr.fit(x_train, y_train)

# Make predictions on the test data
y_pred = dummy_regr.predict(x_test)

# Evaluate the model using mean squared error and root mean squared error
dummy_mse = mean_squared_error(y_test, y_pred)
dummy_rmse = np.sqrt(dummy_mse)

print("Root Mean Squared Error: {:.2f}".format(dummy_rmse))
print("Mean squared error: ", dummy_mse)


Root Mean Squared Error: 107302.22
Mean squared error:  11513766836.494812


# 2.0 Decision Tree Regression

# 3.0 Long Short Term Memory