Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

Read the file

In [2]:
# Location of the training CSV, fetched over HTTPS from the project's GitHub repo.
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/HoverHander/CS4661-Final-Project/main/Datasets/train.csv'

# Load the raw training data into a DataFrame.
df = pd.read_csv(TRAIN_CSV_URL)

Fill missing values, drop the Id column, and replace categorical features with dummy columns

In [3]:
# Columns whose missing values (NaN) are replaced by 0.
fill_columns = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea']
df[fill_columns] = df[fill_columns].fillna(0)

# Drop Id since it has no effect on the price.
# errors='ignore' keeps this cell re-runnable: on a second run 'Id' is already
# gone, and a plain drop would raise KeyError.
df = df.drop(columns='Id', errors='ignore')

# Treat every object-dtype column as categorical and one-hot encode it.
categorical_features = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_features)

# Preview every 10th row of the encoded frame.
df_encoded[0::10]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
10,20,70.0,11200,5,5,1965,1965,0.0,906,0,...,0,0,0,1,0,0,0,0,1,0
20,60,101.0,14215,8,5,2005,2006,380.0,0,0,...,0,1,0,0,0,0,0,0,0,1
30,70,50.0,8500,4,4,1920,1950,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
40,20,84.0,8658,6,5,1965,1965,101.0,643,0,...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1410,60,79.0,12420,7,5,2001,2001,0.0,666,0,...,0,0,0,1,0,0,0,0,1,0
1420,60,90.0,11700,6,6,1968,1968,420.0,404,0,...,0,0,0,1,0,0,0,0,1,0
1430,60,60.0,21930,5,5,2005,2005,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1440,70,79.0,11526,6,7,1922,1994,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0


Set up feature matrix and label vector

In [4]:
# Label vector: the sale price column.
y = df_encoded['SalePrice']
# Feature matrix: every remaining column.
X = df_encoded.drop(columns=['SalePrice'])

# Report the dimensions of both so the split below can be sanity-checked.
print('X shape: ', X.shape)
print('y shape: ', y.shape)

X shape:  (1460, 288)
y shape:  (1460,)


Split the dataset

In [5]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Print the sizes of the training set (features, labels), then of the
# testing set (features, labels).
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

(1168, 288)
(1168,)
(292, 288)
(292,)


Random Forest Regression

In [6]:
# Random forest regressor used for all experiments below.
# A fixed random_state makes the forest's randomness repeatable, so the RMSE
# and cross-validation scores are reproducible across "Restart & Run All".
# The seed matches the one used for the train/test split.
house_rfr = RandomForestRegressor(random_state=2)


Training and predicting

In [7]:
# Fit the forest on the training split, then predict sale prices for the
# held-out test rows (fit() returns the fitted estimator, so the calls chain).
y_prediction = house_rfr.fit(X_train, y_train).predict(X_test)
print(y_prediction)

[264387.27 179714.3  307557.11 221842.68 211292.47 215079.6  214773.87
 193830.4  150607.11 354461.72 199250.   188616.65  98228.87 139092.62
 108746.94 238706.43 264701.35 296152.03 194245.55 292976.26 336762.23
 169195.52 174708.86 234311.1  354798.26 228471.14 160817.67 270001.94
 212198.33 407103.23 309712.52 170052.9  273711.84 134034.   186984.12
 218565.95 123951.25 310337.54 320820.03 368404.28 160920.5  185953.58
 148199.5   96366.   112124.   231552.03  94860.   162918.53  95043.
 142618.59 147142.5  137896.25 241518.66 172138.75 126457.75 295169.1
 124965.93 278526.22 389513.23 126024.   165499.48 137102.5  302466.83
 146037.25 357470.04 152733.59 223133.16 118902.5  193399.2  126925.69
 216223.02 109194.91 107277.19 140216.25 148737.25 196649.65 168590.
 287621.7  157123.7  240304.92 300342.68 188592.16 190271.26 217697.49
 152836.38 190804.12 172949.5  234056.76 210901.4  153600.5  162610.64
 219237.71 197030.46 179072.82 175703.95 176929.37 109012.04 157753.5
 141221.45 1

RMSE

In [8]:
# `metrics` is already imported in the notebook's import cell, so it is not
# re-imported here.

# Mean squared error between the true and predicted test-set prices.
mse = metrics.mean_squared_error(y_test, y_prediction)
# Root of the MSE, i.e. the error in the same units as SalePrice.
rmse = np.sqrt(mse)

print(rmse)

26850.164525560034


Evaluating with 10-fold cross-validation

In [9]:
# Score the regressor with 10-fold cross-validation on the full dataset.
# scikit-learn reports MSE negated ('neg_mean_squared_error') so that larger
# is always better; the values printed here are therefore negative.
mse_list = cross_val_score(
    house_rfr,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)
print(mse_list)

[-6.33607446e+08 -7.10782164e+08 -4.86182802e+08 -1.58036858e+09
 -1.21048571e+09 -6.74919227e+08 -5.86313809e+08 -5.80023923e+08
 -1.79157900e+09 -7.27891459e+08]


Make the results positive, calculate the square root and mean

In [10]:
# Flip the sign of the negated MSE scores to get ordinary (positive) MSEs.
mse_list_pos = np.negative(mse_list)
# Per-fold RMSE.
rmse_list = np.sqrt(mse_list_pos)
# Average RMSE across the folds.
print(np.mean(rmse_list))

29201.081966345537
