In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Load the training and test tables from their CSV files
# (paths are relative to the notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("Transformed Train.csv", "Transformed Test.csv")
)

In [3]:
# Bare last expression: the notebook renders the training frame as a table.
train_df

Unnamed: 0,YearBuilt,TotalBsmtSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,...,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,Log Price
0,2003,856,1710,548,0,61,0,0,0,0,...,0,0,0,0,1,0,0,1,0,12.247694
1,1976,1262,1262,460,298,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,12.109011
2,2001,920,1786,608,0,42,0,0,0,0,...,0,0,0,0,1,0,0,1,0,12.317167
3,1915,756,1717,642,0,35,0,0,0,0,...,0,0,0,0,1,0,0,0,1,11.849398
4,2000,1145,2198,836,192,84,0,0,0,0,...,0,0,0,0,1,0,0,1,0,12.429216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1999,953,1647,460,0,40,0,0,0,0,...,0,0,0,0,1,0,0,1,0,12.072541
1456,1978,1542,2073,500,349,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,12.254863
1457,1941,1152,2340,252,0,60,0,0,0,0,...,0,0,0,0,1,0,0,0,1,12.493130
1458,1950,1078,1078,240,366,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,11.864462


In [4]:
# Positional split of the training frame: every column except the last is a
# feature, and the last column is the regression target.
feature_frame = train_df.iloc[:, :-1]
target_column = train_df.iloc[:, -1]

# Downstream cells work with plain NumPy arrays rather than pandas objects.
X = feature_frame.values
y = target_column.values
print(X)

[[2003  856 1710 ...    0    1    0]
 [1976 1262 1262 ...    0    1    0]
 [2001  920 1786 ...    0    1    0]
 ...
 [1941 1152 2340 ...    0    0    1]
 [1950 1078 1078 ...    0    0    1]
 [1965 1256 1256 ...    0    0    1]]


In [5]:
# Print the target array (the values taken from the last column of train_df).
print(y)

[12.24769432 12.10901093 12.31716669 ... 12.49312952 11.86446223
 11.90158345]


In [6]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Candidate regressors that are compared on the hold-out split.
#
# NOTE: despite its name, `XGB` is scikit-learn's HistGradientBoostingRegressor,
# not a model from the xgboost package. The name is kept because the other
# cells refer to it.
XGB = HistGradientBoostingRegressor(
    loss='absolute_error',
    learning_rate=0.15,
    min_samples_leaf=30,
    max_leaf_nodes=31,
    random_state=0,
)
Rf = RandomForestRegressor(n_estimators=150, criterion="absolute_error", random_state=0)
# Seed AdaBoost like the other two models; without a seed its bootstrap
# resampling differs on every run and the reported metrics are not reproducible.
Ada = AdaBoostRegressor(random_state=0)

In [8]:
# Fit the gradient-boosting model on the training split; `fit` returns the
# estimator, so its repr is shown as the cell output.
XGB.fit(X_train, y_train)

HistGradientBoostingRegressor(learning_rate=0.15, loss='absolute_error',
                              min_samples_leaf=30, random_state=0)

In [9]:
# Score the fitted model on the hold-out split. For scikit-learn regressors
# `score` is the R^2 coefficient, computed here against y_test as stored
# (i.e. on the same scale as the target column, not on exponentiated values).
XGB.score(X_test, y_test)

0.8691668888791202

In [10]:
# Evaluate the gradient-boosting model on the hold-out split.
# RMSE is computed as sqrt(MSE) instead of passing `squared = False`, because
# that flag is deprecated (and later removed) in newer scikit-learn releases;
# the resulting numbers are the same.
y_predicted = XGB.predict(X_test)
print('RMSE between log prices: ', np.sqrt(mean_squared_error(y_test, y_predicted)))
# np.exp undoes the log transform so the error is expressed in price units.
print('RMSE between actual prices: ', np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_predicted))))
print('R2 score between actual price: ', r2_score(np.exp(y_test), np.exp(y_predicted)))

RMSE between log prices:  0.15051572711316813
RMSE between actual prices:  30352.25631585622
R2 score between actual price:  0.8684912461990819


In [11]:
# Fit the random forest on the training split and evaluate it on the hold-out split.
# RMSE is computed as sqrt(MSE) instead of passing `squared = False`, because
# that flag is deprecated (and later removed) in newer scikit-learn releases;
# the resulting numbers are the same.
Rf.fit(X_train, y_train)
y_predicted = Rf.predict(X_test)
print('RMSE between log prices: ', np.sqrt(mean_squared_error(y_test, y_predicted)))
# np.exp undoes the log transform so the error is expressed in price units.
print('RMSE between actual prices: ', np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_predicted))))
print('R2 score between actual price: ', r2_score(np.exp(y_test), np.exp(y_predicted)))

RMSE between log prices:  0.15283559745456338
RMSE between actual prices:  29835.276007818356
R2 score between actual price:  0.8729329872797758


In [12]:
# Fit AdaBoost on the training split and evaluate it on the hold-out split.
# RMSE is computed as sqrt(MSE) instead of passing `squared = False`, because
# that flag is deprecated (and later removed) in newer scikit-learn releases;
# the resulting numbers are the same.
Ada.fit(X_train, y_train)
y_predicted = Ada.predict(X_test)
print('RMSE between log prices: ', np.sqrt(mean_squared_error(y_test, y_predicted)))
# np.exp undoes the log transform so the error is expressed in price units.
print('RMSE between actual prices: ', np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_predicted))))
print('R2 score between actual price: ', r2_score(np.exp(y_test), np.exp(y_predicted)))

RMSE between log prices:  0.20362192774568416
RMSE between actual prices:  38260.0631929484
R2 score between actual price:  0.7910395637663437


In [13]:
# Recompute the three hold-out metrics for every fitted model, rounded to
# 4 decimals, and collect them in parallel lists (same order as `models`).
models = [XGB, Rf, Ada]
RMSE_log = []
RMSE_actual = []
R2_score = []
for model in models:
    y_predicted = model.predict(X_test)
    # RMSE is sqrt(MSE); the `squared = False` shortcut is avoided because it
    # is deprecated (and later removed) in newer scikit-learn releases.
    RMSE_log.append(round(np.sqrt(mean_squared_error(y_test, y_predicted)), 4))
    # np.exp undoes the log transform so these two metrics are in price units.
    RMSE_actual.append(round(np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_predicted))), 4))
    R2_score.append(round(r2_score(np.exp(y_test), np.exp(y_predicted)), 4))
# Only the model labels are stored here; the metric columns are not added in this cell.
result = {'Model': ['XGB', 'Random Forest', 'Ada'], }

In [15]:
# Assemble the comparison table: one row per model, one column per metric.
# Each metric column takes the first three entries of its list, matching the
# three model labels.
result = {
    'Model': ['XGB', 'Random Forest', 'Ada'],
    **{
        column: [scores[row] for row in range(3)]
        for column, scores in (
            ('RMSE_log', RMSE_log),
            ('RMSE_actual', RMSE_actual),
            ('R2_actual', R2_score),
        )
    },
}

df = pd.DataFrame(result)
df

Unnamed: 0,Model,RMSE_log,RMSE_actual,R2_actual
0,XGB,0.1505,30352.2563,0.8685
1,Random Forest,0.1528,29835.276,0.8729
2,Ada,0.2036,38260.0632,0.791


In [16]:
### Use XGB (the HistGradientBoostingRegressor) and refit it on the full training data

In [17]:
# Refit the same estimator object on all rows of X and y (no hold-out).
# This replaces the earlier fit on the training split, so any metric computed
# from `XGB` after this cell has run is measured on data the model has seen.
XGB.fit(X, y)

HistGradientBoostingRegressor(learning_rate=0.15, loss='absolute_error',
                              min_samples_leaf=30, random_state=0)

In [18]:
# Load the sample submission file and display it; the frame is reused as the
# container for this notebook's own SalePrice predictions.
df_submission = pd.read_csv('sample_submission.csv')
df_submission

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


In [21]:
# Bare last expression: the notebook renders the test feature frame as a table.
test_df

Unnamed: 0,YearBuilt,TotalBsmtSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,...,MoSold_12,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA
0,1961,882.0,896,730.0,140,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,1958,1329.0,1329,312.0,393,36,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,1997,928.0,1629,482.0,212,34,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,1998,926.0,1604,470.0,360,36,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,1992,1280.0,1280,506.0,0,82,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1970,546.0,1092,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1455,1970,546.0,1092,286.0,0,24,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1456,1960,1224.0,1224,576.0,474,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1457,1992,912.0,970,0.0,80,32,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [20]:
# R^2 of the model on X, y -- the same data it was refit on -- so this is an
# in-sample figure, not an estimate of performance on unseen data.
XGB.score(X, y)

0.9201577351144966

In [24]:
# In-sample metrics: predictions are made on the same X the model was fitted on.
# RMSE is computed as sqrt(MSE) instead of passing `squared = False`, because
# that flag is deprecated (and later removed) in newer scikit-learn releases;
# the resulting numbers are the same.
y_predicted = XGB.predict(X)
print('RMSE between log prices: ', np.sqrt(mean_squared_error(y, y_predicted)))
# np.exp undoes the log transform so the error is expressed in price units.
print('RMSE between actual prices: ', np.sqrt(mean_squared_error(np.exp(y), np.exp(y_predicted))))
print('R2 score between actual price: ', r2_score(np.exp(y), np.exp(y_predicted)))

RMSE between log prices:  0.11283195129578075
RMSE between actual prices:  24353.8676014447
R2 score between actual price:  0.9059567545158407


In [25]:
# Align the test features to the training feature columns BY NAME before
# predicting. The model was fitted on a bare NumPy array, so scikit-learn has
# no feature names to compare and cannot detect reordered or mismatched columns.
# When the columns already match, this is a no-op.
# Columns missing from the test frame are filled with 0 (assumes they are
# one-hot indicator columns -- TODO confirm); extra test columns are dropped.
test_features = test_df.reindex(columns=train_df.columns[:-1], fill_value=0)
# The model predicts log prices; np.exp converts them back to sale prices.
df_submission['SalePrice'] = np.exp(XGB.predict(test_features.values))

In [26]:
# Bare last expression: display the submission frame with the predicted SalePrice values.
df_submission

Unnamed: 0,Id,SalePrice
0,1461,124445.114565
1,1462,157875.957741
2,1463,193590.965994
3,1464,195113.097720
4,1465,184795.967722
...,...,...
1454,2915,76458.093233
1455,2916,82494.359346
1456,2917,157579.289177
1457,2918,123732.851075


In [27]:
# Export is left commented out; uncomment the line below to write
# df_submission to "Submission.csv" without the index column.
#df_submission.to_csv("Submission.csv", index = False)