In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn import linear_model
import houseprice_functions as hpf
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn import metrics
from sklearn import impute
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn import ensemble
# Shared KNN imputer (scikit-learn defaults) used to re-estimate values that
# are blanked out as outliers further down.
KNN = impute.KNNImputer()

In [2]:
# Pre-cleaned feature tables for the train and test splits.
cleaned_hp = pd.read_csv('data/cleaned_houseprice.csv')
cleaned_hp_test = pd.read_csv('data/cleaned_houseprice_test.csv')

# The raw training file is only needed for the target column.
HousePrices = pd.read_csv('data/train.csv')
y = HousePrices.loc[:, ["Id", 'SalePrice']]

# Attach SalePrice to the cleaned training features by Id.
x_y = pd.merge(cleaned_hp, y, how="inner", on="Id")

In [3]:
# Sanity check: (rows, columns) of the cleaned training frame before any outlier handling.
cleaned_hp.shape

(1460, 110)

# Outlier imputation

In [4]:
# Both calls share the same thresholds; only `for_test` differs.
# Downstream cells consume `train_outliers` as {column: row positions} and
# `test_range` as {column: (lower, upper)}; see hpf.outlier_selecter for the
# exact contract.
outlier_kwargs = dict(num_sd=4, min_unique=20, drop_zeros=True)

train_outliers = hpf.outlier_selecter(cleaned_hp, cleaned_hp, **outlier_kwargs)
test_range = hpf.outlier_selecter(cleaned_hp, cleaned_hp, for_test=True, **outlier_kwargs)

## Train

In [5]:
# Replace the flagged training outliers with NaN so KNN imputation can
# re-estimate them.
for col, idx in train_outliers.items():
    # Write through one .iloc call on the frame itself. The chained form
    # `cleaned_hp.loc[:, col].iloc[idx] = ...` assigns into an intermediate
    # Series that pandas does not guarantee to be a view of the frame, so the
    # NaNs could silently be lost (and always are under Copy-on-Write).
    col_pos = cleaned_hp.columns.get_loc(col)
    # Cast first so integer columns can hold NaN without an implicit upcast.
    cleaned_hp[col] = cleaned_hp[col].astype(float)
    cleaned_hp.iloc[idx, col_pos] = np.nan

# KNNImputer returns a bare ndarray; keep the column labels to rebuild the frame.
columns_ = cleaned_hp.columns
cleaned_hp = KNN.fit_transform(cleaned_hp)
cleaned_hp = pd.DataFrame(cleaned_hp, columns=columns_)

## Test

In [6]:
# Replace test values that fall outside the per-column (lower, upper) range
# with NaN so KNN imputation can re-estimate them.
for col, low_up_sd in test_range.items():
    lower, upper = low_up_sd[0], low_up_sd[1]
    values = cleaned_hp_test[col]
    # Vectorised bounds check; comparisons against NaN are False, so existing
    # missing values are not flagged (same as the previous row-wise lambda).
    is_outlier = (values < lower) | (values > upper)
    # Series.mask + column assignment replaces the chained
    # `.loc[:, col].iloc[...] = ...` write, which could land on a temporary
    # copy, and avoids feeding index labels to the positional .iloc.
    cleaned_hp_test[col] = values.mask(is_outlier)

# NOTE(review): the imputer is re-fit on the test frame here rather than
# reusing the fit from the training frame.
columns_ = cleaned_hp_test.columns
cleaned_hp_test = KNN.fit_transform(cleaned_hp_test)
cleaned_hp_test = pd.DataFrame(cleaned_hp_test, columns=columns_)

In [7]:
# Training target: log of SalePrice (predictions are exponentiated back later).
y_train = np.log(x_y['SalePrice'])

# Training features: everything except the row identifier.
X_train = cleaned_hp.drop(columns=["Id"])

# Test features: drop the identifier and the extra '0' column
# (presumably a stray index column from the cleaning export; verify).
X_test = cleaned_hp_test.drop(columns=["Id", '0'])

# Elastic Net Model

In [8]:
# Log-transform right-skewed, non-binary features.
#
# The columns are selected ONCE, from the training data, and the same list is
# applied to both frames. Selecting them independently per frame lets a
# feature be log-transformed in train but left raw in test (or vice versa)
# whenever its skew falls on different sides of the threshold, so the model
# would be scored on a differently scaled feature than it was fit on.
skewed_cols = [
    col for col in X_train.columns
    if (X_train[col].nunique() != 2) and (X_train[col].skew() > 0.75)
]

for header, frame in (("Train:\n", X_train), ("\n\n\nTest:\n", X_test)):
    print(header)
    for col in skewed_cols:
        if col not in frame.columns:
            # Column exists only in the training frame; nothing to transform.
            continue
        # Report skew before and after so the effect of log1p is visible.
        print(col)
        print(frame[col].skew())
        frame[col] = np.log1p(frame[col])
        print(frame[col].skew())
        print('-'*50)

Train:

LotArea
2.5023009425758294
-0.9136315930988614
--------------------------------------------------
LotShape
1.309985656555955
0.7831035648282987
--------------------------------------------------
LandSlope
4.813682424489448
4.297167061742489
--------------------------------------------------
GrLivArea
0.7945580682885606
-0.12900470046311321
--------------------------------------------------
EnclosedPorch
2.8793402413491145
2.118754318995043
--------------------------------------------------
Total_PorchDeckSF
1.0106992266380932
-0.9071782105229678
--------------------------------------------------



Test:

LotArea
2.552806435590781
-0.9920046340781267
--------------------------------------------------
LotShape
1.2042262704496145
0.7679560899873284
--------------------------------------------------
LandSlope
4.968389880056036
4.64800252535215
--------------------------------------------------
GrLivArea
0.8183697282292276
-0.042539082761763256
-------------------------------------

In [9]:
# Model training: cross-validated elastic net on the log-price target.
# The search covers five l1_ratio values x 720 alphas with 10-fold CV.
en_model = ElasticNetCV(
    l1_ratio=[0.5, 0.6, 0.65, 0.7, 0.75],
    n_alphas=720,
    eps=1e-4,
    fit_intercept=True,
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; on newer versions this argument raises a TypeError and the
    # scaling has to move into an explicit preprocessing step.
    normalize=True,
    precompute='auto',
    max_iter=2000,
    tol=0.0001,
    cv=10,
    copy_X=True,
    verbose=0,
    n_jobs=-1,
    positive=False,
    random_state=1,
)

full_model = en_model.fit(X_train, y_train)
y_pred = full_model.predict(X_train)

# In-sample metrics only (computed on the data the model was fit on).
# mean_squared_error expects (y_true, y_pred); the arguments were previously
# swapped, which is harmless for MSE but breaks the documented convention.
print("Full Model RMSE:", np.sqrt(metrics.mean_squared_error(y_train, y_pred)))
print("Full model R2:", full_model.score(X_train, y_train))

Full Model RMSE: 0.1149195503876102
Full model R2: 0.9171759454923877


In [10]:
# Undo the log transform on the target to get prices on the original scale.
y_test = np.exp(full_model.predict(X_test))

# Submission-shaped frame: one row per test Id with its predicted price.
elasticnet_prices = pd.DataFrame(
    {
        "Id": cleaned_hp_test.Id.astype(int),
        "SalePrice": y_test,
    }
)
elasticnet_prices.head(10)

Unnamed: 0,Id,SalePrice
0,1461,124396.933843
1,1462,156662.899057
2,1463,176943.845714
3,1464,195490.709857
4,1465,203327.466246
5,1466,166876.792797
6,1467,175150.285114
7,1468,160157.0289
8,1469,190776.115748
9,1470,116747.628951


---

# XGBoost Model

In [11]:
# Re-read the inputs from disk so the XGBoost section starts from the
# untransformed cleaned tables rather than the frames modified above.
cleaned_hp = pd.read_csv('data/cleaned_houseprice.csv')
cleaned_hp_test = pd.read_csv('data/cleaned_houseprice_test.csv')

# The raw training file is only needed for the target column.
HousePrices = pd.read_csv('data/train.csv')
y = HousePrices.loc[:, ["Id", 'SalePrice']]

# Attach SalePrice to the cleaned training features by Id.
x_y = pd.merge(cleaned_hp, y, how="inner", on="Id")

In [12]:
# Same outlier thresholds for both calls; only `for_test` differs.
# Downstream cells consume `train_outliers` as {column: row positions} and
# `test_range` as {column: (lower, upper)}; see hpf.outlier_selecter for the
# exact contract.
outlier_kwargs = dict(num_sd=4, min_unique=20, drop_zeros=True)

train_outliers = hpf.outlier_selecter(cleaned_hp, cleaned_hp, **outlier_kwargs)
test_range = hpf.outlier_selecter(cleaned_hp, cleaned_hp, for_test=True, **outlier_kwargs)

In [13]:
# Replace the flagged training outliers with NaN so KNN imputation can
# re-estimate them.
for col, idx in train_outliers.items():
    # Write through one .iloc call on the frame itself. The chained form
    # `cleaned_hp.loc[:, col].iloc[idx] = ...` assigns into an intermediate
    # Series that pandas does not guarantee to be a view of the frame, so the
    # NaNs could silently be lost (and always are under Copy-on-Write).
    col_pos = cleaned_hp.columns.get_loc(col)
    # Cast first so integer columns can hold NaN without an implicit upcast.
    cleaned_hp[col] = cleaned_hp[col].astype(float)
    cleaned_hp.iloc[idx, col_pos] = np.nan

# KNNImputer returns a bare ndarray; keep the column labels to rebuild the frame.
columns_ = cleaned_hp.columns
cleaned_hp = KNN.fit_transform(cleaned_hp)
cleaned_hp = pd.DataFrame(cleaned_hp, columns=columns_)

In [14]:
# Replace test values that fall outside the per-column (lower, upper) range
# with NaN so KNN imputation can re-estimate them.
for col, low_up_sd in test_range.items():
    lower, upper = low_up_sd[0], low_up_sd[1]
    values = cleaned_hp_test[col]
    # Vectorised bounds check; comparisons against NaN are False, so existing
    # missing values are not flagged (same as the previous row-wise lambda).
    is_outlier = (values < lower) | (values > upper)
    # Series.mask + column assignment replaces the chained
    # `.loc[:, col].iloc[...] = ...` write, which could land on a temporary
    # copy, and avoids feeding index labels to the positional .iloc.
    cleaned_hp_test[col] = values.mask(is_outlier)

# NOTE(review): the imputer is re-fit on the test frame here rather than
# reusing the fit from the training frame.
columns_ = cleaned_hp_test.columns
cleaned_hp_test = KNN.fit_transform(cleaned_hp_test)
cleaned_hp_test = pd.DataFrame(cleaned_hp_test, columns=columns_)

In [15]:
# Training target: log of SalePrice (predictions are exponentiated back later).
y_train = np.log(x_y['SalePrice'])

# Engineered feature, added identically to both frames:
# basement area plus above-ground living area.
for frame in (cleaned_hp, cleaned_hp_test):
    frame['TotalSF'] = frame.TotalBsmtSF + frame.GrLivArea

# Training features: everything except the row identifier.
X_train = cleaned_hp.drop(columns=["Id"])

# Test features: drop the identifier and the extra '0' column
# (presumably a stray index column from the cleaning export; verify).
X_test = cleaned_hp_test.drop(columns=["Id", '0'])

In [16]:
# Stage 1: gradient-boosted trees fit on the full feature set.
xg_cl = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    seed=1,
    subsample=0.9,
)
xg_cl.fit(X_train, y_train)
preds_train_all_stack = xg_cl.predict(X_train)
predict_test = xg_cl.predict(X_test)

# Stage 2: a random forest whose only input is the stage-1 prediction.
# NOTE(review): the forest is fit on in-sample XGBoost predictions, not on
# out-of-fold ones; confirm this is intended.
randomForest = ensemble.RandomForestRegressor(
    n_estimators=180,
    random_state=42,
    max_features=1,
    max_depth=5,
    min_samples_leaf=4,
)
# scikit-learn expects a 2-D feature matrix, hence the single-column reshape.
stage1_train = np.array(preds_train_all_stack).reshape(-1, 1)
stage1_test = np.array(predict_test).reshape(-1, 1)
randomForest.fit(stage1_train, y_train)
predict_test_stack = randomForest.predict(stage1_test)

In [17]:
# Submission-shaped frame for the stacked model; np.exp undoes the log target.
xgboost_prices = pd.DataFrame(
    {
        "Id": cleaned_hp_test.Id.astype(int),
        "SalePrice": np.exp(predict_test_stack),
    }
)
xgboost_prices.head()

Unnamed: 0,Id,SalePrice
0,1461,127969.614729
1,1462,162764.534717
2,1463,178237.925324
3,1464,181226.988226
4,1465,183564.910836


---

# Model ensembling

In [18]:
# Equal-weight average of the two models' predicted prices.
# The Series are added by row index, so guard against the two frames listing
# different houses on the same row before blending them.
if not elasticnet_prices["Id"].equals(xgboost_prices["Id"]):
    raise ValueError("elasticnet_prices and xgboost_prices do not cover the same Ids in the same order")

# NOTE: this overwrites the elastic-net column in place, so re-running this
# cell without re-running the elastic-net section averages a second time.
elasticnet_prices["SalePrice"] = (xgboost_prices["SalePrice"] + elasticnet_prices["SalePrice"]) / 2

In [19]:
# Write the blended predictions without the row index.
# DataFrame.to_csv returns None when given a path, so `test_submission` is None.
submission_path = 'data/submission.csv'
test_submission = elasticnet_prices.to_csv(submission_path, index=False)

---