In [93]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import collections
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

#####               delete features with more than 20% missing values

In [16]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

features = pd.concat([train.drop("SalePrice", axis=1), test]).reset_index(drop=True)
feature_flag = features.isna().sum() > features.shape[0] * 0.2
features_omit = list(feature_flag[feature_flag == True].index) + ["Id"]
features = features.loc[:, ~features.columns.isin(features_omit)]

##### detect numeric values which are not ordinal

In [17]:
features["MSSubClass"] = features["MSSubClass"].astype(str)

##### fill NA variables

In [18]:
num_features = []
str_features = []
for i in features.columns:
    if pd.api.types.is_string_dtype(features[i]):
        str_features.append(i)
    if pd.api.types.is_numeric_dtype(features[i]):
        num_features.append(i)

In [19]:
for i in str_features:
    features.loc[features[i].isna(), i] = "NAN"

In [20]:
for i in num_features:
    features[i].fillna(features[i].median(), inplace=True)

##### convert categorical variables to dummy variables

In [22]:
ordinal_cols = ["ExterQual", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC",
               "KitchenQual", "Functional", "FireplaceQu", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence"]

In [23]:
onehot_cols = [i for i in str_features if i not in ordinal_cols]

In [24]:
ordinal_cols = [i for i in str_features if i in ordinal_cols]

In [25]:
for i in onehot_cols:
    features = pd.get_dummies(features, columns=[i], prefix=[i])

In [26]:
ordinal_cols

['ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

##### convert ordinal categorical variables to numeric values

In [40]:
quality_mapping = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0}
exposure_mapping = {"Gd": 5, "Av": 4, "Mn": 3, "No": 1, "NA": 0} #BsmtExposure
basement_mapping = {"GLQ": 7, "ALQ": 6, "BLQ": 5, "Rec": 4, "LwQ": 3, "Unf": 2, "NA": 1} #BsmtFinType1, BsmtFinType2
functional_mapping = {"Typ": 8, "Min1": 7, "Min2": 6, "Mod": 5, "Maj1": 4, "Maj2": 3, "Sev": 2, "Sal": 1} # Functional
garage_mapping = {"Fin": 4, "RFn": 3, "Unf": 2, "NA": 1} # GarageFinish

In [58]:
for i in ordinal_cols:
    if i not in ["BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Functional", "GarageFinish"]:
        med = np.median(features.loc[~features[i].isin(["NAN"]), :].replace({i: quality_mapping})[i])
        temp_mapping = quality_mapping.copy()
        temp_mapping["NAN"] = med
        features.replace({i: temp_mapping}, inplace=True)

In [59]:
i = "BsmtExposure"
med = np.median(features.loc[~features[i].isin(["NAN"]), :].replace({i: exposure_mapping})[i])
temp_mapping = exposure_mapping.copy()
temp_mapping["NAN"] = med
features.replace({i: temp_mapping}, inplace=True)

basement_cols = ["BsmtFinType1", "BsmtFinType2"]
for i in basement_cols:
    med = np.median(features.loc[~features[i].isin(["NAN"]), :].replace({i: basement_mapping})[i])
    temp_mapping = basement_mapping.copy()
    temp_mapping["NAN"] = med
    features.replace({i: temp_mapping}, inplace=True)
    
i = "Functional"
med = np.median(features.loc[~features[i].isin(["NAN"]), :].replace({i: functional_mapping})[i])
temp_mapping = functional_mapping.copy()
temp_mapping["NAN"] = med
features.replace({i: temp_mapping}, inplace=True)

i = "GarageFinish"
med = np.median(features.loc[~features[i].isin(["NAN"]), :].replace({i: garage_mapping})[i])
temp_mapping = garage_mapping.copy()
temp_mapping["NAN"] = med
features.replace({i: temp_mapping}, inplace=True)

##### Train and Validation Split

In [60]:
train_x = features.iloc[:train.shape[0], :]
test_x = features.iloc[train.shape[0]:, :]

In [61]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
train_x = std_scaler.fit_transform(train_x)
test_x = std_scaler.transform(test_x)

In [62]:
train_y = train["SalePrice"]

##### Training

In [65]:
rf = RandomForestRegressor()
rf.fit(train_x, train_y)

In [66]:
train_pred = rf.predict(train_x)

In [67]:
r2_score(train_pred, train_y)

0.979228324131964

##### random forest cross validation

In [69]:
n_estimators = list(range(100, 1000, 300))
min_samples_split = list(range(2, 10, 3))
min_samples_leaf = list(range(1, 5, 2))
max_features = ["sqrt", 1.0]
hyperparams = {'n_estimators': n_estimators, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf,
               'max_features': max_features}
gd = GridSearchCV(estimator = RandomForestRegressor(), param_grid = hyperparams, verbose=5,
                cv=5, scoring = "neg_mean_squared_error")

In [70]:
gd.fit(train_x, train_y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-581249910.943 total time=   0.4s
[CV 2/5] END max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-1105612899.877 total time=   0.4s
[CV 3/5] END max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-1140341812.573 total time=   0.4s
[CV 4/5] END max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-614618654.187 total time=   0.4s
[CV 5/5] END max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-1148368165.821 total time=   0.4s
[CV 1/5] END max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=-619650061.548 total time=   1.8s
[CV 2/5] END max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=-1024809491.811 total time=   1.8s
[CV 3/5] E

[CV 3/5] END max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=100;, score=-1444230053.632 total time=   0.2s
[CV 4/5] END max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=100;, score=-669806532.826 total time=   0.2s
[CV 5/5] END max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=100;, score=-1382252386.372 total time=   0.2s
[CV 1/5] END max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=400;, score=-670105830.322 total time=   1.3s
[CV 2/5] END max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=400;, score=-1079493608.412 total time=   1.3s
[CV 3/5] END max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=400;, score=-1390871094.465 total time=   1.3s
[CV 4/5] END max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=400;, score=-659754050.519 total time=   1.3s
[CV 5/5] END max_features=sqrt, min_samples_leaf=3, min_samples_split=5,

[CV 1/5] END max_features=1.0, min_samples_leaf=1, min_samples_split=8, n_estimators=400;, score=-698998414.454 total time=  10.3s
[CV 2/5] END max_features=1.0, min_samples_leaf=1, min_samples_split=8, n_estimators=400;, score=-1012404219.157 total time=  10.4s
[CV 3/5] END max_features=1.0, min_samples_leaf=1, min_samples_split=8, n_estimators=400;, score=-951597928.359 total time=  10.8s
[CV 4/5] END max_features=1.0, min_samples_leaf=1, min_samples_split=8, n_estimators=400;, score=-595179437.683 total time=  10.5s
[CV 5/5] END max_features=1.0, min_samples_leaf=1, min_samples_split=8, n_estimators=400;, score=-1275970371.005 total time=  10.3s
[CV 1/5] END max_features=1.0, min_samples_leaf=1, min_samples_split=8, n_estimators=700;, score=-687483691.907 total time=  18.2s
[CV 2/5] END max_features=1.0, min_samples_leaf=1, min_samples_split=8, n_estimators=700;, score=-1007357574.970 total time=  18.3s
[CV 3/5] END max_features=1.0, min_samples_leaf=1, min_samples_split=8, n_estima

In [71]:
r2_score(gd.best_estimator_.predict(train_x), train_y)

0.9797197003799956

##### gradient boosting 

In [74]:
learning_rate = list(np.linspace(0.01, 0.1, 3))
n_estimators = list(range(100, 1000, 300))
subsample = list(np.linspace(0.7, 1.0, 3))
min_samples_split = list(range(2, 10, 3))
min_samples_leaf = list(range(1, 5, 2))
max_features = ["sqrt", 1.0]
hyperparams = {"learning_rate": learning_rate, 'n_estimators': n_estimators, "subsample": subsample,
               'min_samples_split': min_samples_split, 
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features}
gd_gbm = GridSearchCV(estimator = GradientBoostingRegressor(), param_grid = hyperparams, verbose=True,
                cv=5, scoring = "neg_mean_squared_error")

In [75]:
gd_gbm.fit(train_x, train_y)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [76]:
r2_score(gd_gbm.best_estimator_.predict(train_x), train_y)

0.9898424205103801

In [91]:
test["SalePrice"] = gd_gbm.best_estimator_.predict(test_x)

In [92]:
test[["Id", "SalePrice"]].to_csv("submission.csv", index=False)

##### Try stacking

In [83]:
estimators = [("rf", gd.best_estimator_), ("gbm", gd_gbm.best_estimator_)]
reg = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor())

In [84]:
reg.fit(train_x, train_y)

In [88]:
test["SalePrice"] = reg.predict(test_x)

In [90]:
test[["Id", "SalePrice"]].to_csv("submission.csv", index=False)