In [1]:
from sklearn.linear_model import LogisticRegression, RidgeCV, LassoCV
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from scipy.stats import mode, skew

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [2]:
import matplotlib.pyplot as plt
import warnings
import pandas as pd
import numpy as np
def ignore_warn(*args, **kwargs): pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

%matplotlib inline

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train = train.drop(train[train['Id'] == 1299].index)
train = train.drop(train[train['Id'] == 524].index)

train_miss_index = train.isna().sum().sort_values(ascending=False).head(6).index
train = train.drop(train_miss_index, axis=1)
test = test.drop(train_miss_index, axis=1)
train = train.dropna(subset=['Electrical'], axis=0) # Electrical feature는 결측치가 있는 데이터만 삭제한다

train_miss_index2 = train.isna().sum().sort_values(ascending=False).head(12).index
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['GarageYrBlt'].mean())
train['MasVnrArea'] = train['MasVnrArea'].fillna(train['MasVnrArea'].mean())
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean())
test['MasVnrArea'] = test['MasVnrArea'].fillna(test['MasVnrArea'].mean())

train['MasVnrType'] = train['MasVnrType'].fillna('None')
test['MasVnrType'] = test['MasVnrType'].fillna('None')

train_miss_index3 = train.isna().sum().sort_values(ascending=False).head(9).index

train[train_miss_index3] = train[train_miss_index3].fillna('None')

test_miss_index = test.isna().sum().sort_values(ascending=False).head(9).index
mode_index = mode(test[test_miss_index])
for i in range(9):
    test[test_miss_index[i]] = test[test_miss_index[i]].fillna((mode_index[0][0])[i]) # 이 부분에서 막힘..
    
test_miss_index2 = test.isna().sum().sort_values(ascending=False).head(15).index # 남은 결측치

test['GarageArea'] = test['GarageArea'].fillna(test['GarageArea'].mean())

test_miss_index3 = test.isna().sum().sort_values(ascending=False).head(14).index # 남은 결측치
mode_index2 = mode(test[test_miss_index3])

for i in range(14):
    test[test_miss_index3[i]] = test[test_miss_index3[i]].fillna((mode_index2[0][0])[i])

train.loc[train['LowQualFinSF'] > 0, 'LowQualFinSF'] = 1
test.loc[test['LowQualFinSF'] > 0, 'LowQualFinSF'] = 1

train.loc[train['WoodDeckSF'] > 0, 'WoodDeckSF'] = 1
test.loc[test['WoodDeckSF'] > 0, 'WoodDeckSF'] = 1

train.loc[train['OpenPorchSF'] > 0, 'OpenPorchSF'] = 1
test.loc[test['OpenPorchSF'] > 0, 'OpenPorchSF'] = 1

train.loc[train['EnclosedPorch'] > 0, 'EnclosedPorch'] = 1
test.loc[test['EnclosedPorch'] > 0, 'EnclosedPorch'] = 1

train.loc[train['3SsnPorch'] > 0, '3SsnPorch'] = 1
test.loc[test['3SsnPorch'] > 0, '3SsnPorch'] = 1

train.loc[train['ScreenPorch'] > 0, 'ScreenPorch'] = 1
test.loc[test['ScreenPorch'] > 0, 'ScreenPorch'] = 1

train.loc[train['PoolArea'] > 0, 'PoolArea'] = 1
test.loc[test['PoolArea'] > 0, 'PoolArea'] = 1

train.loc[train['MiscVal'] > 0, 'MiscVal'] = 1
test.loc[test['MiscVal'] > 0, 'MiscVal'] = 1

train['MSSubClass'] = train['MSSubClass'].astype('category').cat.codes
test['MSSubClass'] = test['MSSubClass'].astype('category').cat.codes

train['OverallQual'] = train['OverallQual'].astype('category').cat.codes
test['OverallQual'] = test['OverallQual'].astype('category').cat.codes

train['OverallCond'] = train['OverallCond'].astype('category').cat.codes
test['OverallCond'] = test['OverallCond'].astype('category').cat.codes 

train['MoSold'] = train['MoSold'].astype('category').cat.codes
test['MoSold'] = test['MoSold'].astype('category').cat.codes 
 
train['YrSold'] = train['YrSold'].astype('category').cat.codes
test['YrSold'] = test['YrSold'].astype('category').cat.codes 

train['YearBuilt'] = train['YearBuilt'].astype('category').cat.codes
test['YearBuilt'] = test['YearBuilt'].astype('category').cat.codes 

train['YearRemodAdd'] = train['YearRemodAdd'].astype('category').cat.codes
test['YearRemodAdd'] = test['YearRemodAdd'].astype('category').cat.codes 

train['GarageYrBlt'] = train['GarageYrBlt'].astype('category').cat.codes
test['GarageYrBlt'] = test['GarageYrBlt'].astype('category').cat.codes 

train.drop(['Id'], axis=1, inplace=True)  # 숫자형 데이터를 범주형으로 변경

columns = list(train.columns.values)

objcols = []
for i in columns:
    if train[i].dtype == 'object':
        objcols.append(i)
        
for objcol in objcols:
    train[objcol] = train[objcol].astype('category').cat.codes
    test[objcol] = test[objcol].astype('category').cat.codes
    
train = train.drop(['Utilities'], axis=1)
test = test.drop(['Utilities'], axis=1)

train = train.drop(['Street'], axis=1)
test = test.drop(['Street'], axis=1)

train['Condition2'] = np.log1p(train['Condition2'])
test['Condition2'] = np.log1p(test['Condition2'])

train['Heating'] = np.log1p(train['Heating'])
test['Heating'] = np.log1p(test['Heating'])

train['LotArea'] = np.log1p(train['LotArea'])
test['LotArea'] = np.log1p(test['LotArea'])

In [4]:
X_train = train.drop(['SalePrice'], axis=1).copy()
y_train = train['SalePrice']
X_test = test.drop(['Id'], axis=1).copy()

X_train.shape, y_train.shape, X_test.shape

((1457, 71), (1457,), (1459, 71))

In [5]:
import numpy as np

n_folds = 10
def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [14]:
xgb_reg = XGBRegressor(learning_rate=0.05, max_depth=9, colsample_bytree=0.6, 
                               subsample=0.6, eval_metric='rmse', colsample_bylevel=0.66,
                              alpha=1.05, n_estimators=1500, min_child_weight=1.0)
xgb_reg.fit(X_train, y_train)
xgb_pred = xgb_reg.predict(X_test)

rmse_cv(xgb_reg).mean()

22669.107076946726

In [None]:
xgb = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": xgb_pred
    }) 
xgb.to_csv("./xgb.csv",index=False) # 0.13565

In [7]:
rf_reg = RandomForestRegressor(n_estimators=100, max_features=26)
rf_reg.fit(X_train, y_train)
rf_pred = rf_reg.predict(X_test)

rmse_cv(rf_reg).mean()

24779.080704561835

In [None]:
rf = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": rf_pred
    }) 
rf.to_csv("./rf.csv",index=False) # 0.14775

In [8]:
gbr_reg = GradientBoostingRegressor(learning_rate=0.095, n_estimators=500, subsample=0.85, alpha=0.01)
gbr_reg.fit(X_train, y_train)
gbr_pred = gbr_reg.predict(X_test)

rmse_cv(gbr_reg).mean()

22064.732675619667

In [73]:
gbr = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": gbr_pred
    }) 
gbr.to_csv("./gbr.csv",index=False) # 0.13855

In [27]:
xgb_score = rmse_cv(xgb_reg).mean()
gbr_score = rmse_cv(gbr_reg).mean()
rf_score = rmse_cv(rf_reg).mean()

In [28]:
total_weight = (1. / gbr_score) + (1. / xgb_score) + (1. / rf_score)
pred = (gbr_pred * (1. / gbr_score) + xgb_pred * (1. / xgb_score) +  rf_pred * (1. / rf_score)) / total_weight

In [29]:
total = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": pred
    }) 
total.to_csv("./total.csv",index=False) # 0.13528

In [10]:
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor

In [19]:
estimators = [('gbr', gbr_reg),
              ('xgb', xgb_reg),
              ('rf', rf_reg)]

In [22]:
stack = StackingRegressor(estimators=estimators, final_estimator=xgb_reg)
stack.fit(X_train, y_train)
stack_pred = stack.predict(X_test)

stack_score = rmse_cv(stack).mean()

In [24]:
stack = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": stack_pred
    }) 
stack.to_csv("./stack.csv",index=False) #

In [30]:
def blended_predictions(X):
    return ((0.15 * gbr_pred) + (0.25 * xgb_pred) + (0.1 * rf_pred) + (0.4 * stack_pred) + (0.1 * pred))

In [33]:
blended_pred = blended_predictions(X_test)

In [32]:
blend = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": blended_pred
    }) 
blend.to_csv("./blend.csv",index=False) # 0.13920