In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy.stats import boxcox 
from scipy.special import boxcox1p
from sklearn.preprocessing import LabelEncoder,RobustScaler
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.pipeline import make_pipeline
import xgboost as xgb
import lightgbm as lgb
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# (rows, columns) of the training table as loaded.
train.shape

In [None]:
# (rows, columns) of the test table as loaded.
test.shape

**Idの列を消去**

In [None]:
# Remove the "Id" column from both tables, rebinding the names rather than mutating in place.
train = train.drop(columns=["Id"])
test = test.drop(columns=["Id"])

In [None]:
# Training table shape after dropping "Id".
train.shape

In [None]:
# Test table shape after dropping "Id".
test.shape

**外れ値の確認・消去**

In [None]:
# Scatter SalePrice against GrLivArea to look for outlying rows.
plt.ylabel("SalePrice", fontsize=13)
plt.xlabel("GrLivArea", fontsize=13)
plt.scatter(train["GrLivArea"].values, train["SalePrice"].values)

In [None]:
# Drop rows with GrLivArea above 4000 and SalePrice below 300000.
is_outlier = (train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)
train = train.drop(train[is_outlier].index)

In [None]:
# Training table shape after removing the outlier rows.
train.shape

In [None]:
# Re-draw the GrLivArea vs. SalePrice scatter on the current (filtered) training table.
plt.ylabel("SalePrice", fontsize=13)
plt.xlabel("GrLivArea", fontsize=13)
plt.scatter(train["GrLivArea"].values, train["SalePrice"].values)

**SalePrice(目的変数)の正規化**

In [None]:
# Distribution of SalePrice with a fitted normal curve overlaid (fit=norm).
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn releases — confirm against the installed version.
sns.distplot(train["SalePrice"],fit=norm)

In [None]:
# Replace SalePrice with its natural logarithm. Anything fitted on this column afterwards
# predicts log-prices, so predictions must be passed through np.exp to get prices back.
train["SalePrice"]=np.log(train["SalePrice"])
train["SalePrice"]

In [None]:
# Distribution of the current SalePrice column with a fitted normal curve overlaid (fit=norm).
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn releases — confirm against the installed version.
sns.distplot(train["SalePrice"],fit=norm)

**訓練データとテストデータの結合**

In [None]:
# Record the row counts and the target, then stack train on top of test into a single
# feature table (fresh 0..n-1 index, target column removed).
ntrain, ntest = train.shape[0], test.shape[0]
y_train = train["SalePrice"]
all_data = (
    pd.concat([train, test], axis=0)
    .reset_index(drop=True)
    .drop("SalePrice", axis=1)
)
all_data.shape

**欠損値の確認**

In [None]:
# Percentage of missing values per column, sorted from most to least missing.
null_share = all_data.isnull().sum() / len(all_data) * 100
missing_data = pd.DataFrame(null_share.sort_values(ascending=False))
missing_data.columns = ["Missing_Ratio"]
missing_data

In [None]:
# Keep only the columns whose missing ratio is not zero.
has_missing = missing_data["Missing_Ratio"] != 0
missing_data = missing_data[has_missing]
missing_data.shape

In [None]:
# Bar chart of the missing-value percentage per column.
# The figure must be created (and sized) BEFORE drawing: calling plt.figure() after the
# plot only opens a second, empty figure and leaves the bar chart at the default size.
plt.figure(figsize=(30,20))
sns.barplot(x=missing_data.index,y=missing_data["Missing_Ratio"])
plt.xticks(rotation=90);

**欠損値処理**

In [None]:
# Columns whose missing entries are replaced by the literal string "None".
none_fill_cols = (
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "MasVnrType", "MSSubClass",
)
for name in none_fill_cols:
    all_data[name] = all_data[name].fillna("None")

In [None]:
# Columns whose missing entries are replaced by 0.
zero_fill_cols = (
    "GarageYrBlt", "GarageArea", "GarageCars",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath", "MasVnrArea",
)
for name in zero_fill_cols:
    all_data[name] = all_data[name].fillna(0)

In [None]:
# Columns whose missing entries are replaced by the column's most frequent value
# (computed over the combined train+test table).
mode_fill_cols = ("MSZoning", "Electrical", "KitchenQual", "Exterior1st", "Exterior2nd", "SaleType")
for name in mode_fill_cols:
    most_common = all_data[name].mode()[0]
    all_data[name] = all_data[name].fillna(most_common)

In [None]:
# Remove the "Utilities" column from the feature table entirely.
all_data=all_data.drop(["Utilities"],axis=1)

In [None]:
# Fill missing "Functional" entries with "Typ" (presumably the dataset's "typical" level — confirm against the data description).
all_data["Functional"]=all_data["Functional"].fillna("Typ")

In [None]:
# Fill missing LotFrontage with the median LotFrontage of the row's Neighborhood.
frontage_by_neighborhood = all_data.groupby("Neighborhood")["LotFrontage"]
all_data["LotFrontage"] = frontage_by_neighborhood.transform(
    lambda frontage: frontage.fillna(frontage.median())
)

In [None]:
# Recompute the per-column missing percentage to check what is still missing after the fills.
null_share = all_data.isnull().sum() / len(all_data) * 100
missing_data = pd.DataFrame(null_share.sort_values(ascending=False))
missing_data.columns = ["Missing_Ratio"]
missing_data

**数値変換**

In [None]:
# Cast these numerically coded columns to strings so they are handled as categories.
as_category_cols = ("MSSubClass", "OverallCond", "YrSold", "MoSold")
for name in as_category_cols:
    all_data[name] = all_data[name].astype(str)

In [None]:
# Inspect the distinct values present in FireplaceQu.
all_data["FireplaceQu"].unique()

In [None]:
# Replace each listed column with integer codes from a LabelEncoder fitted on that
# column's values in the combined train+test table.
label_encoded_cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
    'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
)
for name in label_encoded_cols:
    all_data[name] = LabelEncoder().fit_transform(list(all_data[name].values))

**特徴量追加**

In [None]:
# New feature: sum of TotalBsmtSF, 1stFlrSF and 2ndFlrSF.
all_data["TotalSF"]=all_data["TotalBsmtSF"]+all_data["1stFlrSF"]+all_data["2ndFlrSF"]

**歪度の高い特徴量処理**

In [None]:
# Skewness of every non-object column, most positively skewed first; show the top 10.
is_numeric = all_data.dtypes != "object"
numeric_col = all_data.dtypes[is_numeric].index
skewness = all_data[numeric_col].skew().sort_values(ascending=False)
skew_data = pd.DataFrame(skewness)
skew_data.columns = ["Skew"]
skew_data.head(10)

**box_cox変換**

In [None]:
# Keep only the columns whose absolute skewness exceeds 0.75.
is_skewed = abs(skew_data["Skew"]) > 0.75
skew_data = skew_data[is_skewed]

In [None]:
# Names of the columns selected as highly skewed (the index of skew_data).
skew_col=skew_data.index
skew_col

In [None]:
# Apply the Box-Cox transform of (1 + x) with a fixed lambda of 0.15 to every skewed column.
lam = 0.15
for name in skew_col:
    all_data[name] = boxcox1p(all_data[name], lam)

In [None]:
# Recompute the skewness table after the Box-Cox step; show the 10 most positively skewed columns.
is_numeric = all_data.dtypes != "object"
numeric_col = all_data.dtypes[is_numeric].index
skewness = all_data[numeric_col].skew().sort_values(ascending=False)
skew_data = pd.DataFrame(skewness)
skew_data.columns = ["Skew"]
skew_data.head(10)

**ダミー変数に変換(one-hot-encoding)**

In [None]:
# Expand the feature table with pd.get_dummies (default arguments) and show the resulting shape.
all_data=pd.get_dummies(all_data)
all_data.shape

**trainデータとtestデータに分割**

In [None]:
# Split the combined table back by position: the first ntrain rows are train, the rest are test.
x_train = all_data.iloc[:ntrain]
x_test = all_data.iloc[ntrain:]

In [None]:
# Shape of the training feature matrix.
x_train.shape

In [None]:
# Shape of the test feature matrix.
x_test.shape

In [None]:
# Shape of the training target; its length should equal the number of rows in x_train.
y_train.shape

**モデリング**

In [None]:
def rmsle_cv(model):
    """Cross-validate `model` on the notebook's x_train / y_train and return per-fold RMSE.

    Parameters
    ----------
    model : estimator accepted by sklearn's cross_val_score
        Unfitted regressor or pipeline; cross_val_score fits it once per fold.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (5 values). Because y_train holds log(SalePrice),
        this is the RMSE of log-prices.
    """
    # Pass the KFold splitter itself as `cv`. The earlier code passed
    # KFold(...).get_n_splits(...), which is just the integer 5, so shuffle=True and
    # random_state=42 were silently discarded and unshuffled folds were used.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    neg_mse = cross_val_score(
        model,
        x_train.values,
        y_train,
        scoring="neg_mean_squared_error",
        cv=kf,
    )
    # The scorer returns negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse)

**Elastic Net Regression**

In [None]:
# Elastic Net (alpha=0.0005, l1_ratio=0.9) preceded by a RobustScaler in one pipeline.
ENet=make_pipeline(RobustScaler(),ElasticNet(alpha=0.0005,l1_ratio=0.9,random_state=1))

**XGboost**

In [None]:
# XGBoost regressor with library-default hyperparameters.
model_xgb = xgb.XGBRegressor()

**LightGBM**

In [None]:
# LightGBM regressor with library-default hyperparameters.
# The scikit-learn wrapper class is lgb.LGBMRegressor; lgb.LightGBMRegressor does not
# exist and raised AttributeError.
model_lgb = lgb.LGBMRegressor()

**スコア**

In [None]:
# Cross-validate each candidate and print "mean,std" of its per-fold RMSE.
# NOTE(review): only ENet and model_xgb are scored here; model_lgb is not in the tuple.
for candidate in (ENet, model_xgb):
    fold_rmse = rmsle_cv(candidate)
    print("{:.4f},{:.4f}\n".format(fold_rmse.mean(), fold_rmse.std()))

**モデルのスタッキング(ENet,XGboost,LightGBM)**

In [None]:
def predict_cv(model,x_train,y_train,x_test):
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    The model is refitted on each of 4 shuffled folds (random_state=0). For every
    fold it predicts the held-out training rows and the full test set.

    Parameters
    ----------
    model : object with fit(X, y) and predict(X)
        Refitted in place once per fold.
    x_train, y_train : positionally indexable via .iloc
        Training features and target.
    x_test : test features passed directly to model.predict.

    Returns
    -------
    (preds_train, preds_test)
        preds_train: held-out predictions re-ordered to match x_train's row order.
        preds_test: mean over the folds of the test-set predictions.
    """
    splitter = KFold(n_splits=4, shuffle=True, random_state=0)

    oof_chunks, test_chunks, holdout_positions = [], [], []
    for fit_rows, holdout_rows in splitter.split(x_train):
        model.fit(x_train.iloc[fit_rows, :], y_train.iloc[fit_rows])
        oof_chunks.append(model.predict(x_train.iloc[holdout_rows, :]))
        test_chunks.append(model.predict(x_test))
        holdout_positions.append(holdout_rows)

    # The chunks are in fold order; argsort of the concatenated row positions gives the
    # permutation that puts the predictions back into original row order.
    restore_order = np.argsort(np.concatenate(holdout_positions))
    preds_train = np.concatenate(oof_chunks)[restore_order]

    preds_test = np.mean(test_chunks, axis=0)
    return preds_train, preds_test

In [None]:
# First-level predictions from the Elastic Net pipeline (out-of-fold for train, fold-averaged for test).
preds_train_1,preds_test_1=predict_cv(ENet,x_train,y_train,x_test)

In [None]:
# First-level predictions from the XGBoost model (out-of-fold for train, fold-averaged for test).
preds_train_2,preds_test_2=predict_cv(model_xgb,x_train,y_train,x_test)

In [None]:
# First-level predictions from the LightGBM model (out-of-fold for train, fold-averaged for test).
preds_train_3,preds_test_3=predict_cv(model_lgb,x_train,y_train,x_test)

In [None]:
# Second-level training features: one column per first-level model's out-of-fold predictions.
x2_train=pd.DataFrame({"preds_train_1":preds_train_1,"preds_train_2":preds_train_2,"preds_train_3":preds_train_3})

In [None]:
# Second-level test features: one column per first-level model's test predictions.
# Fixes: the missing comma (SyntaxError) and the third column, which used the train-side
# preds_train_3 instead of preds_test_3. The keys deliberately reuse x2_train's column
# names so the frame passed to model_2.predict matches the one model_2 is fitted on.
x2_test=pd.DataFrame({"preds_train_1":preds_test_1,"preds_train_2":preds_test_2,"preds_train_3":preds_test_3})

In [None]:
# Second-level (meta) model: LightGBM regressor with library-default hyperparameters.
model_2=lgb.LGBMRegressor()

In [None]:
# Fit the meta model on the first-level out-of-fold predictions against the log-price target.
model_2.fit(x2_train,y_train)

In [None]:
# Meta-model predictions for the test set; these are on the log(SalePrice) scale of y_train.
y_pred=model_2.predict(x2_test)

**提出**

In [None]:
# Build the submission file from the sample submission template.
sub = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")
# y_pred is on the log scale (SalePrice was replaced by np.log(SalePrice) before training),
# so invert with np.exp before writing prices; previously the raw log values were written.
sub['SalePrice'] = list(map(int, np.exp(y_pred)))
sub.to_csv('submission.csv', index=False)