In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
#!pip install ydata-profiling
#import ydata_profiling as pdp
import numpy as np
import pandas as pd
import os 
import pickle
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import lightgbm as lgb

import warnings 
warnings. filterwarnings('ignore')

In [3]:
# Load the competition data.
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Tag every row with its origin so the combined frame can be split again later.
df_train['Type'] = 'train'
df_test['Type'] = 'test'

# The test set has no SalePrice column; add it as a placeholder.
# Use NaN (float) rather than None: None produces an object-dtype column, which
# makes the concatenated target object-dtype too and exposes it to any later
# "fill all object columns" step.
df_test['SalePrice'] = np.nan

# Stack train and test so preprocessing can be applied to both at once.
# The original row labels are kept, so labels repeat across the two parts.
df_all = pd.concat([df_train, df_test], sort=False)

In [4]:
# 1) Fill missing LotFrontage values with the mean LotFrontage of the same Neighborhood.
#    The per-neighborhood mean is computed over the combined train + test frame.
LotFrontage_fill = df_all.groupby('Neighborhood')['LotFrontage'].transform('mean')

# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form operates on a temporary object and
# does not update df_all when pandas Copy-on-Write is active.
df_all['LotFrontage'] = df_all['LotFrontage'].fillna(LotFrontage_fill)



# Outlier handling (placeholder: no code follows in this cell)

In [5]:
# Numeric feature columns.
number_columns = ['LotFrontage','1stFlrSF','OverallQual','YearBuilt','YearRemodAdd',
                  'BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','2ndFlrSF','GrLivArea',
                  'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr',
                  'KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageArea','MoSold',
                  'YrSold','MSSubClass','LotArea','MasVnrArea','LowQualFinSF',
                  'OpenPorchSF','OverallCond','BsmtFinSF2','GarageYrBlt',
                  'GarageCars','WoodDeckSF','EnclosedPorch','3SsnPorch','ScreenPorch',
                  'PoolArea','MiscVal']

# Columns to be treated as categorical features.
# NOTE(review): the Kaggle dataset also has a 'BsmtQual' column that is not listed
# here; confirm whether the omission is intentional.
categorical_columns = [
    'MSZoning', 'LotShape', 'Neighborhood', 'BldgType', 'HouseStyle',
    'Exterior1st', 'HeatingQC', 'CentralAir', 'KitchenQual', 'SaleCondition',
    'GarageType', 'GarageFinish', 'Condition1', 'LandContour', 'PavedDrive',
    'Street', 'Alley', 'Utilities', 'LotConfig', 'LandSlope', 'Condition2',
    'RoofStyle', 'RoofMatl', 'Exterior2nd', 'MasVnrType', 'ExterQual',
    'ExterCond', 'Foundation', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'Heating', 'Electrical', 'Functional', 'FireplaceQu',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType'
]

# Fill missing values in the string (object-dtype) columns with the placeholder
# 'Unknown'. The bookkeeping column 'Type' and the target 'SalePrice' are
# excluded so the placeholder string is never written into the target (which is
# object-dtype when the test placeholder was created with None).
categorical_cols = df_all.select_dtypes(include=['object']).columns.drop(
    ['Type', 'SalePrice'], errors='ignore'
)
df_all[categorical_cols] = df_all[categorical_cols].fillna('Unknown')



# Split the combined frame back into train and test parts.
df_train_processed = df_all[df_all['Type'] == 'train'].drop(columns=['Type'])
df_test_processed = df_all[df_all['Type'] == 'test'].drop(columns=['Type', 'SalePrice'])


# Full feature list: numeric columns followed by categorical columns.
all_columns = number_columns + categorical_columns

# Build features, target and ids from the *processed* training frame so the
# imputation above is actually reflected in them (previously the raw df_train
# was used, which discarded the preprocessing).
x_train = df_train_processed[all_columns]
# The target may be object-dtype after the train/test concat; coerce it to numeric.
y_train = pd.to_numeric(df_train_processed['SalePrice'])
id_train = df_train_processed['Id']

# Print the shapes of the DataFrames/series
print(x_train.shape, y_train.shape, id_train.shape)

(1460, 78) (1460,) (1460,)


In [6]:
# Model training with K-fold cross-validation (CV)

# Hold out 20% of the training rows; the remaining 80% is used for the CV below.
# NOTE(review): features are read from the raw df_train, so the LotFrontage
# imputation and 'Unknown' fill applied to df_all are not used for training.
# Confirm intent (the test features must be switched at the same time).
x_train, x_valid, y_train, y_valid = train_test_split(df_train[all_columns],
                                                      df_train['SalePrice'],
                                                      test_size=0.2,
                                                      random_state=123)
# Work on explicit copies so the dtype conversion below is not a chained
# assignment on a slice of df_train.
x_train = x_train.copy()
x_valid = x_valid.copy()

# Convert categorical columns to pandas categoricals. The hold-out split reuses
# the training categories so both frames share one category -> code mapping;
# values unseen in training become missing.
for col in categorical_columns:
    x_train[col] = pd.Categorical(x_train[col])
    x_valid[col] = pd.Categorical(x_valid[col], categories=x_train[col].cat.categories)

# LightGBM datasets (not consumed by the scikit-learn style loop below).
train_data = lgb.Dataset(x_train, label=y_train, categorical_feature=categorical_columns, free_raw_data=False)
valid_data = lgb.Dataset(x_valid, label=y_valid, categorical_feature=categorical_columns, free_raw_data=False)

# LightGBM parameters.
params = {
    'boosting_type':'gbdt',
    'objective': 'regression',
    'metric':'rmse',
    'learning_rate':0.1,
    'num_leaves':16,
    'n_estimators':100000,  # upper bound only; early stopping picks the actual count
    'random_state':123,
    'importance_type':'gain',
}

metrics = []      # one [fold, train RMSE, validation RMSE] row per fold
imp_frames = []   # per-fold feature-importance frames, concatenated once after the loop
models = []       # fitted estimator of every fold

n_splits = 5
cv = list(KFold(n_splits=n_splits, shuffle=True, random_state=123).split(x_train, y_train))

for nfold, (idx_tr, idx_va) in enumerate(cv):
    print("-"*20, nfold, "-"*20)
    x_tr, y_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr]
    x_va, y_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va]
    print(x_tr.shape, y_tr.shape)
    print(x_va.shape, y_va.shape)
    print("y_train:{:.3f}, y_tr:{:.3f}, y_va:{:.3f}".format(
        y_train.mean(),
        y_tr.mean(),
        y_va.mean()
    ))

    # `model` stays bound to the most recent fold after the loop finishes.
    model = lgb.LGBMRegressor(**params)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_tr, y_tr), (x_va, y_va)],
              callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True)]
    )
    models.append(model)

    y_tr_pred = model.predict(x_tr)
    y_va_pred = model.predict(x_va)
    # RMSE = sqrt(MSE). The `squared=False` shortcut of mean_squared_error was
    # deprecated and later removed from scikit-learn, so take the root explicitly.
    rmse_tr = sqrt(mean_squared_error(y_tr, y_tr_pred))
    rmse_va = sqrt(mean_squared_error(y_va, y_va_pred))

    print("[RMSE] tr:{:.2f}, va:{:.2f}".format(rmse_tr, rmse_va))
    metrics.append([nfold, rmse_tr, rmse_va])

    imp_frames.append(
        pd.DataFrame({"col": x_train.columns, "imp": model.feature_importances_, "nfold": nfold})
    )

print("-"*20, 'result', "-"*20)
metrics = np.array(metrics)
print(metrics)

print("[cv] tr:{:.2f}+-{:.2f}, va:{:.2f}+-{:.2f}".format(
    metrics[:, 1].mean(), metrics[:, 1].std(),
    metrics[:, 2].mean(), metrics[:, 2].std(),
))

# Score the hold-out split (otherwise never evaluated) with the mean prediction
# of all fold models.
y_valid_pred = np.mean([fold_model.predict(x_valid) for fold_model in models], axis=0)
print("[holdout] rmse:{:.2f}".format(sqrt(mean_squared_error(y_valid, y_valid_pred))))

# Aggregate feature importance across folds: mean and standard deviation per column.
imp = pd.concat(imp_frames, axis=0, ignore_index=True)
imp = imp.groupby('col')['imp'].agg(['mean', 'std'])
imp.columns = ['imp', 'imp_std']
imp = imp.reset_index(drop=False)

print("Done.")


-------------------- 0 --------------------
(934, 78) (934,)
(234, 78) (234,)
y_train:180717.069, y_tr:182244.217, y_va:174621.530
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2908
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 73
[LightGBM] [Info] Start training from score 182244.217345
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2492]	training's rmse: 46.2239	valid_1's rmse: 23651.4
[RMSE] tr:46.22, va:23651.41
-------------------- 1 --------------------
(934, 78) (934,)
(234, 78) (234,)
y_train:180717.069, y_tr:179798.290, y_va:184384.333
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000919 seconds.
You can set `force_row_wise=true` to remove

In [7]:
# Build the test feature matrix with the same column list used for training.
# NOTE(review): this reads the raw df_test (not df_test_processed), mirroring the
# training cell, which reads the raw df_train. Confirm this is intended.
all_columns = number_columns + categorical_columns

# Cast the categorical feature columns of df_test to the pandas 'category' dtype (in place).
df_test[categorical_columns] = df_test[categorical_columns].astype('category')

x_test = df_test.loc[:, all_columns]
id_test = df_test.loc[:, ['Id']]

In [8]:
# Predict SalePrice for the test feature matrix.
# NOTE(review): `model` is the estimator left bound by the last iteration of the
# CV loop, so only that final-fold model is used here. Confirm this is intended
# rather than averaging the predictions of all fold models.
y_test_pred = model.predict(x_test)

In [9]:
# Assemble the submission table: the test Ids plus the predicted SalePrice.
df_submit = id_test[["Id"]].assign(SalePrice=y_test_pred)

# Preview the first five rows, then write the CSV without the index column.
display(df_submit.iloc[:5])
df_submit.to_csv('submission_baseline.csv', index=False)

Unnamed: 0,Id,SalePrice
0,1461,130157.908783
1,1462,158346.8507
2,1463,183518.219607
3,1464,186253.038543
4,1465,178993.136046


目的変数が連続値（金額）である場合、クラスの分布を考慮する必要はないため、`stratify=y_train` を削除した。  
ホールドアウト検証とクロスバリデーション検証（CV）は、どちらか一方でよい。両方実施する場合は、CV 用の変数（x_tr, y_tr, x_va, y_va）をホールドアウト用の変数とは別の名前にするよう注意！  
目的変数が連続値（金額）なので、CV には StratifiedKFold ではなく KFold を使用する。

目的変数が連続値（金額）であるため、`'objective': 'binary'` → **`'objective': 'regression'`**、`LGBMClassifier` → **`LGBMRegressor`** に変更した。  
今回のプロジェクトでは、評価指標（Metric）は RMSE に指定されている。


In [10]:
# Remove the pandas row-display limit so the full importance table is printed.
pd.set_option('display.max_rows', None)

# Print the features ordered from most to least important, with a fresh 0..n-1 index.
imp_ranked = imp.sort_values(by='imp', ascending=False).reset_index(drop=True)
print(imp_ranked)

              col           imp       imp_std
0     OverallQual  1.563981e+13  1.410518e+12
1       GrLivArea  3.001541e+12  3.960087e+11
2    Neighborhood  2.303072e+12  4.653068e+11
3     TotalBsmtSF  1.887670e+12  5.871634e+11
4      BsmtFinSF1  1.641097e+12  3.966003e+11
5      GarageCars  1.101739e+12  5.197079e+11
6        1stFlrSF  6.657198e+11  1.587354e+11
7         LotArea  4.854575e+11  1.132149e+11
8      GarageArea  4.186060e+11  1.079270e+11
9    TotRmsAbvGrd  3.021082e+11  1.980160e+11
10       FullBath  2.706434e+11  1.929864e+11
11      YearBuilt  2.429615e+11  8.078000e+10
12      ExterQual  2.113840e+11  2.379310e+11
13       2ndFlrSF  2.005095e+11  5.686590e+10
14    OverallCond  1.745885e+11  2.543394e+10
15   YearRemodAdd  1.574614e+11  6.035788e+10
16    KitchenQual  1.519501e+11  1.238520e+11
17    LotFrontage  1.189107e+11  4.146561e+10
18    GarageYrBlt  1.145893e+11  3.998261e+10
19    OpenPorchSF  1.115862e+11  4.751512e+10
20     Fireplaces  9.176034e+10  1

In [11]:
# Preview the first five rows of the processed training frame.
print(df_train_processed.head(n=5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street    Alley LotShape  \
0   1          60       RL         65.0     8450   Pave  Unknown      Reg   
1   2          20       RL         80.0     9600   Pave  Unknown      Reg   
2   3          60       RL         68.0    11250   Pave  Unknown      IR1   
3   4          70       RL         60.0     9550   Pave  Unknown      IR1   
4   5          60       RL         84.0    14260   Pave  Unknown      IR1   

  LandContour Utilities  ... PoolArea   PoolQC    Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
1         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
2         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
3         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
4         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2

In [12]:
# Preview the first five rows of the processed test frame.
print(df_test_processed.head(n=5))

     Id  MSSubClass MSZoning  LotFrontage  LotArea Street    Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave  Unknown      Reg   
1  1462          20       RL         81.0    14267   Pave  Unknown      IR1   
2  1463          60       RL         74.0    13830   Pave  Unknown      IR1   
3  1464          60       RL         78.0     9978   Pave  Unknown      IR1   
4  1465         120       RL         43.0     5005   Pave  Unknown      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea   PoolQC    Fence  \
0         Lvl    AllPub  ...         120        0  Unknown    MnPrv   
1         Lvl    AllPub  ...           0        0  Unknown  Unknown   
2         Lvl    AllPub  ...           0        0  Unknown    MnPrv   
3         Lvl    AllPub  ...           0        0  Unknown  Unknown   
4         HLS    AllPub  ...         144        0  Unknown  Unknown   

  MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  
0     Unknown       0      6    20