# 집 값 예측(핵심컬럼만 포함)

## 머신러닝 과정
- 문제정의
- 데이터 수집
- 데이터 전처리
- 탐색적 데이터분석(EDA)
- 모델 선택 및 하이퍼파라미터 조정
- 모델 학습
- 모델 평가
- 서비스화(Web)

### 2. 데이터 수집

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # 시각화 라이브러리
from sklearn.preprocessing import LabelEncoder #인코딩 도구

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
train.corr()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,1.0,0.002212,-0.017448,0.04034,0.014907,-0.01482,-0.010497,-0.008488,0.031044,0.031692,...,0.010164,0.012209,0.024373,0.013605,-0.003046,-0.047725,0.008472,0.002417,0.007702,0.029733
MSSubClass,0.002212,1.0,-0.440134,-0.195237,0.015992,-0.06948,0.030123,0.049555,0.006549,-0.08408,...,-0.022199,-0.017276,-0.023892,-0.040717,-0.042211,-0.013187,-0.043022,0.013917,-0.004983,-0.102403
LotFrontage,-0.017448,-0.440134,1.0,0.458013,0.203403,-0.057332,0.11508,0.087948,0.211236,0.230561,...,0.104918,0.152665,-0.006108,0.048278,0.066449,0.228577,0.015184,-0.001141,0.015999,0.358583
LotArea,0.04034,-0.195237,0.458013,1.0,0.098219,-0.04286,0.025845,0.011989,0.098021,0.210914,...,0.17963,0.098412,0.00083,0.031692,0.050773,0.114308,0.028732,0.012756,-0.012336,0.264751
OverallQual,0.014907,0.015992,0.203403,0.098219,1.0,-0.039341,0.569129,0.570151,0.425955,0.264792,...,0.231595,0.295776,-0.137066,0.031865,0.037596,0.028149,-0.012887,0.029502,-0.003876,0.803108
OverallCond,-0.01482,-0.06948,-0.057332,-0.04286,-0.039341,1.0,-0.325595,0.070395,-0.143961,-0.018594,...,0.027304,-0.077097,0.088624,0.016719,0.048653,-0.010603,0.065487,-0.006393,0.02292,-0.07558
YearBuilt,-0.010497,0.030123,0.11508,0.025845,0.569129,-0.325595,1.0,0.620058,0.316396,0.271104,...,0.220332,0.186058,-0.409039,0.02993,-0.031096,0.014258,-0.024138,-0.00206,0.012426,0.551999
YearRemodAdd,-0.008488,0.049555,0.087948,0.011989,0.570151,0.070395,0.620058,1.0,0.186949,0.145362,...,0.215457,0.234414,-0.228317,0.04689,-0.03017,-0.013916,-0.010245,0.007126,0.053171,0.539269
MasVnrArea,0.031044,0.006549,0.211236,0.098021,0.425955,-0.143961,0.316396,0.186949,1.0,0.28615,...,0.133993,0.128772,-0.108823,0.027967,0.039936,0.011835,-0.017293,0.005973,-0.026106,0.50397
BsmtFinSF1,0.031692,-0.08408,0.230561,0.210914,0.264792,-0.018594,0.271104,0.145362,0.28615,1.0,...,0.234766,0.125901,-0.118296,0.076501,0.078664,0.142446,0.022431,0.003086,0.042905,0.420841


## 3.데이터 전처리

### 수치형 데이터와 범주형 데이터 나누기

In [4]:
numerical_feats = train.dtypes[train.dtypes != "object"].index 
print("Number of Numerical features: ", len(numerical_feats)) 
categorical_feats = train.dtypes[train.dtypes == "object"].index 
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  38
Number of Categorical features:  43


In [5]:
print(train[numerical_feats].columns) # 수치형
print("*"*80) 
print(train[categorical_feats].columns) # 범주형

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')
********************************************************************************
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual'

##### 이상치 탐색 및 제거
- IQR을 이용한 함수를 지정하여 탐색기준 잡기
- train데이터의 이상치 탐색

In [6]:
from collections import Counter 

In [7]:
def detect_outliers(df, n, features): 
    outlier_indices = [] 
    for col in features: 
        Q1 = np.percentile(df[col], 25) 
        Q3 = np.percentile(df[col], 75) 
        IQR = Q3 - Q1 
        outlier_step = 1.5 * IQR 
        
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index 
        outlier_indices.extend(outlier_list_col) 
    outlier_indices = Counter(outlier_indices) 
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n) 
    
    return multiple_outliers 
Outliers_to_drop = detect_outliers(train, 2, ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 
                                                 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 
                                                 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
                                                 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 
                                                 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
                                                 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
                                                 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 
                                                 'YrSold'])


In [8]:
train.loc[Outliers_to_drop]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
28,29,190,RM,60.0,10800,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Normal,163000.000000
86,87,190,RM,60.0,10320,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,5,2008,WD,Normal,72394.083792
193,194,190,RM,85.0,13600,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,9,2007,WD,Normal,90000.000000
196,197,190,RM,60.0,9600,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Normal,136072.916252
325,326,190,RH,,7082,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,160000.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,28,20,RL,76.0,11355,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Othr,6500,4,2008,WD,Normal,187664.806055
617,618,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000.000000
1051,1052,20,RL,80.0,12048,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Normal,151225.190506
1130,1131,20,RL,72.0,10152,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2006,WD,Normal,166444.087792


In [9]:
# 85개의 행에서 이상치 발견. 이상치가 있는 행 삭제
train = train.drop(Outliers_to_drop, axis= 0).reset_index(drop = True)
train.shape

(1666, 81)

### 핵심컬럼 외 드랍시키기

In [10]:
train['GarageArea'].fillna(0, inplace=True)
train['GarageQual'].fillna('NA', inplace=True)
train['GarageCond'].fillna('NA', inplace=True)
train['Fence'].fillna('NA', inplace=True)
train['MiscFeature'].fillna('NA', inplace=True)
train['BsmtQual'].fillna('NA', inplace=True)
train['BsmtCond'].fillna('NA', inplace=True)
train['BsmtExposure'].fillna('NA', inplace=True)
train['BsmtFinType1'].fillna('NA', inplace=True)
train['BsmtFinSF1'].fillna(0, inplace=True) # 평균값 사용
train['BsmtFinType2'].fillna('NA', inplace=True)
train['BsmtFinSF2'].fillna(0, inplace=True) # 수치형 - 연속형데이터 :mean 45.475429
train['BsmtUnfSF'].fillna(0, inplace=True) # 중간값(50%)사용, 461.000000
train['TotalBsmtSF'].fillna(0, inplace=True) # 중간값(50%)사용, 980.000000
train['Electrical'].fillna('SBrkr', inplace=True)  # 최빈값(top)
train['BsmtFullBath'].fillna(0, inplace=True) # mean
train['BsmtHalfBath'].fillna(0, inplace=True) #mean
train['Functional'].fillna('Typ', inplace=True)
train['FireplaceQu'].fillna('NA', inplace=True)
train['GarageType'].fillna('NA', inplace=True)
train['GarageYrBlt'].fillna(0, inplace=True)
train['GarageFinish'].fillna('NA', inplace=True)
train['GarageCars'].fillna(0, inplace=True)
train['MSZoning'].fillna('RL', inplace=True)
train['LotFrontage'].fillna(0, inplace=True)
train['Alley'].fillna('NA', inplace=True)
train['PoolQC'].fillna('NA', inplace=True)
train['Utilities'].fillna('NA', inplace=True)
train['MasVnrType'].fillna('NA', inplace=True)
train['MasVnrArea'].fillna(0, inplace=True)

In [11]:
test['GarageArea'].fillna(0, inplace=True)
test['GarageQual'].fillna('NA', inplace=True)
test['GarageCond'].fillna('NA', inplace=True)
test['Fence'].fillna('NA', inplace=True)
test['MiscFeature'].fillna('NA', inplace=True)
test['BsmtQual'].fillna('NA', inplace=True)
test['BsmtCond'].fillna('NA', inplace=True)
test['BsmtExposure'].fillna('NA', inplace=True)
test['BsmtFinType1'].fillna('NA', inplace=True)
test['BsmtFinSF1'].fillna(0, inplace=True) # 평균값 사용
test['BsmtFinType2'].fillna('NA', inplace=True)
test['BsmtFinSF2'].fillna(0, inplace=True) # 수치형 - 연속형데이터 :mean 45.475429
test['BsmtUnfSF'].fillna(0, inplace=True) # 중간값(50%)사용, 461.000000
test['TotalBsmtSF'].fillna(0, inplace=True) # 중간값(50%)사용, 980.000000
test['Electrical'].fillna('SBrkr', inplace=True)  # 최빈값(top)
test['BsmtFullBath'].fillna(0, inplace=True) # mean
test['BsmtHalfBath'].fillna(0, inplace=True) #mean
test['Functional'].fillna('Typ', inplace=True)
test['FireplaceQu'].fillna('NA', inplace=True)
test['GarageType'].fillna('NA', inplace=True)
test['GarageYrBlt'].fillna(0, inplace=True)
test['GarageFinish'].fillna('NA', inplace=True)
test['GarageCars'].fillna(0, inplace=True)
test['MSZoning'].fillna('RL', inplace=True)
test['LotFrontage'].fillna(0, inplace=True)
test['Alley'].fillna('NA', inplace=True)
test['PoolQC'].fillna('NA', inplace=True)
test['Utilities'].fillna('NA', inplace=True)
test['MasVnrType'].fillna('NA', inplace=True)
test['MasVnrArea'].fillna(0, inplace=True)
test['KitchenQual'].fillna('TA', inplace=True) # 최빈값 사용
test['SaleType'].fillna('WD', inplace=True) # 최빈값 사용
test['Exterior1st'].fillna('VinylSd', inplace=True)
test['Exterior2nd'].fillna('VinylSd', inplace=True)

In [12]:
train.drop([
       'PoolQC',
       'LowQualFinSF'
       ], axis=1, inplace=True)

In [13]:
test.drop([
        'PoolQC',
       'LowQualFinSF'], axis=1, inplace=True)

In [14]:
print(train.shape)
print(test.shape)

(1666, 79)
(1168, 78)


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1666 entries, 0 to 1665
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1666 non-null   int64  
 1   MSSubClass     1666 non-null   int64  
 2   MSZoning       1666 non-null   object 
 3   LotFrontage    1666 non-null   float64
 4   LotArea        1666 non-null   int64  
 5   Street         1666 non-null   object 
 6   Alley          1666 non-null   object 
 7   LotShape       1666 non-null   object 
 8   LandContour    1666 non-null   object 
 9   Utilities      1666 non-null   object 
 10  LotConfig      1666 non-null   object 
 11  LandSlope      1666 non-null   object 
 12  Neighborhood   1666 non-null   object 
 13  Condition1     1666 non-null   object 
 14  Condition2     1666 non-null   object 
 15  BldgType       1666 non-null   object 
 16  HouseStyle     1666 non-null   object 
 17  OverallQual    1666 non-null   int64  
 18  OverallC

In [16]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 78 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1168 non-null   int64  
 1   MSSubClass     1168 non-null   int64  
 2   MSZoning       1168 non-null   object 
 3   LotFrontage    1168 non-null   float64
 4   LotArea        1168 non-null   int64  
 5   Street         1168 non-null   object 
 6   Alley          1168 non-null   object 
 7   LotShape       1168 non-null   object 
 8   LandContour    1168 non-null   object 
 9   Utilities      1168 non-null   object 
 10  LotConfig      1168 non-null   object 
 11  LandSlope      1168 non-null   object 
 12  Neighborhood   1168 non-null   object 
 13  Condition1     1168 non-null   object 
 14  Condition2     1168 non-null   object 
 15  BldgType       1168 non-null   object 
 16  HouseStyle     1168 non-null   object 
 17  OverallQual    1168 non-null   int64  
 18  OverallC

### 범주형 특성공학

#### 'BsmtExposure', 순서0' 'GarageQual',순서0 'PoolQC', 순서0


In [17]:
# Alley
convert_title_dic = {
   'NA':1, 
    'Pave':2, 
    'Grvl':0
}

In [18]:
train['Alley'] = train['Alley'].map(convert_title_dic)
test['Alley'] = test['Alley'].map(convert_title_dic)

In [19]:
# 'Street'
convert_title_dic = {
    'Pave':1, 
    'Grvl':0

}

In [20]:
train['Street'] = train['Street'].map(convert_title_dic)
test['Street'] = test['Street'].map(convert_title_dic)

In [21]:
# 'Utilities'
convert_title_dic = {
    'AllPub':1, 
     'NA':0,
    'NoSeWa':0

}

In [22]:
train['Utilities'] = train['Utilities'].map(convert_title_dic)
test['Utilities'] = test['Utilities'].map(convert_title_dic)

In [23]:
# 'LotConfig'
convert_title_dic = {
    'Corner':1, 
    'Inside':1, 
    'CulDSac':3, 
    'FR2':2, 
    'FR3':0

}

In [24]:
train['LotConfig'] = train['LotConfig'].map(convert_title_dic)
test['LotConfig'] = test['LotConfig'].map(convert_title_dic)

In [25]:
# 'Condition2'
convert_title_dic = {
   'Norm':1, 
    'PosA':2, 
    'RRNn':0, 
    'Feedr':0,  
    'Artery':1, 
    'PosN':2,
    'RRAe':1,
    'RRAn':1

}

In [26]:
train['Condition2'] = train['Condition2'].map(convert_title_dic)
test['Condition2'] = test['Condition2'].map(convert_title_dic)

In [27]:
# 'RoofMatl'
convert_title_dic = {
   'CompShg':1, 
    'Tar&Grv':1, 
    'WdShake':2, 
    'WdShngl':2, 
    'Roll':1, 
    'Metal':2,
    'Membran':1

}

In [28]:
train['RoofMatl'] = train['RoofMatl'].map(convert_title_dic)
test['RoofMatl'] = test['RoofMatl'].map(convert_title_dic)

In [29]:
# 'ExterCond'
convert_title_dic = {
   'TA':2, 
    'Gd':1, 
    'Fa':0, 
    'Ex':2, 
    'Po':0
}

In [30]:
train['ExterCond'] = train['ExterCond'].map(convert_title_dic)
test['ExterCond'] = test['ExterCond'].map(convert_title_dic)

In [31]:
# 'GarageFinish'
convert_title_dic = {
   'Unf':0, 
    'RFn':1, 
    'Fin':1, 
    'NA':0

}

In [32]:
train['GarageFinish'] = train['GarageFinish'].map(convert_title_dic)
test['GarageFinish'] = test['GarageFinish'].map(convert_title_dic)

In [33]:
# 'Fence'
convert_title_dic = {
   'NA':1, 
    'GdPrv':0, 
    'MnPrv':0, 
    'GdWo':0, 
    'MnWw':0

}

In [34]:
train['Fence'] = train['Fence'].map(convert_title_dic)
test['Fence'] = test['Fence'].map(convert_title_dic)

In [35]:
# 'MiscFeature''
convert_title_dic = {
   'NA':1, 
    'Gar2':0, 
    'Shed':0, 
    'Othr':0,
    'TenC':0

}

In [36]:
train['MiscFeature'] = train['MiscFeature'].map(convert_title_dic)
test['MiscFeature'] = test['MiscFeature'].map(convert_title_dic)

In [37]:
# 'SaleCondition'
convert_title_dic = {
   'Normal':2, 
    'Partial':3, 
    'Abnorml':1, 
    'Family':1, 
    'Alloca':1, 
    'AdjLand':0

}

In [38]:
train['SaleCondition'] = train['SaleCondition'].map(convert_title_dic)
test['SaleCondition'] = test['SaleCondition'].map(convert_title_dic)

In [39]:
#BsmtExposure
convert_title_dic = {
    'No':'a',
    'NA':'a',
    'Mn':'b',
    'Av':'b',
    'Gd':'c'

}

In [40]:
train['BsmtExposure'] = train['BsmtExposure'].map(convert_title_dic)
test['BsmtExposure'] = test['BsmtExposure'].map(convert_title_dic)

In [41]:
#BsmtFinType1 순서0
convert_title_dic = {
    'TA':'e',
    'Fa':'b',
    'Gd':'d',
    'NA':'b',
    'Ex':'c',
    'Po':'a'

}

In [42]:
train['GarageQual'] = train['GarageQual'].map(convert_title_dic)
test['GarageQual'] = test['GarageQual'].map(convert_title_dic)

In [43]:
# #PoolQC
# convert_title_dic = {
   
#     'Gd':'d',
#     'NA':'b',
#     'Ex':'c',
 

# }

In [44]:
# train['PoolQC'] = train['PoolQC'].map(convert_title_dic)
# test['PoolQC'] = test['PoolQC'].map(convert_title_dic)

In [45]:
# Neighborhood
convert_title_dic = {
   'Sawyer':'a', 'NAmes':'a', 'Mitchel':'b', 'BrkSide':'a', 'IDOTRR':'a', 'Blueste':'a', 'MeadowV':'a', 'SWISU':'a', 'BrDale':'a',  'NPkVill':'a',
    'Crawfor':'b', 'CollgCr':'b', 'Somerst':'b', 'Edwards':'a', 'OldTown':'a',  'NWAmes':'b', 'Gilbert':'b', 'ClearCr':'b', 'SawyerW':'b',  'Blmngtn':'b',
    'StoneBr':'c',  'NridgHt':'c', 'NoRidge':'c', 'Veenker':'c',  'Timber':'c'
}

In [46]:
train['Neighborhood'] = train['Neighborhood'].map(convert_title_dic)

In [47]:
test['Neighborhood'] = test['Neighborhood'].map(convert_title_dic)

In [48]:
#HouseStyle
convert_title_dic = {
   '1Story':'1F', '2Story':'2F', 'SLvl':'others', '1.5Fin':'others', '1.5Unf':'1.5F', '2.5Unf':'others', 'SFoyer':'1.5F',
    '2.5Fin':'others'
}

In [49]:
train['HouseStyle'] = train['HouseStyle'].map(convert_title_dic)

In [50]:
test['HouseStyle'] = test['HouseStyle'].map(convert_title_dic)

In [51]:
#ExterQual
convert_title_dic = {
  'Gd':'c', 'TA':'b', 'Fa':'a', 'Ex':'d'
}

In [52]:
train['ExterQual'] = train['ExterQual'].map(convert_title_dic)

In [53]:
test['ExterQual'] = test['ExterQual'].map(convert_title_dic)

In [54]:
#Foundation
convert_title_dic = {
  'CBlock':'CBlock', 'PConc':'Poured Contrete', 'BrkTil':'other', 'Slab':'other', 'Stone':'other', 'Wood':'other'
}

In [55]:
train['Foundation'] = train['Foundation'].map(convert_title_dic)

In [56]:
test['Foundation'] = test['Foundation'].map(convert_title_dic)

In [57]:
#BsmtQual
convert_title_dic = {
  'Gd':'c', 'TA':'b', 'Ex':'d', 'NA':'a', 'Fa':'a'
}

In [58]:
train['BsmtQual'] = train['BsmtQual'].map(convert_title_dic)

In [59]:
test['BsmtQual'] = test['BsmtQual'].map(convert_title_dic)

In [60]:
#BsmtCond
convert_title_dic = {
  'TA':'b', 'Fa':'a', 'NA':'a', 'Gd':'b', 'Po':'a'
}

In [61]:
train['BsmtCond'] = train['BsmtCond'].map(convert_title_dic)

In [62]:
test['BsmtCond'] = test['BsmtCond'].map(convert_title_dic)

In [63]:
#BsmtFinType2
convert_title_dic = {
  'Unf':'Unf', 'LwQ':'LwQ', 'BLQ':'other', 'NA':'NA', 'Rec':'other', 'ALQ':'other', 'GLQ':'other'
}

In [64]:
train['BsmtFinType2'] = train['BsmtFinType2'].map(convert_title_dic)

In [65]:
test['BsmtFinType2'] = test['BsmtFinType2'].map(convert_title_dic)

In [66]:
#Heating
convert_title_dic = {
  'GasA':'Gas', 'GasW':'Gas', 'Wall':'other', 'Grav':'other', 'OthW':'other', 'Floor':'other'
}

In [67]:
train['Heating'] = train['Heating'].map(convert_title_dic)

In [68]:
test['Heating'] = test['Heating'].map(convert_title_dic)

In [69]:
#HeatingQC
convert_title_dic = {
  'Gd':'c', 'TA':'b', 'Ex':'d', 'Fa':'a','Po':'a'
}

In [70]:
train['HeatingQC'] = train['HeatingQC'].map(convert_title_dic)

In [71]:
test['HeatingQC'] = test['HeatingQC'].map(convert_title_dic)

In [72]:
#Electrical
convert_title_dic = {
  'SBrkr':'SBrkr', 'FuseA':'other', 'FuseF':'other', 'FuseP':'other', 'Mix':'other'
}

In [73]:
train['Electrical'] = train['Electrical'].map(convert_title_dic)

In [74]:
test['Electrical'] = test['Electrical'].map(convert_title_dic)

In [75]:
#KitchenQual
convert_title_dic = {
 'Gd':'c', 'TA':'b', 'Fa':'a', 'Ex':'d'
}

In [76]:
train['KitchenQual'] = train['KitchenQual'].map(convert_title_dic)

In [77]:
test['KitchenQual'] = test['KitchenQual'].map(convert_title_dic)

In [78]:
#Functional
convert_title_dic = {
  'Typ':'Typ', 'Min1':'other', 'Min2':'other', 'Maj1':'other', 'Mod':'other', 'Maj2':'other', 'Sev':'other'
}

In [79]:
train['Functional'] = train['Functional'].map(convert_title_dic)

In [80]:
test['Functional'] = test['Functional'].map(convert_title_dic)

In [81]:
#FireplaceQu
convert_title_dic = {
  'Gd':'d', 'NA':'b', 'TA':'c', 'Fa':'a', 'Po':'a', 'Ex':'e'
}

In [82]:
train['FireplaceQu'] = train['FireplaceQu'].map(convert_title_dic)

In [83]:
test['FireplaceQu'] = test['FireplaceQu'].map(convert_title_dic)

In [84]:
#GarageType
convert_title_dic = {
  'Attchd':'attbui', 'Detchd':'other', 'BuiltIn':'BuiltIn', 'NA':'other', 'Basment':'other', 'CarPort':'other', '2Types':'other'
}

In [85]:
train['GarageType'] = train['GarageType'].map(convert_title_dic)

In [86]:
test['GarageType'] = test['GarageType'].map(convert_title_dic)

In [87]:
#GarageCond
convert_title_dic = {
  'TA':'c', 'Fa':'a', 'NA':'a', 'Gd':'b', 'Ex':'a', 'Po':'a'
}

In [88]:
train['GarageCond'] = train['GarageCond'].map(convert_title_dic)

In [89]:
test['GarageCond'] = test['GarageCond'].map(convert_title_dic)

In [90]:
#PavedDrive
convert_title_dic = {
  'Y':'Y', 'P':'other', 'N':'other'
}

In [91]:
train['PavedDrive'] = train['PavedDrive'].map(convert_title_dic)

In [92]:
test['PavedDrive'] = test['PavedDrive'].map(convert_title_dic)

In [93]:
#SaleType
convert_title_dic = {
  'WD':'WD', 'New':'New', 'CWD':'CWD', 'Oth':'other', 'COD':'other', 'Con':'other', 'ConLI':'ConLI', 'ConLD':'other', 'ConLw':'other'
}

In [94]:
train['SaleType'] = train['SaleType'].map(convert_title_dic)

In [95]:
test['SaleType'] = test['SaleType'].map(convert_title_dic)

### 인코딩

In [96]:
# 순서O 데이터(라벨): Neighborhood,ExterQual,BsmtQual,BsmtCond,HeatingQC,KitchenQual,FireplaceQu,GarageCond
# 순서X 데이터(원핫): SaleType,PavedDrive,GarageType,Functional,Electrical,CentralAir,Heating,BsmtFinType2,Foundation,HouseStyle

#### 라벨인코딩

In [97]:
# train과 test합치기
# 정답 컬럼 따로 빼놓고 드랍
y_train = train['SalePrice']
train.drop('SalePrice',axis=1,inplace=True)

In [98]:
print(train.shape)
print(test.shape)

(1666, 78)
(1168, 78)


In [99]:
# train과 test합치기
combined = pd.concat([train,test],ignore_index=True) #세로로 붙이는 경우 axis=0(기본값)
combined

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,2,20,RL,0.0,10530,1,1,IR1,Lvl,1,...,0,0,0,1,1,0,3,2007,WD,2
1,3,20,RL,88.0,12803,1,1,IR1,Lvl,1,...,0,0,0,1,1,0,9,2008,WD,2
2,4,60,FV,114.0,8314,1,2,IR1,Lvl,1,...,0,110,0,0,1,0,11,2006,WD,2
3,5,20,RL,78.0,10335,1,1,IR1,Lvl,1,...,0,0,0,0,1,0,7,2006,WD,2
4,6,90,RL,55.0,12640,1,1,IR1,Lvl,1,...,0,0,0,1,1,0,7,2006,WD,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,2915,20,RL,70.0,8402,1,1,Reg,Lvl,1,...,0,0,0,1,1,0,12,2007,New,3
2830,2916,50,RM,50.0,9140,1,1,Reg,HLS,1,...,0,200,0,0,1,0,4,2010,other,2
2831,2917,20,RL,0.0,8050,1,1,IR1,Lvl,1,...,0,0,0,0,1,0,5,2010,WD,2
2832,2918,60,RL,0.0,8637,1,1,IR1,Lvl,1,...,0,0,0,1,1,0,4,2007,WD,1


In [100]:
#라벨인코딩
#객체 생성
BsmtExposure_encoder = LabelEncoder()
GarageQual_encoder = LabelEncoder()
# PoolQC_encoder = LabelEncoder()

Neighborhood_encoder = LabelEncoder()
ExterQual_encoder = LabelEncoder()
BsmtQual_encoder = LabelEncoder()
BsmtCond_encoder = LabelEncoder()
HeatingQC_encoder = LabelEncoder()
KitchenQual_encoder = LabelEncoder()
FireplaceQu_encoder = LabelEncoder()
GarageCond_encoder = LabelEncoder()

In [101]:
#인코더에 컬럼 대입
BsmtExposure_encoder.fit(combined['BsmtExposure'])
GarageQual_encoder.fit(combined['GarageQual'])
# PoolQC_encoder.fit(combined['PoolQC'])
Neighborhood_encoder.fit(combined['Neighborhood'])
ExterQual_encoder.fit(combined['ExterQual'])
BsmtQual_encoder.fit(combined['BsmtQual'])
BsmtCond_encoder.fit(combined['BsmtCond'])
HeatingQC_encoder.fit(combined['HeatingQC'])
KitchenQual_encoder.fit(combined['KitchenQual'])
FireplaceQu_encoder.fit(combined['FireplaceQu'])
GarageCond_encoder.fit(combined['GarageCond'])

LabelEncoder()

In [102]:
# 데이터 숫자로 변환 후 변수에 저장
combined['encoded_BsmtExposure'] =  BsmtExposure_encoder.transform(combined['BsmtExposure'])
combined['encoded_GarageQual'] =  GarageQual_encoder.transform(combined['GarageQual'])
combined['encoded_Neighborhood'] =  Neighborhood_encoder.transform(combined['Neighborhood'])
combined['encoded_ExterQual'] =  ExterQual_encoder.transform(combined['ExterQual'])
combined['encoded_BsmtQual'] =  BsmtQual_encoder.transform(combined['BsmtQual'])
combined['encoded_BsmtCond'] =  BsmtCond_encoder.transform(combined['BsmtCond'])
combined['encoded_HeatingQC'] =  HeatingQC_encoder.transform(combined['HeatingQC'])
combined['encoded_KitchenQual'] =  KitchenQual_encoder.transform(combined['KitchenQual'])
combined['encoded_FireplaceQu'] =  FireplaceQu_encoder.transform(combined['FireplaceQu'])
combined['encoded_GarageCond'] =  GarageCond_encoder.transform(combined['GarageCond'])

In [103]:
#원래 컬럼 drop
combined.drop('BsmtExposure',axis=1,inplace=True)
combined.drop('GarageQual',axis=1,inplace=True)

combined.drop('Neighborhood',axis=1,inplace=True)
combined.drop('ExterQual',axis=1,inplace=True)
combined.drop('BsmtQual',axis=1,inplace=True)
combined.drop('BsmtCond',axis=1,inplace=True)
combined.drop('HeatingQC',axis=1,inplace=True)
combined.drop('KitchenQual',axis=1,inplace=True)
combined.drop('FireplaceQu',axis=1,inplace=True)
combined.drop('GarageCond',axis=1,inplace=True)

In [104]:
combined.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,encoded_BsmtExposure,encoded_GarageQual,encoded_Neighborhood,encoded_ExterQual,encoded_BsmtQual,encoded_BsmtCond,encoded_HeatingQC,encoded_KitchenQual,encoded_FireplaceQu,encoded_GarageCond
0,2,20,RL,0.0,10530,1,1,IR1,Lvl,1,...,0,4,0,1,1,1,1,1,1,2
1,3,20,RL,88.0,12803,1,1,IR1,Lvl,1,...,1,4,1,2,2,1,3,2,2,2
2,4,60,FV,114.0,8314,1,2,IR1,Lvl,1,...,0,4,1,2,2,1,3,2,2,2
3,5,20,RL,78.0,10335,1,1,IR1,Lvl,1,...,0,4,0,1,1,1,2,1,2,2
4,6,90,RL,55.0,12640,1,1,IR1,Lvl,1,...,2,4,1,1,1,1,1,1,1,2


#### 원핫인코딩

In [105]:
#원핫인코딩
categorical_feature = ['BsmtFinType1','MSZoning','LandContour','LotShape','LandSlope','Condition1', 'BldgType','RoofStyle','Exterior1st' ,'Exterior2nd','MasVnrType',   'SaleType','PavedDrive','GarageType','Functional','Electrical','CentralAir','Heating','BsmtFinType2','Foundation','HouseStyle']

In [106]:
# train하고 test 병합된 것에서 문자로 구성된 열만 지정한 categorical_feaure
# 원핫인코딩: pd.get_dummies() 사용
one_hot = pd.get_dummies(combined[categorical_feature])
one_hot

Unnamed: 0,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NA,BsmtFinType1_Rec,BsmtFinType1_Unf,MSZoning_C (all),MSZoning_FV,MSZoning_RH,...,BsmtFinType2_NA,BsmtFinType2_Unf,BsmtFinType2_other,Foundation_CBlock,Foundation_Poured Contrete,Foundation_other,HouseStyle_1.5F,HouseStyle_1F,HouseStyle_2F,HouseStyle_others
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,0,0,0,0,0,0,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2830,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
2831,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,1,0,0
2832,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [107]:
# 원핫인코딩을 진행한 문자열 데이터 삭제
combined.drop(categorical_feature,axis=1,inplace=True)

In [108]:
# 원핫인코딩한 숫자열 데이터를 추가
combined = pd.concat([combined,one_hot], axis=1) # 가로로 붙이는 경우 axis=1
combined

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,Utilities,LotConfig,Condition2,OverallQual,...,BsmtFinType2_NA,BsmtFinType2_Unf,BsmtFinType2_other,Foundation_CBlock,Foundation_Poured Contrete,Foundation_other,HouseStyle_1.5F,HouseStyle_1F,HouseStyle_2F,HouseStyle_others
0,2,20,0.0,10530,1,1,1,1,1,6,...,0,0,0,1,0,0,0,1,0,0
1,3,20,88.0,12803,1,1,1,1,1,7,...,0,1,0,0,1,0,0,1,0,0
2,4,60,114.0,8314,1,2,1,1,1,7,...,0,1,0,0,1,0,0,0,1,0
3,5,20,78.0,10335,1,1,1,1,1,5,...,0,1,0,1,0,0,0,1,0,0
4,6,90,55.0,12640,1,1,1,1,1,6,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,2915,20,70.0,8402,1,1,1,1,1,5,...,0,1,0,0,1,0,0,1,0,0
2830,2916,50,50.0,9140,1,1,1,1,1,6,...,0,1,0,0,0,1,0,0,0,1
2831,2917,20,0.0,8050,1,1,1,1,1,5,...,0,0,1,1,0,0,0,1,0,0
2832,2918,60,0.0,8637,1,1,1,1,1,6,...,0,1,0,0,1,0,0,0,1,0


In [109]:
print(train.shape)
print(test.shape)

(1666, 78)
(1168, 78)


In [110]:
train.isnull().values.any()

False

In [111]:
test.isnull().values.any()

False

In [112]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1666 entries, 0 to 1665
Data columns (total 78 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1666 non-null   int64  
 1   MSSubClass     1666 non-null   int64  
 2   MSZoning       1666 non-null   object 
 3   LotFrontage    1666 non-null   float64
 4   LotArea        1666 non-null   int64  
 5   Street         1666 non-null   int64  
 6   Alley          1666 non-null   int64  
 7   LotShape       1666 non-null   object 
 8   LandContour    1666 non-null   object 
 9   Utilities      1666 non-null   int64  
 10  LotConfig      1666 non-null   int64  
 11  LandSlope      1666 non-null   object 
 12  Neighborhood   1666 non-null   object 
 13  Condition1     1666 non-null   object 
 14  Condition2     1666 non-null   int64  
 15  BldgType       1666 non-null   object 
 16  HouseStyle     1666 non-null   object 
 17  OverallQual    1666 non-null   int64  
 18  OverallC

In [113]:
# 다시 train과 test 데이터 나눠주기
X_train = combined.iloc[:1666]
X_test = combined.iloc[1666:]

In [114]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape) #train 정답 컬럼

(1666, 165)
(1168, 165)
(1666,)


## 5. 모델링

In [532]:
# 컬럼 정리
# X_train = train
# X_test = test
# y_train = train['SalePrice']
# X_train.drop('SalePrice', axis=1, inplace=True)

In [533]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape) #train 정답 컬럼

(1666, 165)
(1168, 165)
(1666,)


### KNN모델

In [534]:
#KNN 모델
from sklearn.model_selection import cross_val_score # 교차검증: K-fold cross-validation 방법 활용
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler #스케일링

In [535]:
mm_scaler = MinMaxScaler()

In [536]:
# X_train 스케일링.
mm_scaler.fit(X_train)

MinMaxScaler()

In [537]:
# 0~1로 바뀐 train을 조회해보면...
X_train_transformed = mm_scaler.transform(X_train)
X_train_transformed

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.71755289e-04, 0.00000000e+00, 2.81150160e-01, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.14351058e-03, 2.35294118e-01, 3.64217252e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [9.98856489e-01, 0.00000000e+00, 1.98083067e-01, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.99428245e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 3.52941176e-01, 2.36421725e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [None]:
# 교차검증 실시
for k in range(1,50,2): #복잡 -> 단순
    k_model = KNeighborsRegressor(n_neighbors=k)
    result = cross_val_score(k_model,X_train_transformed,y_train,cv=5) # cv는 데이터의 크기에 따라 조절. 데이터가 많으면 크게. (보통 3~6조각)
    print('k:',k,'score:',result.mean() )

In [None]:
#학습
knn_model = KNeighborsRegressor(n_neighbors=9)
knn_model.fit(X_train_transformed,y_train)

In [None]:
#KNN모델 결과 예측, 출력
X_test_transformed = mm_scaler.transform(X_test) # test 스케일링
knn_pre = knn_model.predict(X_test_transformed)
knn_pre

In [None]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = knn_pre
gender_sub.to_csv('1_KNNonly.csv', index=False)

### 3.Tree모델

In [None]:
#DecisionTree 모델
from sklearn.tree import DecisionTreeRegressor #decisionTree

In [None]:
for md in range(1,50): # 트리 모델 단순 -> 복잡으로 바꾸며 점수 확인
    t_model = DecisionTreeRegressor(max_depth=md)
    t_model.fit(X_train,y_train)
    
    #교차검증
    #train data를 5조각으로 w쪼개어 점수를 내준다.
    result = cross_val_score(t_model,X_train,y_train,cv=5)
    print('max_depth:',md,'score :',result.mean()) # md=1 부터 5부분을 각각 5회 실시했을 때 평균값 출력

In [None]:
#학습
final_tree = DecisionTreeRegressor(max_depth=7)
final_tree.fit(X_train,y_train)

In [None]:
final_pre = final_tree.predict(X_test)

In [None]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = final_pre # 내 모델의 예측결과 양식에 넣기
gender_sub.to_csv('1_Treeonly.csv', index=False) # csv파일로 뽑아내기

### 선형모델

In [None]:
#선형모델
from sklearn.linear_model import LinearRegression #선형모델 임포트

In [None]:
#선형모델 객체 생성 및 학습
linear_model = LinearRegression()
linear_model.fit(X_train,y_train)

In [None]:
result = cross_val_score(linear_model,X_train,y_train,cv=5)
result.mean()

#### 선형모델 특성확장

In [None]:
#선형모델 가중치 확인
linear_model.coef_

In [None]:
# 확인된 가중치 dataframe으로 만들어 확인
# 가중치 중 라벨인코딩, 수치데이터 중 높은 가중치를 확인하자. 
# 높은 양의 가중치-> OverallQual, Fireplaces, OverallCond
# 높은 음의 가중치-> KitchenAbvGr, BedroomAbvGr, MoSold
# 이중 양,음 가장 높은 2가지만 선택

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

coef_df = pd.DataFrame(linear_model.coef_, index=X_train.columns)
coef_df[0].sort_values()

In [None]:
# 특성확장
# 최고차항이 2차항이 되도록 만들기 위해서 특성을 제곱
X_train['FullBath x FullBath'] = X_train['FullBath'] * X_train['FullBath']
X_train['Heating_Gas x Heating_Gas'] = X_train['Heating_Gas'] * X_train['Heating_Gas']
X_train['encoded_Neighborhood x encoded_Neighborhood'] = X_train['encoded_Neighborhood'] * X_train['encoded_Neighborhood']
X_train['OverallQual x OverallQual'] = X_train['OverallQual'] * X_train['OverallQual']

In [None]:
# 특성확장된 컬럼을 바탕으로 다시 예측
extend_linear_model = LinearRegression()

In [None]:
extend_linear_model.fit(X_train,y_train)

In [None]:
result = cross_val_score(linear_model,X_train,y_train,cv=5)
result.mean()

In [None]:
#학습
extend_linear_model = LinearRegression()
extend_linear_model.fit(X_train,y_train)

In [None]:
# 특성확장
# 최고차항이 2차항이 되도록 만들기 위해서 특성을 제곱
X_test['FullBath x FullBath'] = X_test['FullBath'] * X_test['FullBath']
X_test['Heating_Gas x Heating_Gas'] = X_test['Heating_Gas'] * X_test['Heating_Gas']
X_test['encoded_Neighborhood x encoded_Neighborhood'] = X_test['encoded_Neighborhood'] * X_test['encoded_Neighborhood']
X_test['OverallQual x OverallQual'] = X_test['OverallQual'] * X_test['OverallQual']

In [None]:
final_pre = extend_linear_model.predict(X_test)

In [None]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = final_pre # 내 모델의 예측결과 양식에 넣기
gender_sub.to_csv('1_LinearExtendonly.csv', index=False) # csv파일로 뽑아내기

#### 앙상블 Voting

In [None]:
best_tree_model = DecisionTreeRegressor(max_depth=7) # 스케일링이 안들어간 데이터
best_knn_model = KNeighborsRegressor(n_neighbors=9) # 스케일링이 들어간 데이터
best_linear_model = LinearRegression() # 선형모델

In [None]:
best_tree_model.fit(X_train,y_train)
best_knn_model.fit(X_train_transformed,y_train)
best_linear_model.fit(X_train,y_train)

In [None]:
#tree모델 결과 예측, 출력
tree_pre = best_tree_model.predict(X_test)
tree_pre

In [None]:
# #KNN모델 결과 예측, 출력
# X_test_transformed = mm_scaler.transform(X_test) # test 스케일링
# knn_pre = best_knn_model.predict(X_test_transformed)
knn_pre

In [None]:
#linear모델 결과 예측, 출력
linear_pre = best_linear_model.predict(X_test)
linear_pre

In [None]:
# tree, knn, linear 합치기
# Numpy 연산은 같은 인덱스 요소 끼리 연산된다.
tree_knn_linear_mix = (tree_pre + knn_pre + linear_pre)/3

In [None]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = tree_knn_linear_mix
gender_sub.to_csv('1_Voting3.csv', index=False)

### 앙상블 Bagging

#### KNN Bagging

In [None]:
from sklearn.ensemble import BaggingRegressor #모델별 bagging지원 툴
from sklearn.model_selection import GridSearchCV #bagging 하이퍼파라미터 튜닝을 위한 툴

In [None]:
clf = BaggingRegressor(base_estimator=KNeighborsRegressor(n_neighbors=7),
                        n_estimators=50,
                        max_features=0.5,
                        random_state=0).fit(X_train, y_train)

In [None]:
params = {
    'n_estimators' : [15,50,80,100,200,400],
    'max_features' : [0.3,0.4,0.5,0.6,0.7],
    'n_jobs' : [-1] # cpu core를 그리드서치에 최대한 집중해달라
} 

In [None]:
# 교차검증 5회 실시
gird = GridSearchCV(clf,params,cv=5)

In [None]:
# 훈련실시 (컴퓨터 사양 낮으면 구글 colab이용)
gird.fit(X_train,y_train)

In [None]:
gird.best_params_ #최적의 조합

In [None]:
best_model = gird.best_estimator_

In [None]:
KNNbagging_pre = best_model.predict(X_test)

In [None]:
KNNbagging_pre

In [None]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = KNNbagging_pre
gender_sub.to_csv('1_KNN_Bagging.csv',index=False)

#### Tree Bagging

In [None]:
from sklearn.ensemble import RandomForestRegressor #RandomForest방식 이용
from sklearn.model_selection import GridSearchCV #RandomForest 하이퍼파라미터 튜닝을 위한 툴

In [None]:
#RandomForest형식()
rf_model = RandomForestRegressor(random_state=916, #random_state 난수고정 속성(값 일정하게 하기 위함)
                                 n_estimators=50,  # n_estimators 나무를 몇 개를 만들건지(많을수록 과소적합)
                                 max_depth=10, #max_depth 상승할수록 과대적합
                                 max_features=0.5) # 나무를 뽑을 때 마다 내가 가진 features 중에서 몇 %를 뽑을건가?(영상확인...)

In [None]:
params = {
    'n_estimators' : [15,50,80,100,200,400],
    'max_depth' : [3,5,7,9],
    'max_features' : [0.3,0.4,0.5,0.6,0.7],
    'n_jobs' : [-1] # cpu core를 그리드서치에 최대한 집중해달라
} 

In [None]:
# 교차검증 5회 실시
gird = GridSearchCV(rf_model,params,cv=5)

In [None]:
# 훈련실시 (컴퓨터 사양 낮으면 구글 colab이용)
gird.fit(X_train,y_train)

In [None]:
gird.best_params_ #최적의 조합

In [None]:
best_model = gird.best_estimator_ # grid.best_estimator_에 최적의 조합이 저장되어 잇음 이를 변수에 저장

In [None]:
rf_pre = best_model.predict(X_test)

In [None]:
rf_pre

In [None]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = rf_pre
gender_sub.to_csv('1_Tree_Bagging.csv',index=False)

#### Linear Bagging

In [None]:
from sklearn.ensemble import BaggingRegressor #모델별 bagging지원 툴
from sklearn.model_selection import GridSearchCV #bagging 하이퍼파라미터 튜닝을 위한 툴

In [None]:
clf = BaggingRegressor(base_estimator=LinearRegression(),
                        n_estimators=50,
                        max_features=0.5,
                        random_state=0).fit(X_train, y_train)

In [None]:
params = {
    'n_estimators' : [15,50,80,100,200,400],
    'max_features' : [0.3,0.4,0.5,0.6,0.7],
    'n_jobs' : [-1] # cpu core를 그리드서치에 최대한 집중해달라
} 

In [None]:
# 교차검증 5회 실시
gird = GridSearchCV(clf,params,cv=5)

In [None]:
# 훈련실시 (컴퓨터 사양 낮으면 구글 colab이용)
gird.fit(X_train,y_train)

In [None]:
gird.best_params_ #최적의 조합

In [None]:
best_model = gird.best_estimator_

In [None]:
Linearbagging_pre = best_model.predict(X_test)

In [None]:
Linearbagging_pre

In [None]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = Linearbagging_pre
gender_sub.to_csv('1_Linear_Bagging.csv',index=False)

#### Bagging Voting

In [None]:
tree_knn_linear_mix = (KNNbagging_pre+rf_pre+Linearbagging_pre)/3

### GradientBoostingRegressor

In [115]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer
from IPython.display import display
import matplotlib.pyplot as plt

import warnings
def ignore_warn(*args, **kwargs): pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

%matplotlib inline

pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

In [116]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [117]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [118]:
model_gb = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber')

In [119]:
score = rmsle_cv(model_gb)
gb_score = score.mean()
print("GradientBoostingRegressor score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

GradientBoostingRegressor score: 15869.3979 (2484.9477)


In [120]:
model_gb.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_depth=4,
                          max_features='sqrt', min_samples_leaf=15,
                          min_samples_split=10, n_estimators=3000)

In [121]:
pre = model_gb.predict(X_test)

In [122]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = pre
gender_sub.to_csv('kh.csv',index=False)

In [None]:
X_train = X_train.astype('int')
y_train = y_train.astype('int')
X_test = X_test.astype('int')

### Xgboost

In [123]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.0-py3-none-win_amd64.whl (86.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.0


In [124]:
import xgboost as xgb


In [125]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [126]:
score = rmsle_cv(model_xgb)
xgb_score = score.mean()
print("XGBRegressor score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

### lightgbm

In [129]:
import lightgbm as lgb

In [128]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.0.0-py2.py3-none-win_amd64.whl (737 kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.0.0


In [130]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [131]:

score = rmsle_cv(model_lgb)
lgb_score = score.mean()
print("LGBMRegressor score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

LGBMRegressor score: 15652.9327 (2387.1464)


### 3개 앙상블

In [132]:
model_gb.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_lgb.fit(X_train, y_train)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
              feature_fraction=0.2319, feature_fraction_seed=9,
              learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
              min_sum_hessian_in_leaf=11, n_estimators=720, num_leaves=5,
              objective='regression')

In [133]:
pred_gb = model_gb.predict(X_test)
pred_xgb = model_xgb.predict(X_test)
pred_lgb = model_lgb.predict(X_test)

In [134]:
total_weight = (1. / gb_score) + (1. / xgb_score) + (1. / lgb_score)
pred = (pred_gb * (1. / gb_score) + pred_xgb * (1. / xgb_score) +  pred_lgb * (1. / lgb_score)) / total_weight

In [135]:
gender_sub = pd.read_csv('data/sample_submission.csv')
gender_sub['SalePrice'] = pred
gender_sub.to_csv('sb_test.csv',index=False)