In [66]:
import pandas as pd

# 데이터 로드
test_df = pd.read_csv("test.csv", index_col='Id')  # 파일 경로가 올바른지 확인하세요

# 수치형 및 범주형 칼럼 구분
numerical_cols = test_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = test_df.select_dtypes(include=['object']).columns

# 수치형 및 범주형 칼럼 개수 계산
num_numerical = len(numerical_cols)
num_categorical = len(categorical_cols)

# 결과 출력
print(f"Number of Numerical Columns: {num_numerical}")
print(f"Number of Categorical Columns: {num_categorical}")


Number of Numerical Columns: 36
Number of Categorical Columns: 43


In [67]:
# 확인
test_df.columns
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 1461 to 2919
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1455 non-null   object 
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          107 non-null    object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearBuilt 

In [68]:
# 새로운 변수 추가
test_df['BuildingAge'] = test_df['YearRemodAdd'] - test_df['YearBuilt']  # 건축 연령 = 리모델링 연도 - 건축 연도
test_df['TotalBsmtArea'] = test_df['TotalBsmtSF']  # 총 지하실 면적 (TotalBsmtSF만 사용)
test_df['TotalBaths'] = test_df['FullBath'] + test_df['HalfBath'] * 0.5 + test_df['BsmtFullBath'] + test_df['BsmtHalfBath'] * 0.5

# 총 욕실 개수 = 전체 욕실 + 반 욕실 (0.5로 환산) + 지하실 전체 욕실 + 지하실 반 욕실 (0.5로 환산)
test_df['BuildingQuality'] = (test_df['OverallQual'] + test_df['OverallCond']) / 2  # 건물 품질 = (전반적인 품질 + 전반적인 상태) / 2

# 필요 없는 컬럼 드롭 (BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, 1stFlrSF, 2ndFlrSF)
test_df.drop(columns=['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF'], inplace=True, errors='ignore')

In [69]:
# MSSubClass는 단순 명목형 feature이므로 dtype을 object로 변경
test_df['MSSubClass'] = test_df['MSSubClass'].astype('object')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 1461 to 2919
Data columns (total 78 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MSSubClass       1459 non-null   object 
 1   MSZoning         1455 non-null   object 
 2   LotFrontage      1232 non-null   float64
 3   LotArea          1459 non-null   int64  
 4   Street           1459 non-null   object 
 5   Alley            107 non-null    object 
 6   LotShape         1459 non-null   object 
 7   LandContour      1459 non-null   object 
 8   Utilities        1457 non-null   object 
 9   LotConfig        1459 non-null   object 
 10  LandSlope        1459 non-null   object 
 11  Neighborhood     1459 non-null   object 
 12  Condition1       1459 non-null   object 
 13  Condition2       1459 non-null   object 
 14  BldgType         1459 non-null   object 
 15  HouseStyle       1459 non-null   object 
 16  OverallQual      1459 non-null   int64  
 17  OverallCond     

In [70]:
# test셋에서 결측치 비율 높은 컬럼 4건 삭제
test_df = test_df.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1)

In [71]:
# 확인
test_df.columns
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 1461 to 2919
Data columns (total 74 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MSSubClass       1459 non-null   object 
 1   MSZoning         1455 non-null   object 
 2   LotFrontage      1232 non-null   float64
 3   LotArea          1459 non-null   int64  
 4   Street           1459 non-null   object 
 5   LotShape         1459 non-null   object 
 6   LandContour      1459 non-null   object 
 7   Utilities        1457 non-null   object 
 8   LotConfig        1459 non-null   object 
 9   LandSlope        1459 non-null   object 
 10  Neighborhood     1459 non-null   object 
 11  Condition1       1459 non-null   object 
 12  Condition2       1459 non-null   object 
 13  BldgType         1459 non-null   object 
 14  HouseStyle       1459 non-null   object 
 15  OverallQual      1459 non-null   int64  
 16  OverallCond      1459 non-null   int64  
 17  YearBuilt       

In [72]:
numeric_feature = test_df.select_dtypes(['int64', 'float64'])
categorical_feature = test_df.select_dtypes('object')

numeric_feature

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,BuildingAge,TotalBsmtArea,TotalBaths,BuildingQuality
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
1461,80.0,11622,5,6,1961,1961,0.0,882.0,0,896,0.0,0.0,1,0,2,1,5,0,1961.0,1.0,730.0,140,0,0,0,120,0,0,6,2010,0,882.0,1.0,5.5
1462,81.0,14267,6,6,1958,1958,108.0,1329.0,0,1329,0.0,0.0,1,1,3,1,6,0,1958.0,1.0,312.0,393,36,0,0,0,0,12500,6,2010,0,1329.0,1.5,6.0
1463,74.0,13830,5,5,1997,1998,0.0,928.0,0,1629,0.0,0.0,2,1,3,1,6,1,1997.0,2.0,482.0,212,34,0,0,0,0,0,3,2010,1,928.0,2.5,5.0
1464,78.0,9978,6,6,1998,1998,20.0,926.0,0,1604,0.0,0.0,2,1,3,1,7,1,1998.0,2.0,470.0,360,36,0,0,0,0,0,6,2010,0,926.0,2.5,6.0
1465,43.0,5005,8,5,1992,1992,0.0,1280.0,0,1280,0.0,0.0,2,0,2,1,5,0,1992.0,2.0,506.0,0,82,0,0,144,0,0,1,2010,0,1280.0,2.0,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,21.0,1936,4,7,1970,1970,0.0,546.0,0,1092,0.0,0.0,1,1,3,1,5,0,,0.0,0.0,0,0,0,0,0,0,0,6,2006,0,546.0,1.5,5.5
2916,21.0,1894,4,5,1970,1970,0.0,546.0,0,1092,0.0,0.0,1,1,3,1,6,0,1970.0,1.0,286.0,0,24,0,0,0,0,0,4,2006,0,546.0,1.5,4.5
2917,160.0,20000,5,7,1960,1996,0.0,1224.0,0,1224,1.0,0.0,1,0,4,1,7,1,1960.0,2.0,576.0,474,0,0,0,0,0,0,9,2006,36,1224.0,2.0,6.0
2918,62.0,10441,5,5,1992,1992,0.0,912.0,0,970,0.0,1.0,1,0,3,1,6,0,,0.0,0.0,80,32,0,0,0,0,700,7,2006,0,912.0,1.5,5.0


In [73]:
# 수치형 및 범주형 칼럼 구분 갱신
numerical_cols = test_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = test_df.select_dtypes(include=['object']).columns

# 수치형 및 범주형 칼럼 개수 갱신
num_numerical = len(numerical_cols)
num_categorical = len(categorical_cols)

pd.set_option('display.max_columns', None)
test_df[categorical_cols].describe(include='all')

Unnamed: 0,MSSubClass,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
count,1459,1455,1459,1459,1459,1457,1459,1459,1459,1459,1459,1459,1459,1459,1459,1458,1458,565,1459,1459,1459,1415,1414,1415,1417,1417,1459,1459,1459,1459,1458,1457,729,1383,1381,1381,1381,1459,1458,1459
unique,16,5,2,4,4,1,5,3,25,9,5,5,7,6,4,13,15,3,4,5,6,4,4,4,6,6,4,5,2,4,4,7,5,6,3,4,5,3,9,6
top,20,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,TA,TA,PConc,TA,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Gd,Attchd,Unf,TA,TA,Y,WD,Normal
freq,543,1114,1453,934,1311,1457,1081,1396,218,1251,1444,1205,745,1169,1442,510,510,434,892,1256,661,634,1295,951,431,1237,1446,752,1358,1337,757,1357,364,853,625,1293,1328,1301,1258,1204


In [74]:
# 범주형 값을 최빈값으로 처리하기.
for col in categorical_feature.columns:
    mode_value = test_df[col].mode(dropna=True)[0]  # 최빈값 계산
    test_df[col].fillna(mode_value, inplace=True)   # 결측치 대체

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(mode_value, inplace=True)   # 결측치 대체


In [75]:
# 검증
test_df[categorical_feature.columns].isnull().sum()

MSSubClass       0
MSZoning         0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
SaleType         0
SaleCondition    0
dtype: int64

In [76]:
# 수치형 값을 평균값으로 처리하기.
for col in numeric_feature.columns:
    mean_value = test_df[col].mean()
    test_df[col].fillna(mean_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(mean_value, inplace=True)


In [77]:
test_df[numeric_feature.columns].isnull().sum()

LotFrontage        0
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         0
TotalBsmtSF        0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt        0
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
BuildingAge        0
TotalBsmtArea      0
TotalBaths         0
BuildingQuality    0
dtype: int64

In [78]:
# 서수형 피처를 라벨링하기 전 unique 값 확인

ordinal_feature = ['ExterQual','ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond', 'FireplaceQu', 'OverallQual','OverallCond' ]

for i in ordinal_feature:
    print(f"{i}: ", test_df[i].unique())

ExterQual:  ['TA' 'Gd' 'Ex' 'Fa']
ExterCond:  ['TA' 'Gd' 'Fa' 'Po' 'Ex']
BsmtQual:  ['TA' 'Gd' 'Ex' 'Fa']
BsmtCond:  ['TA' 'Po' 'Fa' 'Gd']
HeatingQC:  ['TA' 'Gd' 'Ex' 'Fa' 'Po']
KitchenQual:  ['TA' 'Gd' 'Ex' 'Fa']
GarageQual:  ['TA' 'Fa' 'Gd' 'Po']
GarageCond:  ['TA' 'Fa' 'Gd' 'Po' 'Ex']
FireplaceQu:  ['Gd' 'TA' 'Po' 'Fa' 'Ex']
OverallQual:  [ 5  6  8  7  4  9  2  3 10  1]
OverallCond:  [6 5 7 8 2 9 3 4 1]


In [79]:
# 서수형 피처를 라벨링 하기.
# 서수형 데이터에 대한 상관관계 확인을 위해 labelEncoder를 활용.

# 5점 척도형 변수만 라벨링 진행
ordinal_feature_map = ['ExterQual','ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond', 'FireplaceQu']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# 사용자 지정 매핑 사전
mapping = {
    'Ex': 5,  # Excellent
    'Gd': 4,  # Good
    'TA': 3,  # Typical/Average
    'Fa': 2,  # Fair
    'Po': 1   # Poor
}

for col in ordinal_feature_map:
    test_df[col] = test_df[col].map(mapping)

In [80]:
test_df[ordinal_feature].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 1461 to 2919
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   ExterQual    1459 non-null   int64
 1   ExterCond    1459 non-null   int64
 2   BsmtQual     1459 non-null   int64
 3   BsmtCond     1459 non-null   int64
 4   HeatingQC    1459 non-null   int64
 5   KitchenQual  1459 non-null   int64
 6   GarageQual   1459 non-null   int64
 7   GarageCond   1459 non-null   int64
 8   FireplaceQu  1459 non-null   int64
 9   OverallQual  1459 non-null   int64
 10  OverallCond  1459 non-null   int64
dtypes: int64(11)
memory usage: 136.8 KB


In [81]:
# 명목형 변수 확인
nominal_cols = list(x for x in categorical_cols if x not in ordinal_feature)
test_df[nominal_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 1461 to 2919
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSSubClass     1459 non-null   object
 1   MSZoning       1459 non-null   object
 2   Street         1459 non-null   object
 3   LotShape       1459 non-null   object
 4   LandContour    1459 non-null   object
 5   Utilities      1459 non-null   object
 6   LotConfig      1459 non-null   object
 7   LandSlope      1459 non-null   object
 8   Neighborhood   1459 non-null   object
 9   Condition1     1459 non-null   object
 10  Condition2     1459 non-null   object
 11  BldgType       1459 non-null   object
 12  HouseStyle     1459 non-null   object
 13  RoofStyle      1459 non-null   object
 14  RoofMatl       1459 non-null   object
 15  Exterior1st    1459 non-null   object
 16  Exterior2nd    1459 non-null   object
 17  MasVnrType     1459 non-null   object
 18  Foundation     1459 non-null  

In [82]:
# 명목형 변수 원핫인코딩 진행
test_df = pd.get_dummies(test_df, drop_first=True)
test_df.head()

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,TotalBsmtSF,HeatingQC,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,BuildingAge,TotalBsmtArea,TotalBaths,BuildingQuality,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasW,Heating_Grav,Heating_Wall,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1
1461,80.0,11622,5,6,1961,1961,0.0,3,3,3,3,882.0,3,0,896,0.0,0.0,1,0,2,1,3,5,0,4,1961.0,1.0,730.0,3,3,140,0,0,0,120,0,0,6,2010,0,882.0,1.0,5.5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
1462,81.0,14267,6,6,1958,1958,108.0,3,3,3,3,1329.0,3,0,1329,0.0,0.0,1,1,3,1,4,6,0,4,1958.0,1.0,312.0,3,3,393,36,0,0,0,0,12500,6,2010,0,1329.0,1.5,6.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
1463,74.0,13830,5,5,1997,1998,0.0,3,3,4,3,928.0,4,0,1629,0.0,0.0,2,1,3,1,3,6,1,3,1997.0,2.0,482.0,3,3,212,34,0,0,0,0,0,3,2010,1,928.0,2.5,5.0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
1464,78.0,9978,6,6,1998,1998,20.0,3,3,3,3,926.0,5,0,1604,0.0,0.0,2,1,3,1,4,7,1,4,1998.0,2.0,470.0,3,3,360,36,0,0,0,0,0,6,2010,0,926.0,2.5,6.0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
1465,43.0,5005,8,5,1992,1992,0.0,4,3,4,3,1280.0,5,0,1280,0.0,0.0,2,0,2,1,4,5,0,4,1992.0,2.0,506.0,3,3,0,82,0,0,144,0,0,1,2010,0,1280.0,2.0,6.5,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False


In [83]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 1461 to 2919
Columns: 210 entries, LotFrontage to SaleCondition_Partial
dtypes: bool(167), float64(11), int64(32)
memory usage: 739.5 KB


In [84]:
# Yeo-Johnson 변환 전 수치형 피쳐의 정규성 확인
from scipy.stats import shapiro

# 수치형 변수 갱신(라벨 인코딩한 피쳐 포함)
numerical_cols = test_df.select_dtypes(include=['int64', 'float64']).columns

not_rejected_cols = []

for col in numerical_cols:
    stat, p = shapiro(test_df[numerical_cols])
    alpha = 0.05
    print(col , 'shapiro test result:')
    if p > alpha:
        print('Fail to reject H0')
        not_rejected_cols.append(col)
    else:
        print('reject H0') # 귀무가설 기각 - 정규성을 따르지 않음
    print('------------------')
    
print("정규성을 따르는 칼럼:", not_rejected_cols)

  res = hypotest_fun_out(*samples, **kwds)


LotFrontage shapiro test result:
reject H0
------------------
LotArea shapiro test result:
reject H0
------------------
OverallQual shapiro test result:
reject H0
------------------
OverallCond shapiro test result:
reject H0
------------------
YearBuilt shapiro test result:
reject H0
------------------
YearRemodAdd shapiro test result:
reject H0
------------------
MasVnrArea shapiro test result:
reject H0
------------------
ExterQual shapiro test result:
reject H0
------------------
ExterCond shapiro test result:
reject H0
------------------
BsmtQual shapiro test result:
reject H0
------------------
BsmtCond shapiro test result:
reject H0
------------------
TotalBsmtSF shapiro test result:
reject H0
------------------
HeatingQC shapiro test result:
reject H0
------------------
LowQualFinSF shapiro test result:
reject H0
------------------
GrLivArea shapiro test result:
reject H0
------------------
BsmtFullBath shapiro test result:
reject H0
------------------
BsmtHalfBath shapiro test 

수치형 피쳐 중 정규성을 따르는 피쳐는 없으므로 모든 수치형 피쳐에 대해 Yeo-Johnson 변환을 적용

In [86]:
# Yeo-Johnson 변환으로 수치형 피쳐 스케일링
from scipy import stats

lambdas = {}

for col in numerical_cols:
    trans_num, lambda_ = stats.yeojohnson(test_df[col], lmbda=None)  # yeojohnson 변환
    test_df[col] = trans_num  # 변환된 값을 저장
    lambdas[col] = lambda_  # 각 열에 대해 람다 값을 저장

print(lambdas) # 람다 값 출력

{'LotFrontage': 0.7014888560430448, 'LotArea': 0.3387880284826124, 'OverallQual': 0.7666591725204914, 'OverallCond': 0.726888268616322, 'YearBuilt': 22.06532094043285, 'YearRemodAdd': 31.775206020135464, 'MasVnrArea': -0.2578496345224758, 'ExterQual': -1.1286059029229418, 'ExterCond': -0.021593019113439173, 'BsmtQual': 0.2848820674129174, 'BsmtCond': 1.7394455378039066, 'TotalBsmtSF': 0.7661355392721746, 'HeatingQC': 2.861147707742023, 'LowQualFinSF': -18.39206135667097, 'GrLivArea': -0.028704805140338033, 'BsmtFullBath': -1.4902465943742669, 'BsmtHalfBath': -22.188285308232118, 'FullBath': 0.21442365878633934, 'HalfBath': -2.3470090783350415, 'BedroomAbvGr': 0.6592746062354237, 'KitchenAbvGr': -1.275645918978354, 'KitchenQual': -0.34462967712231807, 'TotRmsAbvGrd': -0.20506386507016153, 'Fireplaces': -0.6409452800169984, 'FireplaceQu': 4.79822300077415, 'GarageYrBlt': 2.7778960066433567, 'GarageCars': 1.1080735123216998, 'GarageArea': 0.7904729731333346, 'GarageQual': 4.40073265417847

In [87]:
# StandardScaler로 정규화를 추가로 진행
from sklearn.preprocessing import StandardScaler

# StandardScaler 생성 및 데이터 변환
scaler = StandardScaler()
test_df[numerical_cols] = scaler.fit_transform(test_df[numerical_cols])

In [88]:
test_df.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,TotalBsmtSF,HeatingQC,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,BuildingAge,TotalBsmtArea,TotalBaths,BuildingQuality
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,6.623291e-16,1.631472e-16,-7.012896e-16,-8.023435e-16,-2.970741e-16,5.844080000000001e-17,-1.9480270000000002e-17,8.133011e-16,2.371722e-15,7.670355e-16,1.78975e-16,1.241867e-16,2.605486e-16,3.65255e-18,-7.962559e-16,2.9220400000000005e-17,-3.65255e-17,-2.313282e-16,8.035610000000001e-17,-2.593311e-16,-3.491838e-15,4.047025e-15,-1.924894e-15,1.4610200000000003e-17,1.667998e-16,-9.167901e-16,8.035610000000001e-17,1.290568e-16,-5.284022e-16,2.203705e-16,-9.740133e-18,-5.722328000000001e-17,-4.3830600000000005e-17,-3.65255e-18,1.217517e-17,-7.3051e-18,2.4350330000000003e-17,-3.116843e-16,-9.511849e-18,7.792107000000001e-17,1.241867e-16,-9.253127e-17,1.631472e-16
std,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,4.246212e-14,1.000343,1.000343,1.000343,1.000343
min,-2.620788,-2.861693,-3.953397,-4.65608,-2.178443,-1.448264,-0.8089771,-3.585553,-7.626704,-2.464238,-5.536414,-2.932316,-2.077014,-0.09843059,-4.00343,-0.8455651,-0.2637312,-3.858388,-0.7639761,-4.107155,-14.60904,-2.92534,-3.154854,-0.9788708,-2.355991,-3.115031,-2.205685,-2.559142,-4.292881,-5.814104,-0.9475916,-1.089204,-0.4558237,-0.09481729,-0.3257929,-0.06426031,-0.1903197,-2.065453,-5.778711e-14,-3.935798,-2.932316,-1.684324,-3.421759
25%,-0.3791806,-0.428774,-0.7391087,-0.4798607,-0.7383641,-1.030609,-0.8089771,-0.639366,-0.1829393,-0.7620247,-0.009199809,-0.5382352,-1.230631,-0.09843059,-0.741336,-0.8455651,-0.2637312,-1.042865,-0.7639761,-1.033876,-0.1603035,-0.7484603,-0.9430591,-0.9788708,0.4243615,-0.677123,-0.993626,-0.6492785,0.1264799,0.09441298,-0.9475916,-1.089204,-0.4558237,-0.09481729,-0.3257929,-0.06426031,-0.1903197,-0.7423601,-2.509104e-14,-0.8107424,-0.5382352,-0.8685507,-0.4466168
50%,0.04508873,0.06105563,-0.03078146,-0.4798607,-0.1055888,0.2717117,-0.8089771,-0.639366,-0.1829393,-0.7620247,-0.009199809,-0.07188439,0.9189997,-0.09843059,0.0458101,-0.8455651,-0.2637312,0.8088622,-0.7639761,0.2115875,-0.1603035,-0.7484603,-0.1492598,-0.9788708,0.4243615,-0.01155579,0.2887587,0.09095136,0.1264799,0.09441298,-0.9475916,0.4440849,-0.4558237,-0.09481729,-0.3257929,-0.06426031,-0.1903197,0.01553778,7.549517e-15,-0.8107424,-0.07188439,-0.1772195,0.1473622
75%,0.4932359,0.507623,0.6542358,0.4204051,1.0405,1.00676,1.212692,1.070583,-0.1829393,0.6578094,-0.009199809,0.6085433,0.9189997,-0.09843059,0.6258309,1.159074,-0.2637312,0.8088622,1.300356,0.2115875,-0.1603035,0.797532,0.5183672,0.8610997,0.4243615,0.9046408,0.2887587,0.504071,0.1264799,0.09441298,1.046486,0.8917707,-0.4558237,-0.09481729,-0.3257929,-0.06426031,-0.1903197,0.717506,4.013456e-14,1.270152,0.6085433,0.9609039,0.7931878
max,5.359559,5.305756,2.600933,2.93412,1.486856,1.428336,1.520164,2.181001,4.120015,1.886803,3.732695,7.149213,0.9189997,10.15944,3.989573,1.872629,3.807731,3.383052,1.610378,3.428332,3.945638,1.975361,3.704888,2.322186,4.361022,9.865937,4.383338,3.944268,7.872543,17.85563,1.655287,2.072365,2.206053,10.5466,3.069554,15.56171,5.254317,2.009473,7.26641e-14,1.650549,7.149213,3.958219,3.906808


In [89]:
'''
# 아래는 예측값(pred) 역변환에 필요한 람다값 및 코드

# Yeo-Johnson 변환에 활용된 람다값
lambda_pred = -0.044048042422657335

# 정규화를 역변환
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
pred = scaler.inverse_transform(pred)

# Yeo-Johnson 변환을 역변환
# 역변환 함수 정의
def inverse_yeojohnson(y, lambda_):
    if lambda_ == 0:
        return np.exp(y) - 1
    elif lambda_ == 2:
        return -np.exp(-y) + 1
    elif lambda_ > 0:
        return (y * lambda_ + 1) ** (1 / lambda_) - 1
    else:
        return -(-y * (2 - lambda_) + 1) ** (1 / (2 - lambda_))

# 예측값(pred)에 대해 역변환 수행
pred = inverse_yeojohnson(pred, lambda_pred) # 변수명 주의

'''

'\n# 아래는 예측값(pred) 역변환에 필요한 람다값 및 코드\n\n# Yeo-Johnson 변환에 활용된 람다값\nlambda_pred = -0.044048042422657335\n\n# 정규화를 역변환\nfrom sklearn.preprocessing import StandardScaler\n\nscaler = StandardScaler()\npred = scaler.inverse_transform(pred)\n\n# Yeo-Johnson 변환을 역변환\n# 역변환 함수 정의\ndef inverse_yeojohnson(y, lambda_):\n    if lambda_ == 0:\n        return np.exp(y) - 1\n    elif lambda_ == 2:\n        return -np.exp(-y) + 1\n    elif lambda_ > 0:\n        return (y * lambda_ + 1) ** (1 / lambda_) - 1\n    else:\n        return -(-y * (2 - lambda_) + 1) ** (1 / (2 - lambda_))\n\n# 예측값(pred)에 대해 역변환 수행\npred = inverse_yeojohnson(pred, lambda_pred) # 변수명 주의\n\n'

In [90]:
# 전처리 결과 csv로 저장
test_df.to_csv('preprocessing_DataFrame_test_fin.csv', index=True)