In [62]:
import pandas as pd

# 데이터 로드
train_df = pd.read_csv("train.csv", index_col='Id')  # 파일 경로가 올바른지 확인하세요

# 수치형 및 범주형 칼럼 구분
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_df.select_dtypes(include=['object']).columns

# 수치형 및 범주형 칼럼 개수 계산
num_numerical = len(numerical_cols)
num_categorical = len(categorical_cols)

# 결과 출력
print(f"Number of Numerical Columns: {num_numerical}")
print(f"Number of Categorical Columns: {num_categorical}")


Number of Numerical Columns: 37
Number of Categorical Columns: 43


In [63]:
# 확인
train_df.columns
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuilt    

In [64]:
# 새로운 변수 추가
train_df['BuildingAge'] = train_df['YearRemodAdd'] - train_df['YearBuilt']  # 건축 연령 = 리모델링 연도 - 건축 연도
train_df['TotalBsmtArea'] = train_df['TotalBsmtSF']  # 총 지하실 면적 (TotalBsmtSF만 사용)
train_df['TotalBaths'] = train_df['FullBath'] + train_df['HalfBath'] * 0.5 + train_df['BsmtFullBath'] + train_df['BsmtHalfBath'] * 0.5

# 총 욕실 개수 = 전체 욕실 + 반 욕실 (0.5로 환산) + 지하실 전체 욕실 + 지하실 반 욕실 (0.5로 환산)
train_df['BuildingQuality'] = (train_df['OverallQual'] + train_df['OverallCond']) / 2  # 건물 품질 = (전반적인 품질 + 전반적인 상태) / 2

# 필요 없는 컬럼 드롭 (BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, 1stFlrSF, 2ndFlrSF)
train_df.drop(columns=['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF'], inplace=True, errors='ignore')

In [65]:
# MSSubClass는 단순 명목형 feature이므로 dtype을 object로 변경
train_df['MSSubClass'] = train_df['MSSubClass'].astype('object')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MSSubClass       1460 non-null   object 
 1   MSZoning         1460 non-null   object 
 2   LotFrontage      1201 non-null   float64
 3   LotArea          1460 non-null   int64  
 4   Street           1460 non-null   object 
 5   Alley            91 non-null     object 
 6   LotShape         1460 non-null   object 
 7   LandContour      1460 non-null   object 
 8   Utilities        1460 non-null   object 
 9   LotConfig        1460 non-null   object 
 10  LandSlope        1460 non-null   object 
 11  Neighborhood     1460 non-null   object 
 12  Condition1       1460 non-null   object 
 13  Condition2       1460 non-null   object 
 14  BldgType         1460 non-null   object 
 15  HouseStyle       1460 non-null   object 
 16  OverallQual      1460 non-null   int64  
 17  OverallCond      14

In [66]:
# 결측치 찾고 결측치 소팅
missing = train_df.isnull().sum() / len(train_df) * 100
missing = missing[missing>0].sort_values(ascending=False)

# 결측치 비율 80% 넘는 칼럼 삭제위해 missing_drop 만듦
missing_drop = missing[missing>80]
missing_drop

PoolQC         99.520548
MiscFeature    96.301370
Alley          93.767123
Fence          80.753425
dtype: float64

In [67]:
# missing_drop 삭제
train_df = train_df.drop(missing_drop.index, axis=1)

In [68]:
# 확인
train_df.columns
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 75 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MSSubClass       1460 non-null   object 
 1   MSZoning         1460 non-null   object 
 2   LotFrontage      1201 non-null   float64
 3   LotArea          1460 non-null   int64  
 4   Street           1460 non-null   object 
 5   LotShape         1460 non-null   object 
 6   LandContour      1460 non-null   object 
 7   Utilities        1460 non-null   object 
 8   LotConfig        1460 non-null   object 
 9   LandSlope        1460 non-null   object 
 10  Neighborhood     1460 non-null   object 
 11  Condition1       1460 non-null   object 
 12  Condition2       1460 non-null   object 
 13  BldgType         1460 non-null   object 
 14  HouseStyle       1460 non-null   object 
 15  OverallQual      1460 non-null   int64  
 16  OverallCond      1460 non-null   int64  
 17  YearBuilt        14

In [69]:
# 나머지 결측치 자료는 최빈값 대체하기.
# 최빈값 대체 이전에 missing 데이터프레임에도 drop 함께 해주기

missing = missing.drop(missing_drop.index, errors='ignore')
missing

# ex = train_df[missing]

ex = missing.index
numeric_feature = train_df[ex].select_dtypes(['int64', 'float64'])
categorical_feature = train_df[ex].select_dtypes('object')

numeric_feature

Unnamed: 0_level_0,LotFrontage,GarageYrBlt,MasVnrArea
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,65.0,2003.0,196.0
2,80.0,1976.0,0.0
3,68.0,2001.0,162.0
4,60.0,1998.0,0.0
5,84.0,2000.0,350.0
...,...,...,...
1456,62.0,1999.0,0.0
1457,85.0,1978.0,119.0
1458,66.0,1941.0,0.0
1459,68.0,1950.0,0.0


In [70]:
# Electrical BsmtFinType1 GarageCond GarageQual GarageYrBlt가 없습니다.

In [71]:
# 수치형 및 범주형 칼럼 구분 갱신
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_df.select_dtypes(include=['object']).columns

# 수치형 및 범주형 칼럼 개수 갱신
num_numerical = len(numerical_cols)
num_categorical = len(categorical_cols)

pd.set_option('display.max_columns', None)
train_df[categorical_cols].describe(include='all')

Unnamed: 0,MSSubClass,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
count,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,588,1460,1460,1460,1423,1423,1422,1423,1422,1460,1460,1460,1459,1460,1460,770,1379,1379,1379,1379,1460,1460,1460
unique,15,5,2,4,4,2,5,3,25,9,8,5,8,6,8,15,16,3,4,5,6,4,4,4,6,6,6,5,2,5,4,7,5,6,3,5,5,3,9,6
top,20,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,TA,TA,PConc,TA,TA,No,Unf,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Gd,Attchd,Unf,TA,TA,Y,WD,Normal
freq,536,1151,1454,925,1311,1459,1052,1382,225,1260,1445,1220,726,1141,1434,515,504,445,906,1282,647,649,1311,953,430,1256,1428,741,1365,1334,735,1360,380,870,605,1311,1326,1340,1267,1198


In [72]:
# 범주형 값을 최빈값으로 처리하기.
for col in categorical_feature.columns:
    mode_value = train_df[col].mode(dropna=True)[0]  # 최빈값 계산
    train_df[col].fillna(mode_value, inplace=True)   # 결측치 대체

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(mode_value, inplace=True)   # 결측치 대체


In [73]:
# 검증
train_df[categorical_feature.columns].isnull().sum()

MasVnrType      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
BsmtFinType2    0
BsmtExposure    0
BsmtFinType1    0
BsmtCond        0
BsmtQual        0
Electrical      0
dtype: int64

In [74]:
# 수치형 값을 평균값으로 처리하기.
for col in numeric_feature.columns:
    mean_value = train_df[col].mean()
    train_df[col].fillna(mean_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(mean_value, inplace=True)


In [75]:
train_df[numeric_feature.columns].isnull().sum()

LotFrontage    0
GarageYrBlt    0
MasVnrArea     0
dtype: int64

In [76]:
# GrLivArea, LotArea, GarageArea 이상치 있는 레코드 삭제

train_df = train_df = train_df[(train_df.index != 692) & (train_df.index != 1183)]
train_df = train_df.drop(train_df[(train_df['GrLivArea']>4000) & (train_df['SalePrice']<12.5)].index)
train_df = train_df.drop(train_df[(train_df['LotArea']>150000)].index)
train_df = train_df.drop(train_df[(train_df['GarageArea']>1200) & (train_df['SalePrice']<12.5)].index)

In [77]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1455 entries, 1 to 1460
Data columns (total 75 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MSSubClass       1455 non-null   object 
 1   MSZoning         1455 non-null   object 
 2   LotFrontage      1455 non-null   float64
 3   LotArea          1455 non-null   int64  
 4   Street           1455 non-null   object 
 5   LotShape         1455 non-null   object 
 6   LandContour      1455 non-null   object 
 7   Utilities        1455 non-null   object 
 8   LotConfig        1455 non-null   object 
 9   LandSlope        1455 non-null   object 
 10  Neighborhood     1455 non-null   object 
 11  Condition1       1455 non-null   object 
 12  Condition2       1455 non-null   object 
 13  BldgType         1455 non-null   object 
 14  HouseStyle       1455 non-null   object 
 15  OverallQual      1455 non-null   int64  
 16  OverallCond      1455 non-null   int64  
 17  YearBuilt        14

In [78]:
# LotFrontage, LotArea 이상치 중 test 데이터 상의 최대값을 넘어가는 극단치들만 평균치로 대체
test_df = pd.read_csv('test.csv')

train_df['LotFrontage'].where(
    train_df['LotFrontage'] < test_df['LotFrontage'].agg('max'), 
    other=test_df['LotFrontage'].agg('mean'), 
    inplace=True)

train_df['LotArea'].where(
    train_df['LotArea'] < test_df['LotArea'].agg('max'), 
    other=test_df['LotArea'].agg('mean'), 
    inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['LotFrontage'].where(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['LotArea'].where(
  train_df['LotArea'].where(


In [79]:
# 서수형 피처를 라벨링하기 전 unique 값 확인

ordinal_feature = ['ExterQual','ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond', 'FireplaceQu', 'OverallQual','OverallCond' ]

for i in ordinal_feature:
    print(f"{i}: ", train_df[i].unique())

ExterQual:  ['Gd' 'TA' 'Ex' 'Fa']
ExterCond:  ['TA' 'Gd' 'Fa' 'Po' 'Ex']
BsmtQual:  ['Gd' 'TA' 'Ex' 'Fa']
BsmtCond:  ['TA' 'Gd' 'Fa' 'Po']
HeatingQC:  ['Ex' 'Gd' 'TA' 'Fa' 'Po']
KitchenQual:  ['Gd' 'TA' 'Ex' 'Fa']
GarageQual:  ['TA' 'Fa' 'Gd' 'Ex' 'Po']
GarageCond:  ['TA' 'Fa' 'Gd' 'Po' 'Ex']
FireplaceQu:  ['Gd' 'TA' 'Fa' 'Ex' 'Po']
OverallQual:  [ 7  6  8  5  9  4 10  3  1  2]
OverallCond:  [5 8 6 7 4 2 3 9 1]


In [80]:
# 서수형 피처를 라벨링 하기.
# 서수형 데이터에 대한 상관관계 확인을 위해 labelEncoder를 활용.

# 5점 척도형 변수만 라벨링 진행
ordinal_feature_map = ['ExterQual','ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond', 'FireplaceQu']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# 사용자 지정 매핑 사전
mapping = {
    'Ex': 5,  # Excellent
    'Gd': 4,  # Good
    'TA': 3,  # Typical/Average
    'Fa': 2,  # Fair
    'Po': 1   # Poor
}

for col in ordinal_feature_map:
    train_df[col] = train_df[col].map(mapping)

In [81]:
train_df[ordinal_feature].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1455 entries, 1 to 1460
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   ExterQual    1455 non-null   int64
 1   ExterCond    1455 non-null   int64
 2   BsmtQual     1455 non-null   int64
 3   BsmtCond     1455 non-null   int64
 4   HeatingQC    1455 non-null   int64
 5   KitchenQual  1455 non-null   int64
 6   GarageQual   1455 non-null   int64
 7   GarageCond   1455 non-null   int64
 8   FireplaceQu  1455 non-null   int64
 9   OverallQual  1455 non-null   int64
 10  OverallCond  1455 non-null   int64
dtypes: int64(11)
memory usage: 136.4 KB


In [82]:
# 명목형 변수 확인
nominal_cols = list(x for x in categorical_cols if x not in ordinal_feature)
train_df[nominal_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1455 entries, 1 to 1460
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSSubClass     1455 non-null   object
 1   MSZoning       1455 non-null   object
 2   Street         1455 non-null   object
 3   LotShape       1455 non-null   object
 4   LandContour    1455 non-null   object
 5   Utilities      1455 non-null   object
 6   LotConfig      1455 non-null   object
 7   LandSlope      1455 non-null   object
 8   Neighborhood   1455 non-null   object
 9   Condition1     1455 non-null   object
 10  Condition2     1455 non-null   object
 11  BldgType       1455 non-null   object
 12  HouseStyle     1455 non-null   object
 13  RoofStyle      1455 non-null   object
 14  RoofMatl       1455 non-null   object
 15  Exterior1st    1455 non-null   object
 16  Exterior2nd    1455 non-null   object
 17  MasVnrType     1455 non-null   object
 18  Foundation     1455 non-null   ob

In [83]:
# 명목형 변수 원핫인코딩 진행
train_df = pd.get_dummies(train_df, drop_first=True)
train_df.head()

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,TotalBsmtSF,HeatingQC,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,BuildingAge,TotalBsmtArea,TotalBaths,BuildingQuality,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1
1,65.0,8450.0,7,5,2003,2003,196.0,4,3,4,3,856,5,0,1710,1,0,2,1,3,1,4,8,0,4,2003.0,2,548,3,3,0,61,0,0,0,0,0,2,2008,208500,0,856,3.5,6.0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
2,80.0,9600.0,6,8,1976,1976,0.0,3,3,4,3,1262,5,0,1262,0,1,2,0,3,1,3,6,1,3,1976.0,2,460,3,3,298,0,0,0,0,0,0,5,2007,181500,0,1262,2.5,7.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
3,68.0,11250.0,7,5,2001,2002,162.0,4,3,4,3,920,5,0,1786,1,0,2,1,3,1,4,6,1,3,2001.0,2,608,3,3,0,42,0,0,0,0,0,9,2008,223500,1,920,3.5,6.0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
4,60.0,9550.0,7,5,1915,1970,0.0,3,3,3,4,756,4,0,1717,1,0,1,0,3,1,4,7,1,4,1998.0,3,642,3,3,0,35,272,0,0,0,0,2,2006,140000,55,756,2.0,6.0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
5,84.0,14260.0,8,5,2000,2000,350.0,4,3,4,3,1145,5,0,2198,1,0,2,1,4,1,4,9,1,3,2000.0,3,836,3,3,192,84,0,0,0,0,0,12,2008,250000,0,1145,3.5,6.5,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False


In [84]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1455 entries, 1 to 1460
Columns: 225 entries, LotFrontage to SaleCondition_Partial
dtypes: bool(181), float64(6), int64(38)
memory usage: 768.7 KB


In [85]:
# Yeo-Johnson 변환 전 수치형 피쳐의 정규성 확인
from scipy.stats import shapiro

# 수치형 변수 갱신(라벨 인코딩한 피쳐 포함)
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns

not_rejected_cols = []

for col in numerical_cols:
    stat, p = shapiro(train_df[numerical_cols])
    alpha = 0.05
    print(col , 'shapiro test result:')
    if p > alpha:
        print('Fail to reject H0')
        not_rejected_cols.append(col)
    else:
        print('reject H0') # 귀무가설 기각 - 정규성을 따르지 않음
    print('------------------')
    
print("정규성을 따르는 칼럼:", not_rejected_cols)

  res = hypotest_fun_out(*samples, **kwds)


LotFrontage shapiro test result:
reject H0
------------------
LotArea shapiro test result:
reject H0
------------------
OverallQual shapiro test result:
reject H0
------------------
OverallCond shapiro test result:
reject H0
------------------
YearBuilt shapiro test result:
reject H0
------------------
YearRemodAdd shapiro test result:
reject H0
------------------
MasVnrArea shapiro test result:
reject H0
------------------
ExterQual shapiro test result:
reject H0
------------------
ExterCond shapiro test result:
reject H0
------------------
BsmtQual shapiro test result:
reject H0
------------------
BsmtCond shapiro test result:
reject H0
------------------
TotalBsmtSF shapiro test result:
reject H0
------------------
HeatingQC shapiro test result:
reject H0
------------------
LowQualFinSF shapiro test result:
reject H0
------------------
GrLivArea shapiro test result:
reject H0
------------------
BsmtFullBath shapiro test result:
reject H0
------------------
BsmtHalfBath shapiro test 

수치형 피쳐 중 정규성을 따르는 피쳐는 없으므로 모든 수치형 피쳐에 대해 Yeo-Johnson 변환을 적용

In [87]:
# Yeo-Johnson 변환으로 수치형 피쳐 스케일링
from scipy import stats

lambdas = {}

for col in numerical_cols:
    trans_num, lambda_ = stats.yeojohnson(train_df[col], lmbda=None)  # yeojohnson 변환
    train_df[col] = trans_num  # 변환된 값을 저장
    lambdas[col] = lambda_  # 각 열에 대해 람다 값을 저장

print(lambdas) # 람다 값 출력

{'LotFrontage': 0.6946667655825296, 'LotArea': 0.2854579854911228, 'OverallQual': 0.7192432330855125, 'OverallCond': 0.23733005033591364, 'YearBuilt': 22.11061658680637, 'YearRemodAdd': 31.775206020135464, 'MasVnrArea': -0.23287643489132376, 'ExterQual': -1.5974108378575134, 'ExterCond': -0.2994274886563322, 'BsmtQual': 0.09270030415062114, 'BsmtCond': 1.2314032448160812, 'TotalBsmtSF': 0.743177392771308, 'HeatingQC': 2.7611381596783606, 'LowQualFinSF': -9.984326633734911, 'GrLivArea': 0.039219512810829874, 'BsmtFullBath': -1.5539239981673898, 'BsmtHalfBath': -25.860818797314963, 'FullBath': 0.781536002411875, 'HalfBath': -2.2394195537130983, 'BedroomAbvGr': 0.9287904674570548, 'KitchenAbvGr': -1.8547421613208153, 'KitchenQual': -0.005436568845509851, 'TotRmsAbvGrd': 0.07843449762085529, 'Fireplaces': -0.3946270950594283, 'FireplaceQu': 4.409621417558356, 'GarageYrBlt': 28.85873893451191, 'GarageCars': 1.3443867519892132, 'GarageArea': 0.810834493579752, 'GarageQual': 1.883990227978145

In [88]:
# 정규화 전 평균 및 표준편차 기록
numerical_cols_mean = {} 
numerical_cols_std = {}

for col in numerical_cols:
    trans_num, lambda_ = stats.yeojohnson(train_df[col], lmbda=None)  # yeojohnson 변환
    numerical_cols_mean[col] = train_df[col].agg('mean')  # 변환된 값을 저장
    numerical_cols_std[col] = train_df[col].agg('std')  # 각 열에 대해 람다 값을 저장

print('SalePrice_mean: ', numerical_cols_mean['SalePrice'])
print('SalePrice_std: ', numerical_cols_std['SalePrice'])

SalePrice_mean:  9.3309226951343
SalePrice_std:  0.23274601184129493


In [89]:
# StandardScaler로 정규화를 추가로 진행
from sklearn.preprocessing import StandardScaler

# StandardScaler 생성 및 데이터 변환
scaler = StandardScaler()
train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])

In [90]:
train_df.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,TotalBsmtSF,HeatingQC,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,BuildingAge,TotalBsmtArea,TotalBaths,BuildingQuality
count,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0,1455.0
mean,2.661483e-16,-1.61154e-16,1.733627e-16,2.335512e-15,-9.522738000000001e-17,5.054376e-16,-2.1975550000000002e-17,-7.508312e-15,1.709209e-16,1.489454e-16,7.319079e-16,-1.294116e-16,-4.028851e-17,3.4184190000000005e-17,3.793834e-15,9.888997000000001e-17,3.784678e-17,-5.396218e-16,-8.362917e-17,6.348492e-17,-2.450274e-15,4.962811e-16,-1.684792e-15,-8.790219000000001e-17,-8.546047e-17,2.7469440000000002e-17,8.790219000000001e-17,-2.099886e-16,4.883455e-16,1.61154e-16,-5.860146e-17,1.098777e-17,-6.592665e-17,2.930073e-17,1.098777e-17,-2.441728e-18,4.578239e-17,-1.648166e-16,-1.449776e-18,-1.729964e-15,-8.057701e-17,-1.294116e-16,4.4561530000000005e-17,6.00665e-16
std,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,4.382649e-14,1.000344,1.000344,1.000344,1.000344,1.000344
min,-2.789207,-3.201546,-4.24262,-6.125109,-2.273141,-1.53043,-0.8275383,-4.086289,-9.495216,-2.712485,-6.608245,-3.131904,-2.100252,-0.1348871,-4.274103,-0.8378948,-0.2412091,-3.074095,-0.7735473,-3.635939,-20.01825,-2.718817,-4.049491,-1.024886,-2.435069,-2.250637,-2.151497,-2.537014,-6.543212,-6.185503,-0.9496145,-1.070925,-0.408411,-0.1295048,-0.2943328,-0.06434895,-0.1886457,-2.181034,-5.995204e-14,-4.080256,-0.8482414,-3.131904,-1.645916,-4.208274
25%,-0.4478317,-0.4211146,-0.7833332,-0.4760758,-0.7098303,-0.9553623,-0.8275383,-0.6544803,-0.1948115,-0.8172829,-0.04700219,-0.5476075,-1.214138,-0.1348871,-0.7203273,-0.8378948,-0.2412091,-1.02351,-0.7735473,-1.059673,-0.1729824,-0.7477711,-0.9446681,-1.024886,-0.4814771,-0.8181892,-1.04541,-0.6082683,0.07180706,0.07023751,-0.9496145,-1.070925,-0.408411,-0.1295048,-0.2943328,-0.06434895,-0.1886457,-0.4451873,-2.68674e-14,-0.6181993,-0.8482414,-0.5476075,-0.2078886,-0.4433828
50%,0.06768134,0.05208206,-0.04077851,-0.4760758,-0.1010213,0.3378893,-0.8275383,-0.6544803,-0.1948115,0.6879536,-0.04700219,-0.08735977,0.9307026,-0.1348871,0.05159024,-0.8378948,-0.2412091,0.8044529,-0.7735473,0.1741817,-0.1729824,-0.7477711,-0.2309331,0.7925466,0.4323573,-0.1698555,0.2770955,0.08274381,0.07180706,0.07023751,-0.9496145,0.4322396,-0.408411,-0.1295048,-0.2943328,-0.06434895,-0.1886457,-0.0685124,6.106227e-15,-0.0418604,-0.8482414,-0.08735977,-0.2078886,0.1540078
75%,0.5082447,0.4823494,0.6724711,0.4410559,1.030609,0.971564,1.183223,1.109464,-0.1948115,0.6879536,-0.04700219,0.5821016,0.9307026,-0.1348871,0.6478893,1.178944,-0.2412091,0.8044529,1.284347,0.1741817,-0.1729824,0.7789691,0.3943461,0.7925466,0.4323573,0.9945133,0.2770955,0.5072386,0.07180706,0.07023751,1.04007,0.8931627,-0.408411,-0.1295048,-0.2943328,-0.06434895,-0.1886457,0.6454491,3.902434e-14,0.639278,1.295285,0.5821016,0.4255291,0.7843141
max,4.834042,4.864027,2.677305,2.696282,1.50447,1.401206,1.54499,2.150175,4.415027,1.941146,3.567251,8.687042,0.9307026,7.413605,4.25463,1.865844,4.145781,2.501362,1.614241,6.087534,5.313877,2.025032,3.427494,2.175042,4.035255,1.572797,3.384134,3.796773,10.47536,13.51154,1.519106,1.888376,2.45244,7.721723,3.397538,15.54027,5.300943,1.960678,7.188694e-14,3.265572,1.605797,8.687042,4.013035,5.228735


In [125]:
'''
# 아래는 예측값(preds) 역변환에 필요한 람다값 및 코드

# Yeo-Johnson 변환에 활용된 람다값
lambda_ = -0.044048042422657335

# 정규화에 활용된 SalePrice의 평균 및 표준편차
SalePrice_mean = 9.3309226951343
SalePrice_std = 0.23274601184129493

# 정규화를 먼저 역변환
preds = preds * SalePrice_std
preds = preds + SalePrice_mean

# Yeo-Johnson 변환을 역변환
# 역변환 함수 정의
def inverse_yeojohnson(y, lambda_):
    # y가 NumPy 배열인 경우를 대비
    y = np.asarray(y)

    # 각 요소에 대해 역변환 수행
    for i in range(len(y)):
        if y[i] >= 0 and lambda_ == 0:
            y[i] = exp(y[i]) - 1
        elif y[i] >= 0 and lambda_ != 0:
            y[i] = (y[i] * lambda_ + 1) ** (1 / lambda_) - 1
        elif y[i] < 0 and lambda_ != 2:
            y[i] = 1 - (-(2 - lambda_) * y[i] + 1) ** (1 / (2 - lambda_))
        elif y[i] < 0 and lambda_ == 2:
            y[i] = 1 - exp(-y[i])
    
    return y  # 수정된 결과 반환

# 예측값(preds)에 대해 역변환 수행
preds = inverse_yeojohnson(preds, lambda_)

'''

'\n# 아래는 예측값(preds) 역변환에 필요한 람다값 및 코드\n\n# Yeo-Johnson 변환에 활용된 람다값\nlambda_ = -0.044048042422657335\n\n# 정규화에 활용된 SalePrice의 평균 및 표준편차\nSalePrice_mean = 9.3309226951343\nSalePrice_std = 0.23274601184129493\n\n# 정규화를 먼저 역변환\npreds = preds * SalePrice_std\npreds = preds + SalePrice_mean\n\n# Yeo-Johnson 변환을 역변환\n# 역변환 함수 정의\ndef inverse_yeojohnson(y, lambda_):\n    # y가 NumPy 배열인 경우를 대비\n    y = np.asarray(y)\n\n    # 각 요소에 대해 역변환 수행\n    for i in range(len(y)):\n        if y[i] >= 0 and lambda_ == 0:\n            y[i] = exp(y[i]) - 1\n        elif y[i] >= 0 and lambda_ != 0:\n            y[i] = (y[i] * lambda_ + 1) ** (1 / lambda_) - 1\n        elif y[i] < 0 and lambda_ != 2:\n            y[i] = 1 - (-(2 - lambda_) * y[i] + 1) ** (1 / (2 - lambda_))\n        elif y[i] < 0 and lambda_ == 2:\n            y[i] = 1 - exp(-y[i])\n    \n    return y  # 수정된 결과 반환\n\n# 예측값(preds)에 대해 역변환 수행\npreds = inverse_yeojohnson(preds, lambda_)\n\n'

In [92]:
# 전처리 결과 csv로 저장
train_df.to_csv('preprocessing_DataFrame_train_fin.csv', index=True)