In [89]:
import pandas as pd

# 데이터 로드
train_df = pd.read_csv("./house_prediction/train.csv")  # 파일 경로가 올바른지 확인하세요

# 수치형 및 범주형 칼럼 구분
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_df.select_dtypes(include=['object']).columns

# 수치형 및 범주형 칼럼 개수 계산
num_numerical = len(numerical_cols)
num_categorical = len(categorical_cols)

# 결과 출력
print(f"Number of Numerical Columns: {num_numerical}")
print(f"Number of Categorical Columns: {num_categorical}")


Number of Numerical Columns: 38
Number of Categorical Columns: 43


In [90]:
# 결측치 찾고 결측치 소팅
missing = train_df.isnull().sum() / len(train_df) * 100
missing = missing[missing>0].sort_values(ascending=False)

# 결측치 비율 80% 넘는 칼럼 삭제위해 missing_drop 만듦
missing_drop = missing[missing>80]
missing_drop

PoolQC         99.520548
MiscFeature    96.301370
Alley          93.767123
Fence          80.753425
dtype: float64

In [91]:
# missing_drop 삭제
train_df = train_df.drop(missing_drop.index, axis=1)

In [92]:
# 확인
train_df.columns
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 77 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [93]:
# 나머지 결측치 자료는 최빈값 대체하기.
# 최빈값 대체 이전에 missing 데이터프레임에도 drop 함께 해주기

missing = missing.drop(missing_drop.index, errors='ignore')
missing

# ex = train_df[missing]

ex = missing.index
numeric_feature = train_df[ex].select_dtypes(['int64', 'float64'])
categorical_feature = train_df[ex].select_dtypes('object')

numeric_feature

Unnamed: 0,LotFrontage,GarageYrBlt,MasVnrArea
0,65.0,2003.0,196.0
1,80.0,1976.0,0.0
2,68.0,2001.0,162.0
3,60.0,1998.0,0.0
4,84.0,2000.0,350.0
...,...,...,...
1455,62.0,1999.0,0.0
1456,85.0,1978.0,119.0
1457,66.0,1941.0,0.0
1458,68.0,1950.0,0.0


In [59]:
# Electrical BsmtFinType1 GarageCond GarageQual GarageYrBlt가 없습니다.

In [100]:
# 범주형 값을 최빈값으로 처리하기.
for col in categorical_feature.columns:
    mode_value = train_df[col].mode(dropna=True)[0]  # 최빈값 계산
    train_df[col].fillna(mode_value, inplace=True)   # 결측치 대체

In [102]:
# 검증
train_df[categorical_feature.columns].isnull().sum()

FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
BsmtExposure    0
BsmtFinType2    0
BsmtFinType1    0
BsmtCond        0
BsmtQual        0
MasVnrType      0
Electrical      0
dtype: int64

In [103]:
# 수치형 값을 평균값으로 처리하기.
for col in numeric_feature.columns:
    mean_value = train_df[col].mean()
    train_df[col].fillna(mean_value, inplace=True)

In [104]:
train_df[numeric_feature.columns].isnull().sum()

LotFrontage    0
GarageYrBlt    0
MasVnrArea     0
dtype: int64

In [105]:
# 서수형 피처를 라벨링 하기.
# 서수형 데이터에 대한 상관관계 확인을 위해 labelEncoder를 활용.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
ordinal_feature = ['ExterQual','ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond', 'SaleCondition', 'FireplaceQu', 'OverallQual','OverallCond' ]

# 사용자 지정 매핑 사전
mapping = {
    'Ex': 5,  # Excellent
    'Gd': 4,  # Good
    'TA': 3,  # Typical/Average
    'Fa': 2,  # Fair
    'Po': 1   # Poor
}

for col in ordinal_feature:
    train_df[col] = train_df[col].map(mapping)


In [106]:
train_df[ordinal_feature]

Unnamed: 0,ExterQual,ExterCond,BsmtQual,BsmtCond,HeatingQC,KitchenQual,GarageQual,GarageCond,SaleCondition,FireplaceQu,OverallQual,OverallCond
0,4,3,4,3,5,4,3,3,,4,,
1,3,3,4,3,5,3,3,3,,3,,
2,4,3,4,3,5,4,3,3,,3,,
3,3,3,3,4,4,4,3,3,,4,,
4,4,3,4,3,5,4,3,3,,3,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1455,3,3,4,3,5,3,3,3,,3,,
1456,3,3,4,3,3,3,3,3,,3,,
1457,5,4,3,4,5,4,3,3,,4,,
1458,3,3,3,3,4,4,3,3,,4,,


In [107]:
train_df.index

RangeIndex(start=0, stop=1460, step=1)

In [108]:
train_df.to_csv('preprocessing_DataFrame.csv', index=False)