## 데이터 수집

In [74]:
import matplotlib.pyplot as plt
import matplotlib as mpl # 한글 폰트 설정 (NanumGothic) 
mpl.rcParams['font.family'] = 'NanumGothic'
mpl.rcParams['axes.unicode_minus'] = False  # 마이너스 기호 깨짐 방지
import seaborn as sns
import pandas as pd
import numpy as np 
import scipy.stats as stats
from sklearn import datasets 
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [75]:
# Read the house-prices training CSV into a DataFrame and preview the first rows
train_csv_path = '../../datasets/house-prices-advanced-regression-techniques_train.csv'
data_df = pd.read_csv(train_csv_path)
data_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 데이터 분석

In [76]:
# Per-column overview of the loaded frame: non-null counts and dtypes
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [77]:
# Summary statistics of the numeric columns, transposed so each column becomes one row
data_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [78]:
# Summary (count / unique / top / freq) of the object-dtype columns, one row per column
data_df.describe(include='object').T


Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


In [79]:
# Find rows where LotFrontage, MasVnrArea and GarageYrBlt are all NaN at the same time
nan_check_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
all_missing_mask = data_df[nan_check_columns].isna().all(axis=1)
empty_rows = data_df.loc[all_missing_mask]

# Report how many such rows exist, then print the rows themselves
print("세 가지 컬럼이 동시에 비어 있는 행 수:", len(empty_rows))
print("비어 있는 행의 샘플:")
print(empty_rows)

세 가지 컬럼이 동시에 비어 있는 행 수: 0
비어 있는 행의 샘플:
Empty DataFrame
Columns: [Id, MSSubClass, MSZoning, LotFrontage, LotArea, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, MasVnrArea, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, Heating, HeatingQC, CentralAir, Electrical, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, KitchenQual, TotRmsAbvGrd, Functional, Fireplaces, FireplaceQu, GarageType, GarageYrBlt, GarageFinish, GarageCars, GarageArea, GarageQual, GarageCond, PavedDrive, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, PoolQC, Fence, MiscFeature, MiscVal, MoSold, YrSold, SaleType, SaleCondition, SalePrice]

## 데이터 전처리 

#### 스케일링 
- 우선, 4개 컬럼만 가지고 수행하자.

- Target : LotArea - 토지 면적 (제곱피트)
- Feature : BsmtUnfSF, LowQualFinSF, HalfBath, Fireplaces 
- 지하실 미완성 면적 (제곱피트), 저품질 마감 면적 (모든 층), 지상 반 욕실 수, 벽난로 수

In [80]:
# Drop every object-dtype column, keeping only the non-object (numeric) ones
numeric_df = data_df.select_dtypes(exclude=['object'])
numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [81]:
# Regression target
target_column = 'LotArea'
# NOTE: despite its name, this list also carries the target column as its last entry,
# so any frame selected with it holds the 4 predictors plus LotArea.
feature_columns = ['BsmtUnfSF','LowQualFinSF','HalfBath','Fireplaces', target_column]

# Narrow the numeric frame down to the predictors and the target
select_df = numeric_df[feature_columns]
select_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   BsmtUnfSF     1460 non-null   int64
 1   LowQualFinSF  1460 non-null   int64
 2   HalfBath      1460 non-null   int64
 3   Fireplaces    1460 non-null   int64
 4   LotArea       1460 non-null   int64
dtypes: int64(5)
memory usage: 57.2 KB


In [82]:
# Fit the standardizer on the predictor columns only (the target is dropped first).
# NOTE(review): the fit uses every row of select_df, i.e. it happens before the
# train/test split performed later in the notebook, so held-out rows contribute
# to the scaler's statistics.
standardscaler = StandardScaler()
standardscaler.fit(select_df.drop(columns=[target_column]))

In [83]:
# Predictor columns only; computed once and reused for both the transform and the column labels
feature_only_df = select_df.drop(columns=[target_column])

# Apply the already-fitted scaler and wrap the resulting array back into a DataFrame
scailing_array = standardscaler.transform(feature_only_df)
scailing_df = pd.DataFrame(scailing_array, columns=feature_only_df.columns)

# Re-attach the unscaled target as the last column
scailing_df[target_column] = select_df[target_column].values
scailing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   BsmtUnfSF     1460 non-null   float64
 1   LowQualFinSF  1460 non-null   float64
 2   HalfBath      1460 non-null   float64
 3   Fireplaces    1460 non-null   float64
 4   LotArea       1460 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 57.2 KB


In [84]:
# Preview the scaled predictors next to the unscaled target
scailing_df.head()

Unnamed: 0,BsmtUnfSF,LowQualFinSF,HalfBath,Fireplaces,LotArea
0,-0.944591,-0.120242,1.227585,-0.951226,8450
1,-0.641228,-0.120242,-0.761621,0.600495,9600
2,-0.301643,-0.120242,1.227585,0.600495,11250
3,-0.06167,-0.120242,-0.761621,0.600495,9550
4,-0.174865,-0.120242,1.227585,0.600495,14260


#### 데이터 분할

In [85]:
# Inspect the scaled frame once more before splitting
scailing_df.info()

# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the same rows land in train and test on every run,
# so results can be reproduced after a kernel restart.
train_features, test_features, train_label, test_label = train_test_split(
    scailing_df.drop(columns=[target_column]),
    scailing_df[target_column],
    test_size=0.2,
    random_state=42,
)
train_features.shape, test_features.shape, train_label.shape, test_label.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   BsmtUnfSF     1460 non-null   float64
 1   LowQualFinSF  1460 non-null   float64
 2   HalfBath      1460 non-null   float64
 3   Fireplaces    1460 non-null   float64
 4   LotArea       1460 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 57.2 KB


((1168, 4), (292, 4), (1168,), (292,))

## 모델 학습 
- 연속형 데이터 타입 대상 학습, 배포 시엔 컬럼을 4개 선별해 서비스

- Target : LotArea - 토지 면적 (제곱피트)
- Feature : BsmtUnfSF, LowQualFinSF, HalfBath, Fireplaces 
- 지하실 미완성 면적 (제곱피트), 저품질 마감 면적 (모든 층), 지상 반 욕실 수, 벽난로 수

In [86]:
# Fit a random-forest regressor on the training split.
# random_state pins the forest's internal randomness so that refitting on the
# same data yields the same model.
randomforest = RandomForestRegressor(random_state=42)
randomforest.fit(X=train_features, y=train_label)

In [87]:
# Sort by LowQualFinSF in descending order (ascending=False) and show the rows with the largest values
numeric_df[feature_columns].sort_values(by='LowQualFinSF',ascending=False).head()

Unnamed: 0,BsmtUnfSF,LowQualFinSF,HalfBath,Fireplaces,LotArea
185,1107,572,1,2,22950
170,360,528,1,0,12358
635,1184,515,0,0,10896
1009,1008,514,0,0,6000
88,1013,513,0,0,8470


## 모델 평가 

- 아무리 split을 다시 해도 test R²가 양수로 나오지 않음.
- 아무래도 데이터 분포 문제와 중복 레코드를 제거하지 않은 것이 원인인 듯함.

In [88]:
# R^2 of the forest on the rows it was trained on
predict_train = randomforest.predict(train_features)

r2_score(train_label, predict_train)

0.7183778775093148

In [89]:
# R^2 of the forest on the held-out rows
predict_test = randomforest.predict(test_features)

r2_score(test_label, predict_test)

-1.0988099750394462

## 모델 배포

In [90]:
import pickle

# Bundle the objects to persist: the fitted regressor and the fitted StandardScaler.
# NOTE: the scaler is stored under the key 'selector' even though it is a scaler,
# not a feature selector; the key is left unchanged in case loaders elsewhere read it.
model_info = {
    'model': randomforest,  # trained RandomForestRegressor
    'selector': standardscaler  # fitted StandardScaler (not a feature selector)
}

# Destination of the serialized bundle (plain string: nothing to interpolate)
save_file_name = '../../models/house-prices-advanced-regression-techniques.pkl'

# Serialize the model and the scaler together into a single pickle file
with open(save_file_name, 'wb') as save_file:
    pickle.dump(model_info, save_file)

print("모델과 선택된 특성이 저장되었습니다.")


모델과 선택된 특성이 저장되었습니다.
