## 데이터 수집

In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl # 한글 폰트 설정 (NanumGothic) 
mpl.rcParams['font.family'] = 'NanumGothic'
mpl.rcParams['axes.unicode_minus'] = False  # 마이너스 기호 깨짐 방지
import seaborn as sns
import pandas as pd
import numpy as np 
import scipy.stats as stats
from sklearn import datasets 
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

In [3]:
train_df = pd.read_csv('../../datasets/house-prices-advanced-regression-techniques_train.csv')
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
test_df = pd.read_csv('../../datasets/house-prices-advanced-regression-techniques_test.csv')
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


## 데이터 분석

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [9]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [8]:
train_df.describe(include='object').T


Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


In [10]:
# 세 가지 컬럼이 동시에 NaN인 행 확인
empty_rows = train_df[train_df[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']].isnull().all(axis=1)]

# 결과 출력
print("세 가지 컬럼이 동시에 비어 있는 행 수:", empty_rows.shape[0])
print("비어 있는 행의 샘플:")
print(empty_rows)

세 가지 컬럼이 동시에 비어 있는 행 수: 0
비어 있는 행의 샘플:
Empty DataFrame
Columns: [Id, MSSubClass, MSZoning, LotFrontage, LotArea, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, MasVnrArea, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, Heating, HeatingQC, CentralAir, Electrical, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, KitchenQual, TotRmsAbvGrd, Functional, Fireplaces, FireplaceQu, GarageType, GarageYrBlt, GarageFinish, GarageCars, GarageArea, GarageQual, GarageCond, PavedDrive, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, PoolQC, Fence, MiscFeature, MiscVal, MoSold, YrSold, SaleType, SaleCondition, SalePrice]

## 데이터 전처리 
- 우선, 연속형 결측치 부터 채우고 가자

#### 추가 고려사항
- 결측치만 고려할 게 아니라 값의 분포도 고려 해야해
- 0 이 많은 값이라던가, 한쪽 쏠림이 있던가, 테일이 길던가, 이상치가 있던가, 정규분포가 아니던가 하는 상황도 고려해야함. 
- 애초에 데이터가 정규분포가 아니면 학습해도 성능 안좋음. 그런 상황이라면 데이터가 정규분포가 되도록 추가 데이터 수집을 해야한다. 

In [57]:
def train_model(df, target_column, model=RandomForestRegressor()):
    """
    주어진 데이터프레임과 타겟 컬럼, 모델로 학습을 수행하고 평가 지표를 반환하는 함수
    
    Parameters:
    df : DataFrame - 학습에 사용할 데이터프레임
    target_column : str - 예측하고자 하는 타겟 컬럼명
    model : sklearn model object - 학습에 사용할 모델 (기본값: RandomForestRegressor)
    
    Returns:
    dict - 모델 성능 평가 결과
    """
    
    # 특성과 타겟 분리
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # 모델 학습
    model.fit(X, y)
    
    # 예측
    predictions = model.predict(X)
    
    # 평가 지표 계산
    results = {
        'model_name': model.__class__.__name__,
        'r2_score': r2_score(y_true=y, y_pred=predictions),
        'mse': mean_squared_error(y_true=y, y_pred=predictions),
        'model_score': model.score(X, y)
    }
    
    return results, model

In [61]:
def evaluate_predictions(df, target_column, trained_model):
    """
    학습된 모델을 사용하여 예측을 수행하고 결과를 평가하는 함수
    
    Parameters:
    df : DataFrame - 예측에 사용할 데이터프레임
    target_column : str - 예측하고자 하는 타겟 컬럼명
    trained_model : sklearn model - 학습된 모델
    
    Returns:
    DataFrame - 실제값과 예측값을 포함한 데이터프레임
    """
    # 타겟 컬럼의 실제값이 있는 데이터만 선택 (검증용)
    test_df = df[df[target_column].notna()].copy()
      
    # 예측을 위한 특성 준비
    X_test = test_df.drop(columns=[target_column])
    
    # 예측 수행
    predictions = trained_model.predict(X_test)
    
    # 결과 비교 데이터프레임 생성
    comparison = pd.DataFrame({
        'actual': test_df[target_column],
        'predicted': predictions,
        'difference': test_df[target_column] - predictions
    })
    
    return comparison

In [13]:
numeric_df = train_df.select_dtypes(exclude=['object'])
numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [14]:
rf_results, rf_model = train_model(numeric_df.dropna(), 'LotFrontage')

In [15]:
rf_results

{'model_name': 'RandomForestRegressor',
 'r2_score': 0.9265891089245236,
 'mse': 43.19150758251561,
 'model_score': 0.9265891089245236}

In [19]:
numeric_df.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Index: 1121 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1121 non-null   int64  
 1   MSSubClass     1121 non-null   int64  
 2   LotFrontage    1121 non-null   float64
 3   LotArea        1121 non-null   int64  
 4   OverallQual    1121 non-null   int64  
 5   OverallCond    1121 non-null   int64  
 6   YearBuilt      1121 non-null   int64  
 7   YearRemodAdd   1121 non-null   int64  
 8   MasVnrArea     1121 non-null   float64
 9   BsmtFinSF1     1121 non-null   int64  
 10  BsmtFinSF2     1121 non-null   int64  
 11  BsmtUnfSF      1121 non-null   int64  
 12  TotalBsmtSF    1121 non-null   int64  
 13  1stFlrSF       1121 non-null   int64  
 14  2ndFlrSF       1121 non-null   int64  
 15  LowQualFinSF   1121 non-null   int64  
 16  GrLivArea      1121 non-null   int64  
 17  BsmtFullBath   1121 non-null   int64  
 18  BsmtHalfBath 

In [68]:
# 메인 실행 코드
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
    'LinearRegression': LinearRegression()
}

target_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [69]:

# 결과를 저장할 딕셔너리 초기화
results = {model_name: {} for model_name in models.keys()}
trained_models = {model_name: {} for model_name in models.keys()}

# 타겟별로 각 모델의 성능을 비교
for target in target_columns:
    print(f"\n{'='*50}")
    print(f"Target Variable: {target}")
    print('='*50)
    
    target_results = {}
    
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        
        # 모델 학습
        result, trained_model = train_model(numeric_df.dropna(), target, model=model)
        results[model_name][target] = result
        trained_models[model_name][target] = trained_model
        
        # 결과 저장
        target_results[model_name] = {
            'R2': result['r2_score'],
            'MSE': result['mse']
        }
        
        # 학습 결과 출력
        print(f"Training Results:")
        print(f"R2 Score: {result['r2_score']:.4f}")
        print(f"MSE: {result['mse']:.4f}")
        
        # 예측 평가
        comparison = evaluate_predictions(numeric_df.dropna(), target, trained_model)
        print("\nSample predictions (first 5 rows):")
        print(comparison.head().round(2))
    
    # 타겟별 모델 성능 비교
    print(f"\n{'-'*50}")
    print(f"Performance Comparison for {target}:")
    print('-'*50)
    comparison_df = pd.DataFrame(target_results).round(4)
    print("\nR2 Scores:")
    print(comparison_df.loc['R2'].sort_values(ascending=False))
    print("\nMSE Scores:")
    print(comparison_df.loc['MSE'].sort_values())


Target Variable: LotFrontage

Training RandomForest...
Training Results:
R2 Score: 0.9267
MSE: 43.1488

Sample predictions (first 5 rows):
   actual  predicted  difference
0    65.0      65.92       -0.92
1    80.0      78.97        1.03
2    68.0      74.81       -6.81
3    60.0      60.55       -0.55
4    84.0      86.63       -2.63

Training SVR...
Training Results:
R2 Score: 0.1247
MSE: 514.9867

Sample predictions (first 5 rows):
   actual  predicted  difference
0    65.0      72.80       -7.80
1    80.0      69.34       10.66
2    68.0      74.92       -6.92
3    60.0      64.29       -4.29
4    84.0      78.28        5.72

Training LinearRegression...
Training Results:
R2 Score: 0.4547
MSE: 320.8157

Sample predictions (first 5 rows):
   actual  predicted  difference
0    65.0      69.67       -4.67
1    80.0      74.09        5.91
2    68.0      72.82       -4.82
3    60.0      67.51       -7.51
4    84.0      89.04       -5.04

------------------------------------------------

In [72]:
# 결측치 채우기
filled_df = numeric_df.copy()

for target in target_columns:
    print(f"\nFilling missing values for {target}...")
    
    # 결측치가 있는 행 찾기
    missing_rows = filled_df[filled_df[target].isna()]
    
    if len(missing_rows) > 0:
        # 예측에 사용할 특성 준비 (타겟 컬럼 제외)
        X_missing = missing_rows.drop(columns=target_columns)
        
        # RandomForest 모델로 예측
        rf_model = trained_models['RandomForest'][target]
        print(X_missing.info())
        predictions = rf_model.predict(X_missing)
        
        # 결측치 채우기
        filled_df.loc[missing_rows.index, target] = predictions
        
        print(f"Filled {len(missing_rows)} missing values")
        print(f"Sample of filled values (first 5):")
        print(pd.DataFrame({
            'Predicted_Value': predictions[:5]
        }).round(2))

# 결과 확인
print("\nMissing values after filling:")
print(filled_df[target_columns].isnull().sum())

# 원본과 비교
print("\nOriginal missing values:")
print(numeric_df[target_columns].isnull().sum())


Filling missing values for LotFrontage...
<class 'pandas.core.frame.DataFrame'>
Index: 259 entries, 7 to 1446
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Id             259 non-null    int64
 1   MSSubClass     259 non-null    int64
 2   LotArea        259 non-null    int64
 3   OverallQual    259 non-null    int64
 4   OverallCond    259 non-null    int64
 5   YearBuilt      259 non-null    int64
 6   YearRemodAdd   259 non-null    int64
 7   BsmtFinSF1     259 non-null    int64
 8   BsmtFinSF2     259 non-null    int64
 9   BsmtUnfSF      259 non-null    int64
 10  TotalBsmtSF    259 non-null    int64
 11  1stFlrSF       259 non-null    int64
 12  2ndFlrSF       259 non-null    int64
 13  LowQualFinSF   259 non-null    int64
 14  GrLivArea      259 non-null    int64
 15  BsmtFullBath   259 non-null    int64
 16  BsmtHalfBath   259 non-null    int64
 17  FullBath       259 non-null    int64
 18  HalfBath   

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- LotFrontage
- MasVnrArea


## 모델 학습 
- 연속형 데이터 타입 대상 학습, 배포 시엔 컬럼을 4개 선별해 서비스

- Target : LotArea - 토지 면적 (제곱피트)
- Feature : BsmtUnfSF, LowQualFinSF, HalfBath, Fireplaces 
- 지하실 미완성 면적 (제곱피트), 저품질 마감 면적 (모든 층), 지상 반 욕실 수, 벽난로 수

In [35]:
target_column = 'LotArea'
feature_columns = ['BsmtUnfSF','LowQualFinSF','HalfBath','Fireplaces', target_column]

result, trained_model = train_model(numeric_df[feature_columns], target_column, model=model)

In [44]:
# LowQualFinSF 컬럼을 기준으로 오름차순 정렬
numeric_df[feature_columns].sort_values(by='LowQualFinSF',ascending=False).head()


Unnamed: 0,BsmtUnfSF,LowQualFinSF,HalfBath,Fireplaces,LotArea
185,1107,572,1,2,22950
170,360,528,1,0,12358
635,1184,515,0,0,10896
1009,1008,514,0,0,6000
88,1013,513,0,0,8470


In [46]:
# 1107,572,1,2
data = [1107,572,1,2]

In [48]:
trained_model.predict([data])




array([19290.85])

## 모델 평가 

In [49]:
result

{'model_name': 'RandomForestRegressor',
 'r2_score': 0.713316676713385,
 'mse': 28541449.989138145,
 'model_score': 0.713316676713385}

## 모델 배포

In [50]:
import pickle 
save_file_name = f'../../models/house-prices-advanced-regression-techniques.pkl'

with open(save_file_name, 'wb') as save_file:
    pickle.dump(trained_model, file=save_file)
    pass
