In [1]:
# 데이터 load
import pandas as pd
import numpy as np

df_floor = pd.read_excel('use_data/침수관련_데이터.xlsx')

# 컬럼 이름 변경
cols = ['자치구', '날짜', '1hr_최대강수량', '일평균강수량', '경사도', '해발고도', '불투수면',
       '녹지면적율', '하천면적율', '복개하천개수', '맨홀개수', '빗물받이개수',
       '빗물펌프개수', 'AVG_하수관로비율', 'MAX_하수관로비율', '침수피해']

df_floor.columns = cols

# 이상치 제거(강수량 0일 때 침수피해 1인 경우)
df_floor.loc[(df_floor['1hr_최대강수량'] == 0) & (df_floor['일평균강수량'] == 0)
             & (df_floor['침수피해'] == 1), '침수피해'] = np.NaN


# datetime 형태로 변경 및 na값 처리
df_floor['날짜'] = pd.to_datetime(df_floor['날짜'], format='%Y%m%d')
df_floor.loc[df_floor['AVG_하수관로비율'] == 'na', 'AVG_하수관로비율'] = np.NaN
df_floor.loc[df_floor['MAX_하수관로비율'] == 'na', 'MAX_하수관로비율'] = np.NaN

# 수치형 데이터로 변경
df_floor['AVG_하수관로비율'] = df_floor['AVG_하수관로비율'].astype('float')
df_floor['MAX_하수관로비율'] = df_floor['MAX_하수관로비율'].astype('float')

# null값 제거
df_floor = df_floor.dropna(axis=0, how='any')

del df_floor['침수피해']

# 월/일로 분리
# df_floor['year'] = df_floor['날짜'].dt.year
df_floor['month'] = df_floor['날짜'].dt.month
df_floor['day'] = df_floor['날짜'].dt.day

del df_floor['날짜']

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# x, y 분리 함수 생성
def split_xy(df, encoding, target):
    from sklearn.model_selection import train_test_split
    if encoding == 'onehot':
        if target == 'avg':
            df_x = df.iloc[:, [i for i in range(len(df.columns)) if i not in (11, 12)]]
            df_y = df.iloc[:, 11]
            return df_x, df_y
        elif target == 'max':
            df_x = df.iloc[:, [i for i in range(len(df.columns)) if i not in (11, 12)]]
            df_y = df.iloc[:, 12]
            return df_x, df_y
        else:
            print('target is avg or max')
    elif encoding == 'label':
        if target == 'avg':
            df_x = df.iloc[:, [i for i in range(len(df.columns)) if i not in (12, 13)]]
            df_y = df.iloc[:, 12]
            return df_x, df_y
        elif target == 'max':
            df_x = df.iloc[:, [i for i in range(len(df.columns)) if i not in (12, 13)]]
            df_y = df.iloc[:, 13]
            return df_x, df_y
        else:
            print('target is avg or max')
    else:
        print('encoding is onehot or label')

# 평가지표 함수 생성
def regressor_metrics(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f'MAE: {mae:.4f}')
    print(f'MSE: {mse:.4f}')
    print(f'RMSE: {rmse:.4f}')
    print(f'R2: {r2:.4f}')

In [4]:
# Onehot encoding
df_floor_onehot = pd.get_dummies(df_floor)

# LightGBM의 경우 bool type이 허용되지 않아 수치형으로 변경
df_floor_onehot.iloc[:, 15:] = df_floor_onehot.iloc[:, 15:].astype('int')

# x, y 데이터 생성
df_x, df_y = split_xy(df_floor_onehot, 'onehot', 'max')
del df_x['경사도']
del df_x['빗물받이개수']
del df_x['일평균강수량']
print(df_x.shape, df_y.shape)

# 학습 및 평가를 위해 train, test set 분리
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0, shuffle=True)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(10812, 35) (10812,)
(8649, 35) (2163, 35) (8649,) (2163,)


In [5]:
# 결과가 제일 좋았던 LGBMRegressor 및 파라미터 채택
pipe_lgb5 = Pipeline(
    [
        ('scaler', MinMaxScaler()),
        ('LGBM', LGBMRegressor(random_state=0, n_jobs=-1, force_col_wise=True, verbose=-1, learning_rate=0.01, max_depth=25,
                               n_estimators=300, num_leaves=300))
    ]
)

pipe_lgb5.fit(x_train, y_train)
pred = pipe_lgb5.predict(x_test)
regressor_metrics(y_test, pred)

MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575


In [9]:
import joblib
import pickle

saved_model = joblib.dump(pipe_lgb5, 'result_model/LightGBMRegressor.pkl')

In [10]:
saved = joblib.load('result_model/LightGBMRegressor.pkl')
saved

In [11]:
df_floor_onehot.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10812 entries, 0 to 11499
Data columns (total 40 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   1hr_최대강수량   10812 non-null  float64
 1   일평균강수량      10812 non-null  float64
 2   경사도         10812 non-null  float64
 3   해발고도        10812 non-null  float64
 4   불투수면        10812 non-null  float64
 5   녹지면적율       10812 non-null  float64
 6   하천면적율       10812 non-null  float64
 7   복개하천개수      10812 non-null  int64  
 8   맨홀개수        10812 non-null  int64  
 9   빗물받이개수      10812 non-null  int64  
 10  빗물펌프개수      10812 non-null  int64  
 11  AVG_하수관로비율  10812 non-null  float64
 12  MAX_하수관로비율  10812 non-null  float64
 13  month       10812 non-null  int32  
 14  day         10812 non-null  int32  
 15  자치구_강남      10812 non-null  int32  
 16  자치구_강동      10812 non-null  int32  
 17  자치구_강북      10812 non-null  int32  
 18  자치구_강서      10812 non-null  int32  
 19  자치구_관악      10812 non-null  in

In [13]:
# 데이터 프레임과 각 자치구의 이름에 해당하는 열을 가져오기
district_columns = [col for col in df_floor_onehot.columns if col.startswith('자치구_')]

# 딕셔너리를 사용하여 각 자치구의 모델을 저장할 준비
models = {}

# 각 자치구별로 모델을 만들고 저장
for district in district_columns:
    print(f"Training model for {district}")
    
    # 해당 자치구 데이터 추출
    district_data = df_floor_onehot[df_floor_onehot[district] == 1]
    
    # X와 y 분할
    df_x, df_y = split_xy(district_data, 'onehot', 'max')
    del df_x['경사도']
    del df_x['빗물받이개수']
    del df_x['일평균강수량']
    
    # 모델 선택 및 훈련
    pipe_lgb5.fit(x_train, y_train)
    pred = pipe_lgb5.predict(x_test)
    regressor_metrics(y_test, pred)
    
    # 모델을 딕셔너리에 저장
    models[district] = pipe_lgb5
    
    # 모델을 파일로 저장
    model_filename = f"result_model/{district}_model.pkl"
    joblib.dump(pipe_lgb5, model_filename)

print("Models trained and saved.")

Training model for 자치구_강남
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_강동
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_강북
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_강서
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_관악
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_광진
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_구로
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_금천
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_노원
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_도봉
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_동대문
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_동작
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_마포
MAE: 0.1522
MSE: 0.9555
RMSE: 0.9775
R2: 0.0575
Training model for 자치구_서대문
MAE: 0.152