# 1. 데이터 전처리
- 필요한 라이브러리 불러오기

In [None]:
import math
import time
import datetime
from tqdm import tqdm
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

## 1-1. 데이터 불러오기

In [None]:
# Load the raw dataset and keep `date` as text so that it can be sliced
# into year / month / day substrings later on.
df = pd.read_csv(
    'C:/wanted/ml/env311/preject/data/rawData_utf.csv',
    encoding='utf-8',
).astype({'date': 'str'})
df.info()

## 1-2. 데이터 전처리
- 결측치 확인 및 대체
    - 강수량(rain) : 0으로 대체
    - 토지 (total_area, field_area, paddy_area, cemetery_area): 동일 지역의 직전 값(전년도 값)으로 채우고, 직전 값이 없으면 이후 값으로 채움
    - 기온 (tempAvg, tempMin, tempMax): 동일 지역·같은 연도·같은 달의 평균
    - 습도 (humMin, humAvg): 동일 지역·같은 연도·같은 달의 평균
    - 풍속 (windMax, windAvg): 동일 지역·같은 연도·같은 달의 평균

In [None]:
# Count the missing values per column once, then show only the columns
# that actually contain missing values.
null_counts = df.isnull().sum()
{col: null_counts[col] for col in df.columns[null_counts > 0].tolist()}

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
filled_df = df.copy()

# Split the `date` string into year / month / day parts
# (the slicing assumes a YYYYMMDD layout -- TODO confirm against the raw data).
filled_df.insert(0, 'dateYear', filled_df['date'].str[:4])
filled_df.insert(1, 'dateMonth', filled_df['date'].str[4:6])
filled_df.insert(2, 'dateDay', filled_df['date'].str[6:])

# Keep January-June only. `.copy()` makes the filtered frame independent so
# the column assignments below neither warn nor get lost on a temporary.
filled_df = filled_df[filled_df['dateMonth'].isin(['01', '02', '03', '04', '05', '06'])].copy()

# Missing rainfall is replaced with 0.
filled_df['rain'] = filled_df['rain'].fillna(0)

# Weather gaps are filled with the mean of the same year / month / region.
# The result is assigned back explicitly: calling `fillna(..., inplace=True)`
# on `filled_df[col]` operates on an intermediate object and does not
# reliably update the frame (it has no effect under pandas Copy-on-Write).
weather_columns = ['tempAvg', 'tempMin', 'tempMax', 'windMax', 'windAvg', 'humMin', 'humAvg']
weather_groups = filled_df.groupby(['dateYear', 'dateMonth', 'sgg_nm'])
for col in weather_columns:
    filled_df[col] = filled_df[col].fillna(weather_groups[col].transform('mean'))

# Area gaps are filled inside each region: forward-fill from the previous
# rows first, then backward-fill whatever is still missing at the start.
# NOTE(review): this relies on the rows of a region being in chronological
# order -- confirm against the raw data.
area_columns = ['total_area', 'field_area', 'paddy_area', 'cemetery_area']
for col in area_columns:
    filled_df[col] = filled_df.groupby('sgg_nm')[col].transform(lambda group: group.ffill().bfill())

# Derived ratio features. The 'cemetary_ratio' spelling is kept as-is because
# it is the column name the rest of the pipeline receives.
data = filled_df.copy()
data['farm_ratio'] = (data['paddy_area'] + data['field_area']) / data['total_area']
data['cemetary_ratio'] = data['cemetery_area'] / data['total_area']
data['population_density'] = data['population'] / data['total_area']

# Move the target column to the last position.
data['occur'] = data.pop('occur')

# Drop the raw date parts, the region key and the columns already folded
# into the ratio features above.
data = data.drop(columns=['date', 'dateYear', 'dateMonth', 'dateDay', 'sgg_nm', 'field_area', 'paddy_area', 'cemetery_area', 'total_area', 'population'])

# One independent frame per province (`sd_nm`), filtered directly instead of
# copying the full frame once per province.
loc_datas = {do: data[data['sd_nm'] == do].copy() for do in data['sd_nm'].unique()}


- 머신러닝 관련 패키지 Import

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, roc_curve, auc as metrics_auc,
                             precision_recall_curve, precision_score, recall_score, roc_auc_score)
from sklearn.metrics import *

# 2. 피쳐엔지니어링

In [None]:
def DataProcessing_v2(input_data, test_size=0.2, random_state=42, stratify=True):
    """
    Prepare a dataset for model training: feature selection, train/test
    split, resampling of the training part and standardisation.

    Args:
    - input_data (pd.DataFrame): dataset containing the feature columns, the
      `sd_nm` column and the `occur` target column.
    - test_size (float): share of the rows held out for testing.
    - random_state (int): seed used for the split and for both samplers.
    - stratify (bool): when True the split keeps the `occur` class ratio in
      both parts, so the held-out part of an imbalanced dataset still
      contains positive samples.

    Returns:
    - X_train_scaled (ndarray): resampled and standardised training features.
    - Y_train_resampled: resampled training target (same container type the
      sampler returns for the input target -- presumably a pandas Series).
    - X_test_scaled (ndarray): test features, standardised with the scaler
      fitted on the training part.
    - Y_test: test target with its original class distribution.
    """
    excluded_columns = ['sd_nm', "windMax", 'tempAvg', 'humAvg', 'tempMin', 'occur']
    features = input_data.drop(columns=excluded_columns)
    target = input_data['occur']

    X_train, X_test, Y_train, Y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target if stratify else None,
    )

    # Resample the training part only; the test part keeps its distribution.
    # NOTE(review): both samplers target the same 0.5 ratio, so after SMOTE
    # the under-sampler presumably has nothing left to remove -- confirm the
    # intended ratios.
    smote = SMOTE(sampling_strategy=0.5, random_state=random_state)
    under = RandomUnderSampler(sampling_strategy=0.5, random_state=random_state)
    pipeline = Pipeline(steps=[('smote', smote), ('under', under)])
    X_train_resampled, Y_train_resampled = pipeline.fit_resample(X_train, Y_train)

    # Fit the scaler on the training part only and reuse it for the test part
    # so that no test statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, Y_train_resampled, X_test_scaled, Y_test

# 3. 모델 학습 및 평가

In [None]:
def best_gmean_threshold(y_true, y_score, thresholds, default=0.3):
    """
    Pick the candidate threshold that maximises the geometric mean of
    sensitivity and specificity.

    A sample is predicted positive when its score is >= the threshold.

    Args:
    - y_true: array-like of 0/1 labels.
    - y_score: array-like of scores for the positive class.
    - thresholds: array-like of candidate thresholds.
    - default: value returned when there are no candidate thresholds.

    Returns:
    - The first threshold reaching the highest G-mean, or `default` when
      `thresholds` is empty.
    """
    thresholds = np.asarray(thresholds)
    if thresholds.size == 0:
        return default

    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    pos_scores = np.sort(y_score[y_true == 1])
    neg_scores = np.sort(y_score[y_true == 0])
    n_pos = pos_scores.size
    n_neg = neg_scores.size

    # searchsorted(side='left') counts the scores strictly below each
    # threshold, i.e. the samples that would be predicted negative.
    true_neg = np.searchsorted(neg_scores, thresholds, side='left')
    true_pos = n_pos - np.searchsorted(pos_scores, thresholds, side='left')

    # A class without samples yields a rate of 0 for every threshold.
    sensitivity = true_pos / n_pos if n_pos else np.zeros(thresholds.size)
    specificity = true_neg / n_neg if n_neg else np.zeros(thresholds.size)

    gmean_scores = np.sqrt(sensitivity * specificity)
    return thresholds[int(np.argmax(gmean_scores))]


def EvaluateModel(model, param_grid, xtrain, ytrain, xtest, ytest):
    """
    Tune a model with a grid search, choose a decision threshold and compute
    evaluation metrics on the test data.

    Args:
    - model: estimator handed to GridSearchCV.
    - param_grid: hyper-parameter grid for the search.
    - xtrain, ytrain: training data.
    - xtest, ytest: test data.

    Returns:
    - dict with the keys 'best_params', 'accuracy', 'precision', 'recall',
      'roc_auc', 'confusion_matrix', 'fpr_vals', 'tpr_vals',
      'precision_vals', 'recall_vals' and 'pr_auc'.
    """
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring=['roc_auc', 'average_precision'],
        refit='average_precision',  # the refitted model maximises PR-AUC
        n_jobs=-1,
    )
    grid_search.fit(xtrain, ytrain)

    # Model refitted on the whole training data with the best parameters.
    best_model = grid_search.best_estimator_

    # Probability of the positive class for every test sample.
    Y_pred_proba = best_model.predict_proba(xtest)[:, 1]

    # Candidate thresholds come from the precision-recall curve; the one with
    # the highest G-mean becomes the decision threshold.
    # NOTE(review): the threshold is tuned on the test data itself, so the
    # threshold-dependent metrics below are optimistic.
    precision, recall, thresholds = precision_recall_curve(ytest, Y_pred_proba)
    best_threshold_gmean = best_gmean_threshold(ytest, Y_pred_proba, thresholds)
    Y_pred_optimal = (Y_pred_proba >= best_threshold_gmean).astype(int)

    # The ROC curve is computed once and reused for both axes.
    fpr_vals, tpr_vals, _ = roc_curve(ytest, Y_pred_proba)

    return {
        'best_params': grid_search.best_params_,
        'accuracy': accuracy_score(ytest, Y_pred_optimal),
        'precision': precision_score(ytest, Y_pred_optimal),
        'recall': recall_score(ytest, Y_pred_optimal),
        'roc_auc': roc_auc_score(ytest, Y_pred_proba),
        'confusion_matrix': confusion_matrix(ytest, Y_pred_optimal),
        'fpr_vals': fpr_vals,
        'tpr_vals': tpr_vals,
        'precision_vals': precision,
        'recall_vals': recall,
        'pr_auc': metrics_auc(recall, precision),
    }

In [None]:
def ensemble(X_train, y_train, model, estimators):
    """
    Build a StackingClassifier and fit it on the given training data.

    Args:
    - X_train, y_train: training data passed to `fit`.
    - model: estimator used as the final estimator of the stack.
    - estimators: list of (name, estimator) pairs used as base estimators.

    Returns:
    - The fitted StackingClassifier.
    """
    stacker = StackingClassifier(estimators=estimators, final_estimator=model)
    # `fit` returns the estimator itself.
    return stacker.fit(X_train, y_train)

# 4. 하이퍼파라미터 지정

In [None]:
# Per-model configuration: the estimator instance, the hyper-parameter grid
# to search and the colour used when plotting its curves.
model_params_dict = {
    'LogisticRegression': dict(
        model=LogisticRegression(),
        params={
            'C': [0.01, 0.1, 1, 10, 100],
        },
        color='blue',
    ),
    'XGBClassifier': dict(
        model=XGBClassifier(),
        params={
            'n_estimators': [100, 300, 500],
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2],
            # GPU options, currently disabled:
            # 'tree_method': ['gpu_hist'],
            # 'gpu_id': [0],
        },
        color='red',
    ),
    'LGBMClassifier': dict(
        model=LGBMClassifier(),
        params={
            'n_estimators': [100, 300, 500],
            'num_leaves': [31, 63, 127],
            'learning_rate': [0.01, 0.1, 0.2],
            # GPU option, currently disabled:
            # 'device': ['gpu'],
        },
        color='purple',
    ),
    'RandomForestClassifier': dict(
        model=RandomForestClassifier(),
        params={
            'n_estimators': [100, 300, 500],
            'max_depth': [None, 10, 20, 30],
            'min_samples_leaf': [2, 5, 10],
        },
        color='green',
    ),
}
# Despite its name, this list holds the estimator instances.
model_name = [spec["model"] for spec in model_params_dict.values()]
model_params_dict.keys()

# 5. 지역별 모델 학습 및 평가

In [None]:
# 지역별 데이터_v2
loc_results = dict()

for loc, loc_data in loc_datas.items():
    tmp_dict = dict()
    
    x_loc_train, y_loc_train, x_loc_test, y_loc_test = DataProcessing_v2(loc_data)
    
    for model in model_params_dict.keys():
        print(f'{loc} - {model} Start : {datetime.datetime.now()}')
        estimators = [(name, model_params_dict[name]['model']) for name in model_params_dict.keys() if name != model]
        param_grid = {f'final_estimator__{param}': values for param, values in model_params_dict[model]['params'].items()}

        tmp_dict[model] = EvaluateModel(
                            model = ensemble(x_loc_train,y_loc_train,model_params_dict[model]['model'],estimators)
                            , param_grid = param_grid
                            , xtrain = x_loc_train
                            , ytrain = y_loc_train
                            , xtest = x_loc_test
                            , ytest = y_loc_test
        )
        print(f'{loc} - {model} Finish : {datetime.datetime.now()}')
    
    loc_results[loc] = tmp_dict

In [None]:
import joblib

# Persist the evaluation results of every region/model pair. The stored
# objects are the metrics dictionaries returned by EvaluateModel, not fitted
# models. Iterating `loc_results` (instead of the leftover `tmp_dict`, which
# only holds the last region) saves all regions, and the region name in the
# file name keeps regions from overwriting each other.
for loc, model_results in loc_results.items():
    for key, result in model_results.items():
        joblib.dump(result, f"{loc}_{key}.pkl")
        print(f"{loc} {key} 저장 완료!")

In [None]:
# Flatten {region: {model: metrics}} into one entry per (region, model) pair;
# after transposing, every pair becomes one row.
flat_region_results = {}
for region_key, region_models in loc_results.items():
    for model_key, model_metrics in region_models.items():
        flat_region_results[(region_key, model_key)] = model_metrics
region_result_df = pd.DataFrame(flat_region_results).T

region_result_df.index.names = ["Region", "Model"]
region_result_df.to_csv('../region_result.csv')

# Expose the four confusion-matrix cells as columns 1-4. The column names
# (including their spelling) are kept because they end up in the CSV output.
confusion_cells = (
    ('true_negative', 0, 0),
    ('fale_positive', 0, 1),
    ('fale_negative', 1, 0),
    ('true_positive', 1, 1),
)
for position, (cell_name, row_i, col_i) in enumerate(confusion_cells, start=1):
    region_result_df.insert(
        position,
        cell_name,
        region_result_df['confusion_matrix'].apply(lambda cm, i=row_i, j=col_i: cm[i][j]),
    )

# Summary table without the nested objects (parameter dict, matrix, curves).
nested_columns = ['best_params', 'confusion_matrix'] + [col for col in region_result_df.columns if col.endswith('vals')]
filt_region_result_df = region_result_df.drop(nested_columns, axis=1)

filt_region_result_df.to_csv('../region_result_filt.csv')
filt_region_result_df

# 6. 전국 모델 학습 및 평가

In [None]:
# 전국 데이터
nat_results = dict()

x_train, y_train, x_test, y_test = DataProcessing_v2(data)

for model in model_params_dict.keys():
    print(f'{model} Start : {datetime.datetime.now()}')
    estimators = [(name, model_params_dict[name]['model']) for name in model_params_dict.keys() if name != model]
    param_grid = {f'final_estimator__{param}': values for param, values in model_params_dict[model]['params'].items()}
    
    nat_results[model] = EvaluateModel(
                        model = ensemble(x_train,y_train,model_params_dict[model]['model'],estimators)
                        , param_grid = param_grid
                        , xtrain = x_train
                        , ytrain = y_train
                        , xtest = x_test
                        , ytest = y_test
    )
    print(f'{model} Finish : {datetime.datetime.now()}')

In [None]:
import joblib

# Persist the nationwide evaluation results, one file per model name. The
# stored objects are the metrics dictionaries held in `nat_results`.
for result_key in nat_results:
    joblib.dump(nat_results[result_key], f"korea_{result_key}.pkl")
    print(f"전국{result_key} 저장 완료!")

In [None]:
# One row per model: the metrics dictionaries become columns after the
# transpose.
nation_result_df = pd.DataFrame(dict(nat_results)).T
nation_result_df.index.names = ["Model"]
# CSV export is currently disabled:
# nation_result_df.to_csv('../result/nation_result.csv')

# Expose the four confusion-matrix cells as columns 1-4. The column names
# (including their spelling) are kept unchanged.
confusion_cells = (
    ('true_negative', 0, 0),
    ('fale_positive', 0, 1),
    ('fale_negative', 1, 0),
    ('true_positive', 1, 1),
)
for position, (cell_name, row_i, col_i) in enumerate(confusion_cells, start=1):
    nation_result_df.insert(
        position,
        cell_name,
        nation_result_df['confusion_matrix'].apply(lambda cm, i=row_i, j=col_i: cm[i][j]),
    )

# Summary table without the nested objects (parameter dict, matrix, curves).
nested_columns = ['best_params', 'confusion_matrix'] + [col for col in nation_result_df.columns if col.endswith('vals')]
filt_nation_result_df = nation_result_df.drop(nested_columns, axis=1)

# CSV export is currently disabled:
# filt_nation_result_df.to_csv('../result/nation_result_filt.csv')
filt_nation_result_df

# 7. 평가 결과 시각화

## 지역별 모델 평가 시각화

- ROC-AUC

In [None]:
# Group the model names by region, keeping the order of the result index, so
# the layout no longer depends on every region having exactly four models.
models_by_region = {}
for r, m in region_result_df.index:
    models_by_region.setdefault(r, []).append(m)

# Four panels per row; at least two rows (the original 2x4 layout) and more
# rows when there are more than eight regions.
n_cols = 4
n_rows = max(2, math.ceil(len(models_by_region) / n_cols))
plt.figure(figsize=(20, 5 * n_rows))

for panel, (r, region_models) in enumerate(models_by_region.items(), start=1):
    plt.subplot(n_rows, n_cols, panel)

    # One ROC curve per model of this region.
    for m in region_models:
        plt.plot(
            region_result_df.loc[(r, m), 'fpr_vals'],
            region_result_df.loc[(r, m), 'tpr_vals'],
            color=model_params_dict[m]['color'],
            lw=2,
            label=f"{m} (AUC = {region_result_df.loc[(r, m), 'roc_auc']:.2f})",
        )

    # Panel decoration is applied once per region. `plt.grid()` without
    # arguments toggles the grid, so calling it once per model switched the
    # grid off again for an even number of models; it is enabled explicitly.
    plt.plot([0, 1], [0, 1], color="gray", linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate (FPR)")
    plt.ylabel("True Positive Rate (TPR)")
    plt.title(f"ROC Curve ({r})")
    plt.legend(loc="lower right")
    plt.grid(True)

plt.show()

- PR-AUC

In [None]:
# Group the model names by region, keeping the order of the result index, so
# the layout no longer depends on every region having exactly four models.
models_by_region = {}
for r, m in region_result_df.index:
    models_by_region.setdefault(r, []).append(m)

# Four panels per row; at least two rows (the original 2x4 layout) and more
# rows when there are more than eight regions.
n_cols = 4
n_rows = max(2, math.ceil(len(models_by_region) / n_cols))
plt.figure(figsize=(20, 5 * n_rows))

for panel, (r, region_models) in enumerate(models_by_region.items(), start=1):
    plt.subplot(n_rows, n_cols, panel)

    # One precision-recall curve per model of this region.
    for m in region_models:
        plt.plot(
            region_result_df.loc[(r, m), 'recall_vals'],
            region_result_df.loc[(r, m), 'precision_vals'],
            color=model_params_dict[m]['color'],
            lw=2,
            label=f"{m} (AUC = {region_result_df.loc[(r, m), 'pr_auc']:.2f})",
        )

    # Panel decoration is applied once per region. `plt.grid()` without
    # arguments toggles the grid, so calling it once per model switched the
    # grid off again for an even number of models; it is enabled explicitly.
    plt.plot([0, 1], [0, 1], color="gray", linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"Precision-Recall Curve ({r})")
    plt.legend(loc="upper right")
    plt.grid(True)

plt.show()

## 전국 모델 평가 시각화

In [None]:
# ROC Curve
plt.figure(figsize=(5, 5))

for m in nation_result_df.index:
    plt.plot(
        nation_result_df.loc[m, 'fpr_vals']
        , nation_result_df.loc[m, 'tpr_vals']
        , color=model_params_dict[m]['color']
        , lw=2
        , label=f"{m} (AUC = {nation_result_df.loc[m, 'roc_auc']:.2f})"
    )
            
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title(f"ROC Curve (전국)")
plt.legend(loc="lower right")
plt.grid()

plt.show()

In [None]:
# Precision-Recall Curve
plt.figure(figsize=(5, 5))

for m in nation_result_df.index:
    plt.plot(
        nation_result_df.loc[m, 'recall_vals']
        , nation_result_df.loc[m, 'precision_vals']
        , color=model_params_dict[m]['color']
        , lw=2
        , label=f"{m} (AUC = {nation_result_df.loc[m, 'pr_auc']:.2f})"
    )
        
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision-Recall Curve (전국)")
plt.legend(loc="upper right")
plt.grid()

plt.show()