# **날씨 빅데이터 모델링**

## 데이터 불러오기

In [162]:
import pandas as pd
import numpy as np
import os

In [163]:
df = pd.read_csv('WBGT_call119.csv', encoding='cp949')
df


Unnamed: 0.1,Unnamed: 0,AWS 정보,강수량,WBGT,지역구,행정동,신고대분류,신고세부항목,신고건수
0,0,904,0.0,17.851324,0,34,0,0,1.0
1,1,921,0.0,18.873149,0,121,1,0,1.0
2,2,940,0.0,18.753480,1,16,0,5,1.0
3,3,941,0.0,17.865345,1,17,2,11,1.0
4,4,939,0.0,19.042418,1,21,0,5,1.0
...,...,...,...,...,...,...,...,...,...
74505,74505,937,0.0,19.042591,15,56,0,5,
74506,74506,937,0.0,19.042591,15,101,0,0,
74507,74507,937,0.0,19.042591,15,101,0,5,
74508,74508,937,0.0,19.042591,15,101,2,11,


In [164]:
# 인덱스 제거
df = df.drop('Unnamed: 0', axis = 1)
df

Unnamed: 0,AWS 정보,강수량,WBGT,지역구,행정동,신고대분류,신고세부항목,신고건수
0,904,0.0,17.851324,0,34,0,0,1.0
1,921,0.0,18.873149,0,121,1,0,1.0
2,940,0.0,18.753480,1,16,0,5,1.0
3,941,0.0,17.865345,1,17,2,11,1.0
4,939,0.0,19.042418,1,21,0,5,1.0
...,...,...,...,...,...,...,...,...
74505,937,0.0,19.042591,15,56,0,5,
74506,937,0.0,19.042591,15,101,0,0,
74507,937,0.0,19.042591,15,101,0,5,
74508,937,0.0,19.042591,15,101,2,11,


In [165]:
X_train = df.loc[0:61770].iloc[:, :-1]
y_train = df.loc[0:61770].iloc[:, -1]

X_test = df.loc[61771:].iloc[:,:-1] 
y_test = df.loc[61771:].iloc[:,-1] 

In [166]:
X_train

Unnamed: 0,AWS 정보,강수량,WBGT,지역구,행정동,신고대분류,신고세부항목
0,904,0.0,17.851324,0,34,0,0
1,921,0.0,18.873149,0,121,1,0
2,940,0.0,18.753480,1,16,0,5
3,941,0.0,17.865345,1,17,2,11
4,939,0.0,19.042418,1,21,0,5
...,...,...,...,...,...,...,...
61766,937,0.0,14.832534,15,56,0,0
61767,937,0.0,14.832534,15,56,0,5
61768,940,0.0,15.602224,15,76,0,5
61769,937,0.0,14.832534,15,79,2,11


In [167]:
X_test

Unnamed: 0,AWS 정보,강수량,WBGT,지역구,행정동,신고대분류,신고세부항목
61771,904,0.0,10.926137,0,34,0,0
61772,904,0.0,10.926137,0,73,0,5
61773,937,0.0,11.454401,0,79,0,0
61774,950,0.0,9.556113,0,87,2,11
61775,940,0.0,11.745615,1,12,2,9
...,...,...,...,...,...,...,...
74505,937,0.0,19.042591,15,56,0,5
74506,937,0.0,19.042591,15,101,0,0
74507,937,0.0,19.042591,15,101,0,5
74508,937,0.0,19.042591,15,101,2,11


# Modeling

In [168]:
# Random seed 설정
_RANDOM_SEED = 42

## 1. XGBoost

In [169]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [170]:
# 1. 모델링 생성
xgb = XGBRegressor(objective='reg:squarederror', random_state=_RANDOM_SEED)

# 2. 하이퍼파라미터 그리드 설정
param_grid_xgb = {
    'n_estimators': [100, 300, 500],           # 생성할 트리 수
    'max_depth': [3, 5, 7],                    # 트리 최대 깊이
    'learning_rate': [0.01, 0.05, 0.1],        # 학습률
    'subsample': [0.7, 0.8, 1.0],              # 데이터 샘플링 비율
    'colsample_bytree': [0.7, 0.9, 1.0],       # 피처 샘플링 비율
    'gamma': [0, 1, 5],                        # 분할 손실 최소 감소값
    'min_child_weight': [1, 3, 5],             # 리프 노드 최소 가중치
    'reg_alpha': [0, 0.1, 0.5],                # L1 정규화
    'reg_lambda': [1, 5, 10]                   # L2 정규화
}

# 3. GridSearchCV 객체 생성
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=10,
    verbose=1,
    n_jobs=-1
)

# 4. 하이퍼파라미터 탐색 학습
grid_search.fit(X_train, y_train)

# 5. 최적 하이퍼파라미터와 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best RMSE (CV):", -grid_search.best_score_)

# 6. 최적 모델로 검증 세트 평가
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
# print(f"Validation RMSE: {rmse_val:.4f}")

Fitting 10 folds for each of 19683 candidates, totalling 196830 fits


KeyboardInterrupt: 

In [None]:
# Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 10, 'subsample': 0.8}
# Best RMSE (CV): 0.9627962598800812

In [38]:
y_pred

array([1.2809066, 1.2280526, 1.2370169, ..., 1.4136965, 1.2588621,
       1.3373302], dtype=float32)

In [84]:
y_pred_fixed = np.round(np.clip(y_pred, 0, None)).astype(int)
y_pred_fixed

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# 1. 데이터프레임으로 변환
df_pred = pd.DataFrame({'predicted': y_pred_fixed})

# 2. CSV로 저장
df_pred.to_csv('test_call119_pred_xgboost.csv', index=False, encoding='cp949')

In [86]:
df_pred

Unnamed: 0,predicted
0,1
1,1
2,1
3,1
4,1
...,...
12734,1
12735,1
12736,1
12737,1


## 2. LightGBM

In [101]:
param_grid_lgbm = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.01, 0.05],
    'max_depth': [7, 10, -1],
    'num_leaves': [31, 63, 127],
    'min_child_samples': [10, 20],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.7, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 5]
}

# ----------------------------
# LightGBM 모델 정의
# ----------------------------
lgbm = LGBMRegressor(objective='regression', random_state=_RANDOM_SEED)

# ----------------------------
# GridSearchCV 설정
# ----------------------------
# 3. GridSearchCV 객체 생성
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid_lgbm,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=10,
    verbose=1,
    n_jobs=-1
)

# 4. 하이퍼파라미터 탐색 학습
grid_search.fit(X_train, y_train)

# 5. 최적 하이퍼파라미터와 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best RMSE (CV):", -grid_search.best_score_)

# 6. 최적 모델로 검증 세트 평가
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
# print(f"Validation RMSE: {rmse_val:.4f}")

Fitting 10 folds for each of 1728 candidates, totalling 17280 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 688
[LightGBM] [Info] Number of data points in the train set: 61771, number of used features: 7
[LightGBM] [Info] Start training from score 1.331936
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_samples': 10, 'n_estimators': 500, 'num_leaves': 127, 'reg_alpha': 1, 'reg_lambda': 5, 'subsample': 0.8}
Best RMSE (CV): 1.010807364014783


In [None]:
# Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_samples': 10, 'n_estimators': 500, 'num_leaves': 127, 'reg_alpha': 1, 'reg_lambda': 5, 'subsample': 0.8}
# Best RMSE (CV): 1.010807364014783

In [118]:
y_pred

array([1.14388506, 1.0968068 , 1.1702125 , ..., 1.56888924, 1.23326612,
       1.33858492])

In [119]:
y_pred_fixed = np.round(np.clip(y_pred, 0, None)).astype(int)
y_pred_fixed

array([1, 1, 1, ..., 2, 1, 1])

In [120]:
# 1. 데이터프레임으로 변환
df_pred = pd.DataFrame({'predicted': y_pred_fixed})

# 2. CSV로 저장
df_pred.to_csv('test_call119_pred_lightGBM.csv', index=False, encoding='cp949')

## 3. CatBoost

In [133]:
catb = CatBoostRegressor(silent=True, random_state = _RANDOM_SEED)

param_grid_catb = {
    'iterations': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 7],
    'bagging_temperature': [0.0, 0.5, 1.0],
    'border_count': [32, 64, 128]
}

grid_search = GridSearchCV(
    estimator=catb,
    param_grid=param_grid_catb,
    scoring='neg_root_mean_squared_error',  # RMSE를 기준으로 함
    cv=10,
    verbose=1,
    n_jobs=-1
)

# 하이퍼파라미터 탐색 학습
grid_search.fit(X_train, y_train)

# 최적 하이퍼파라미터와 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best RMSE (CV):", -grid_search.best_score_)

# 최적 모델로 검증 세트 평가
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
# print(f"Validation RMSE: {rmse_val:.4f}")

Fitting 10 folds for each of 972 candidates, totalling 9720 fits


27 fits failed out of a total of 9720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\asia\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\asia\AppData\Roaming\Python\Python312\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Use

Best parameters: {'bagging_temperature': 0.0, 'border_count': 32, 'depth': 8, 'iterations': 100, 'l2_leaf_reg': 7, 'learning_rate': 0.01}
Best RMSE (CV): 0.9905980209951254


In [None]:
# Best parameters: {'bagging_temperature': 0.0, 'border_count': 32, 'depth': 8, 'iterations': 100, 'l2_leaf_reg': 7, 'learning_rate': 0.01}
# Best RMSE (CV): 0.9905980209951254

In [134]:
y_pred_fixed = np.round(np.clip(y_pred, 0, None)).astype(int)
y_pred_fixed

array([1, 1, 1, ..., 1, 1, 1])

In [135]:
# 1. 데이터프레임으로 변환
df_pred = pd.DataFrame({'predicted': y_pred_fixed})

# 2. CSV로 저장
df_pred.to_csv('test_call119_pred_catb.csv', index=False, encoding='cp949')

# 4. RandomForest

In [147]:
# 1. 랜덤포레스트 모델 생성
rf = RandomForestRegressor(_RANDOM_SEED)

# 2. 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# 3. GridSearchCV 객체 생성
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE를 기준으로 최적화
    cv=10,
    n_jobs=-1,
    verbose=2
)

# 4. 하이퍼파라미터 탐색 학습
grid_search.fit(X_train, y_train)

# 5. 최적 하이퍼파라미터와 점수 출력
print("Best parameters:", grid_search.best_params_)
print("Best RMSE (CV):", -grid_search.best_score_)

# 6. 최적 모델로 검증 세트 평가
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
# rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
# print(f"Validation RMSE: {rmse_val:.4f}")

Fitting 10 folds for each of 432 candidates, totalling 4320 fits
Best parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best RMSE (CV): 0.9931321543817541


In [None]:
# Best parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
# Best RMSE (CV): 0.9931321543817541

In [148]:
y_pred_fixed = np.round(np.clip(y_pred, 0, None)).astype(int)
y_pred_fixed

array([1, 1, 1, ..., 2, 1, 1])

In [150]:
# 1. 데이터프레임으로 변환
df_pred = pd.DataFrame({'predicted': y_pred_fixed})

# 2. CSV로 저장
df_pred.to_csv('test_call119_pred_rf.csv', index=False, encoding='cp949')

### 출력물 기준 신고건수 병합

In [152]:
test_cat119 = pd.read_csv('test_cat119.csv', encoding='cp949')
test_call119 = pd.read_csv('test_call119.csv', encoding='cp949')
test_cat119


Unnamed: 0,TM,address_city,address_gu,sub_address,cat,sub_cat,STN
0,20240501,부산광역시,강서구,대저2동,구급,교통사고,904
1,20240501,부산광역시,강서구,생곡동,구급,부상,904
2,20240501,부산광역시,강서구,송정동,구급,교통사고,937
3,20240501,부산광역시,강서구,신호동,기타,업무운행,950
4,20240501,부산광역시,금정구,구서동,기타,상황출동,940
...,...,...,...,...,...,...,...
12734,20241031,부산광역시,해운대구,반여동,구급,부상,937
12735,20241031,부산광역시,해운대구,우동,구급,교통사고,937
12736,20241031,부산광역시,해운대구,우동,구급,부상,937
12737,20241031,부산광역시,해운대구,우동,기타,업무운행,937


In [153]:
test_call119['number'] = range(1, len(test_call119) + 1)
test_call119

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count,number
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,,1
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,,2
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,,3
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,,4
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9596,20241031,부산광역시,영도구,청학동,159,23.2,16.0,7.2,53.5,72.6,4.3,8.0,0.0,,9597
9597,20241031,부산광역시,중구,중앙동,159,23.2,16.0,7.2,53.5,72.6,4.3,8.0,0.0,,9598
9598,20241031,부산광역시,해운대구,반여동,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0,,9599
9599,20241031,부산광역시,해운대구,우동,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0,,9600


In [154]:
test_cat_call = pd.merge(
    test_cat119,
    test_call119,
    on=['TM', 'address_city', 'address_gu', 'sub_address'],
    how='left'
)
test_cat_call

Unnamed: 0,TM,address_city,address_gu,sub_address,cat,sub_cat,STN_x,STN_y,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count,number
0,20240501,부산광역시,강서구,대저2동,구급,교통사고,904,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,,1
1,20240501,부산광역시,강서구,생곡동,구급,부상,904,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,,2
2,20240501,부산광역시,강서구,송정동,구급,교통사고,937,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,,3
3,20240501,부산광역시,강서구,신호동,기타,업무운행,950,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,,4
4,20240501,부산광역시,금정구,구서동,기타,상황출동,940,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12734,20241031,부산광역시,해운대구,반여동,구급,부상,937,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0,,9599
12735,20241031,부산광역시,해운대구,우동,구급,교통사고,937,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0,,9600
12736,20241031,부산광역시,해운대구,우동,구급,부상,937,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0,,9600
12737,20241031,부산광역시,해운대구,우동,기타,업무운행,937,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0,,9600


In [155]:
df_pred = pd.concat([df_pred, test_cat_call[['number']]], axis=1)

In [156]:
df_pred

Unnamed: 0,predicted,number
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
12734,1,9599
12735,1,9600
12736,2,9600
12737,1,9600


In [157]:
df_grouped = df_pred.groupby(['number'])['predicted'].sum().reset_index()

In [158]:
df_grouped

Unnamed: 0,number,predicted
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
9596,9597,1
9597,9598,1
9598,9599,1
9599,9600,4


In [159]:
test_call119_final = test_call119.drop(['call_count', 'number'], axis=1)
test_call119_final

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9596,20241031,부산광역시,영도구,청학동,159,23.2,16.0,7.2,53.5,72.6,4.3,8.0,0.0
9597,20241031,부산광역시,중구,중앙동,159,23.2,16.0,7.2,53.5,72.6,4.3,8.0,0.0
9598,20241031,부산광역시,해운대구,반여동,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0
9599,20241031,부산광역시,해운대구,우동,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0


In [160]:
test_call119_final['call_count'] = df_grouped['predicted'].values

In [161]:
test_call119_final.to_csv('250738_rf.csv', encoding='cp949')
test_call119_final

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,1
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,1
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9596,20241031,부산광역시,영도구,청학동,159,23.2,16.0,7.2,53.5,72.6,4.3,8.0,0.0,1
9597,20241031,부산광역시,중구,중앙동,159,23.2,16.0,7.2,53.5,72.6,4.3,8.0,0.0,1
9598,20241031,부산광역시,해운대구,반여동,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0,1
9599,20241031,부산광역시,해운대구,우동,937,25.4,14.8,10.6,52.9,86.6,2.6,6.1,0.0,4
