In [8]:
import os
from datetime import datetime, timedelta
from io import StringIO

import pandas as pd
import requests
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

In [9]:
# 1. Collect daily surface observations from the KMA API Hub for every day of 2021.
#    One request is sent per day for station 108; '#'-prefixed lines of the response
#    are dropped and the remaining whitespace-separated rows are parsed.
start_date = datetime(2021, 1, 1)
end_date = datetime(2021, 12, 31)
date_list = [(start_date + timedelta(days=i)).strftime('%Y%m%d') for i in range((end_date - start_date).days + 1)]

weather_columns = ['date_dt', 'TA_AVG', 'HM_AVG', 'WS_AVG']
weather_records = []
failed_dates = []  # (date, reason) for each day that could not be fetched or parsed
col_names = [f'col{i}' for i in range(56)]
url_weather = "https://apihub.kma.go.kr/api/typ01/url/kma_sfcdd3.php"

# The auth key is read from the environment so it is never stored in the notebook.
kma_auth_key = os.environ.get("KMA_AUTH_KEY")
if not kma_auth_key:
    raise RuntimeError("Set the KMA_AUTH_KEY environment variable to your KMA API Hub auth key.")

for date in date_list:
    params_weather = {
        "tm1": date,
        "tm2": date,
        "stn": "108",
        "help": "1",
        "authKey": kma_auth_key,
    }
    try:
        response_weather = requests.get(url_weather, params=params_weather, timeout=10)
        # Treat HTTP errors (bad key, quota, server error) as failures instead of parsing the error body.
        response_weather.raise_for_status()
        response_weather.encoding = 'euc-kr'
        lines = response_weather.text.split('\n')
        # Drop comment lines and blank / whitespace-only lines (e.g. a stray '\r').
        lines = [line for line in lines if line.strip() and not line.startswith('#')]
        if not lines:
            failed_dates.append((date, 'no data rows'))
            continue
        csv_data = '\n'.join(lines)
        df_w = pd.read_csv(StringIO(csv_data), sep=r'\s+', header=None, names=col_names, engine='python')
        df_w = df_w.reset_index(drop=True)
        # The first field starts with the observation date as YYYYMMDD.
        df_w['date'] = df_w['col0'].astype(str).str[:8]
        df_w['date_dt'] = pd.to_datetime(df_w['date'], format='%Y%m%d', errors='coerce')
        # NOTE(review): in the saved sample output HM_AVG is about 2-3. If HM_AVG is meant to be
        # a mean relative humidity in %, that is implausible; confirm positions 16/21/2 against
        # the column legend the API returns (help=1). The mapping is left unchanged here because
        # the saved model may have been trained with the same columns.
        df_w = df_w[['date_dt', 'col16', 'col21', 'col2']]
        df_w.columns = weather_columns
        weather_records.append(df_w)
    except Exception as e:
        # Record only the exception type: the message of a requests error can contain
        # the full request URL, which includes the auth key.
        failed_dates.append((date, type(e).__name__))
        continue

if weather_records:
    df_weather_all = pd.concat(weather_records, ignore_index=True)
else:
    # Keep the expected columns so the summary below and later cells do not hit a KeyError.
    df_weather_all = pd.DataFrame(columns=weather_columns)

if failed_dates:
    print(f'수집 실패/빈 응답 날짜 수: {len(failed_dates)} (예: {failed_dates[:3]})')

print('기상청 데이터 수집 완료, 샘플 데이터:')
print(df_weather_all.head())
print('총 수집된 날짜 수:', df_weather_all['date_dt'].nunique())

기상청 데이터 수집 완료, 샘플 데이터:
     date_dt  TA_AVG  HM_AVG  WS_AVG
0 2021-01-01    -3.4     2.9     2.0
1 2021-01-02    -3.9     1.6     2.6
2 2021-01-03    -4.9     1.8     2.0
3 2021-01-04    -3.3     2.5     1.7
4 2021-01-05    -3.2     2.2     2.9
총 수집된 날짜 수: 365


In [17]:
# 2. Load the fire-risk forecast list, keep the 2021 rows and attach the daily weather.
df_risk = pd.read_csv('산림청 국립산림과학원_대형산불위험예보목록정보_20250430.csv', encoding='euc-kr')
df_risk = df_risk.rename(columns={
    '예보일시': 'date',
    '시도명': 'province',
    '시군구명': 'city',
    '실효습도': 'effective_humidity',
    '풍속': 'wind_speed',
    '등급': 'risk_grade'
})

# Parse the forecast timestamp once. The datetime64 series is used for filtering;
# 'date_dt' itself stays a datetime.date so it matches the merge keys of the other frames.
risk_timestamp = pd.to_datetime(df_risk['date'], errors='coerce')
df_risk['date_dt'] = risk_timestamp.dt.date
df_risk['effective_humidity'] = pd.to_numeric(df_risk['effective_humidity'], errors='coerce')
df_risk['wind_speed'] = pd.to_numeric(df_risk['wind_speed'], errors='coerce')

# Keep 2021 only. Filtering on the datetime64 year treats unparseable dates (NaT) as
# "not 2021"; comparing NaT with datetime.date objects can raise a TypeError instead.
df_risk_21 = df_risk[risk_timestamp.dt.year == 2021].copy()

if not df_weather_all.empty:
    df_weather_all['date_dt'] = pd.to_datetime(df_weather_all['date_dt']).dt.date
    # The weather frame has no region column, so each day's weather row is attached
    # to every risk row of that day regardless of province/city.
    risk_weather_21 = pd.merge(df_risk_21, df_weather_all, on='date_dt', how='inner')
else:
    # No weather rows: produce an empty frame that still has the risk columns plus the
    # weather feature columns, so later merges and column selections find their keys.
    risk_weather_21 = df_risk_21.reindex(
        columns=[*df_risk_21.columns, 'TA_AVG', 'HM_AVG', 'WS_AVG']
    ).iloc[0:0]

In [18]:
# 3. Load the fire occurrence records and count fires per (date, province, city) for 2021.
df_fire = pd.read_csv('sanbul.csv', encoding='cp949')
df_fire = df_fire.rename(columns={
    '발생일시_년': 'year',
    '발생일시_월': 'month',
    '발생일시_일': 'day',
    '발생장소_시도': 'province',
    '발생장소_시군구': 'city'
})

# Assemble the date from its numeric components. Unlike string concatenation this also
# works when a column was read as float (e.g. 2021.0); invalid or missing parts become NaT.
fire_timestamp = pd.to_datetime(df_fire[['year', 'month', 'day']], errors='coerce')
# 'date_dt' stays a datetime.date so it matches the merge keys of the other frames.
df_fire['date_dt'] = fire_timestamp.dt.date

# Keep 2021 only. Filtering on the datetime64 year treats NaT as "not 2021";
# comparing NaT with datetime.date objects can raise a TypeError instead.
df_fire_21 = df_fire[fire_timestamp.dt.year == 2021].copy()

fire_group_21 = df_fire_21.groupby(['date_dt', 'province', 'city']).size().reset_index(name='fire_count')
# Every group produced by size() has at least one record, so this flag is 1 for all rows here;
# the zeros appear only later, when non-matching rows are filled after a left merge.
fire_group_21['fire_occurred'] = (fire_group_21['fire_count'] > 0).astype(int)


In [20]:
# 4. Label every risk/weather row with the fire counts of its (date, province, city).
features = ['TA_AVG', 'HM_AVG', 'WS_AVG', 'effective_humidity', 'wind_speed']
merge_keys = ['date_dt', 'province', 'city']

# Left merge: rows without a matching fire record are kept and get NaN labels.
final_df_21 = risk_weather_21.merge(fire_group_21, how='left', on=merge_keys)

# A missing label means no fire was recorded for that key, i.e. 0.
for label_col in ('fire_occurred', 'fire_count'):
    final_df_21[label_col] = final_df_21[label_col].fillna(0).astype(int)

# Drop rows where any model feature is missing.
final_df_21 = final_df_21.dropna(subset=features)

print('2021년 데이터 전처리 완료, 샘플 데이터:')
print(final_df_21.head())
print('총 데이터 수:', len(final_df_21))


2021년 데이터 전처리 완료, 샘플 데이터:
               date province city 읍면동명  effective_humidity  wind_speed  \
0  2021-01-01 15:00       강원   동해  심곡동                31.2         7.1   
1  2021-01-02 09:00       강원   양양  손양면                32.9         7.0   
2  2021-01-02 10:00       강원   양양  손양면                32.7         7.3   
3  2021-01-02 11:00       강원   양양  손양면                32.4         7.5   
4  2021-01-02 11:00       강원   양양  현남면                32.4         7.1   

  risk_grade     date_dt  TA_AVG  HM_AVG  WS_AVG  fire_count  fire_occurred  
0        주의보  2021-01-01    -3.4     2.9     2.0           0              0  
1        주의보  2021-01-02    -3.9     1.6     2.6           0              0  
2        주의보  2021-01-02    -3.9     1.6     2.6           0              0  
3        주의보  2021-01-02    -3.9     1.6     2.6           0              0  
4        주의보  2021-01-02    -3.9     1.6     2.6           0              0  
총 데이터 수: 4619


In [21]:
# 5. Evaluate the saved XGBoost classifier on the 2021 data.
#    XGBClassifier is already imported in the notebook's import cell, so no install/import
#    fallback is needed here.
model = XGBClassifier()
model.load_model('xgb_fire_model_smote.json')  # confirm the file name

X_21 = final_df_21[features].astype(float)
y_21 = final_df_21['fire_occurred']

preds_21 = model.predict(X_21)

# NOTE: the classes are heavily imbalanced (see the support column of the report), so
# accuracy is dominated by the "no fire" class; judge the model by the class-1 precision
# and recall in the report and by the confusion matrix.
print("2021년 데이터 정확도:", accuracy_score(y_21, preds_21))
print(classification_report(y_21, preds_21))
# Fixing the labels keeps the matrix 2x2 even if one class is absent from y_21 and preds_21.
print(confusion_matrix(y_21, preds_21, labels=[0, 1]))

# Side-by-side table of the features, the actual label and the predicted label.
result_df_21 = pd.concat([
    X_21.reset_index(drop=True),
    pd.DataFrame({
        'actual_fire_occurred': y_21.reset_index(drop=True),
        'predicted_fire_occurred': preds_21
    })
], axis=1)

print("\n예측 결과 샘플:")
print(result_df_21.head())
print("\n예측 결과 요약:")
print(result_df_21['predicted_fire_occurred'].value_counts())

2021년 데이터 정확도: 0.9688244208703183
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      4562
           1       0.00      0.00      0.00        57

    accuracy                           0.97      4619
   macro avg       0.49      0.49      0.49      4619
weighted avg       0.98      0.97      0.97      4619

[[4475   87]
 [  57    0]]

예측 결과 샘플:
   TA_AVG  HM_AVG  WS_AVG  effective_humidity  wind_speed  \
0    -3.4     2.9     2.0                31.2         7.1   
1    -3.9     1.6     2.6                32.9         7.0   
2    -3.9     1.6     2.6                32.7         7.3   
3    -3.9     1.6     2.6                32.4         7.5   
4    -3.9     1.6     2.6                32.4         7.1   

   actual_fire_occurred  predicted_fire_occurred  
0                     0                        0  
1                     0                        0  
2                     0                        0  
3                     0       