### 데이터 읽기

In [None]:
import pandas as pd

# Load the training, test and solar-radiation CSV files into DataFrames.
# NOTE(review): the two training files are read with pandas' default encoding
# while the test and solar files are read as cp949 -- confirm the training
# files really are not cp949-encoded.
call119 = pd.read_csv('call119_train.csv')
cat119 = pd.read_csv('cat119_train.csv')
test_call119 = pd.read_csv('test_call119.csv', encoding='cp949')
test_cat119 = pd.read_csv('test_cat119.csv', encoding='cp949')
solar = pd.read_csv('solar_data.csv', encoding='cp949')


In [None]:
# Inspect the data: label the output, then show the first 10 rows of call119
print("call119_train.csv")
call119.head(10)

In [None]:
print("cat119_train.csv")
cat119.head(10)

In [None]:
# Label the output with the file that is actually shown (the solar data),
# then display its first 10 rows.
print("solar_data.csv")
solar.head(10)

In [None]:
call119.tail()

In [None]:
cat119.tail()

In [None]:
solar.tail()

In [None]:
# Info
call119.info()

In [None]:
cat119.info()

In [None]:
solar.info()

In [None]:
# Describe
call119.describe()

In [None]:
cat119.describe()

In [None]:
solar.describe()

### 컬럼명 정리와 전처리

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb


# Drop the 'call119_train.' prefix from every training column name.
call119.columns = call119.columns.str.replace('call119_train.', '', regex=False)

# Lower-case the test column names, except the date column which is kept as 'TM'.
test_call119.columns = [
    'TM' if name.lower() == 'tm' else name.lower()
    for name in test_call119.columns
]


In [None]:
# WBGT helper functions
def _zero_if_missing(value):
    """Return *value* with missing entries (None or NaN) replaced by 0."""
    if value is None:
        return 0
    if hasattr(value, 'fillna'):
        # pandas objects: fill NaN element-wise and keep the index
        return value.fillna(0)
    if np.isscalar(value):
        return 0 if np.isnan(value) else value
    arr = np.asarray(value, dtype=float)
    return np.where(np.isnan(arr), 0.0, arr)


def calculate_wbgt(ta_max, ta_min, rh_max, rh_min, ws_ins_max, avg_solar):
    """Estimate a WBGT-style heat index from daily weather values.

    The base term is ``0.567 * t_avg + 0.393 * e + 3.94`` where ``t_avg`` and
    ``rh_avg`` are the midpoints of the max/min temperature and humidity, and
    ``e = rh_avg / 100 * 6.105 * exp(17.27 * t_avg / (237.7 + t_avg))``.
    The base is then raised by ``avg_solar / 800`` and lowered by
    ``0.5 * ws_ins_max``.

    Args:
        ta_max, ta_min: Maximum / minimum temperature (scalar, array or Series).
        rh_max, rh_min: Maximum / minimum relative humidity; divided by 100
            in the formula, so presumably given in percent.
        ws_ins_max: Wind speed term. ``None`` or NaN counts as 0.
        avg_solar: Solar radiation term. ``None`` or NaN counts as 0.

    Returns:
        The index value, with the same shape as the (broadcast) inputs.
    """
    t_avg = (ta_max + ta_min) / 2
    rh_avg = (rh_max + rh_min) / 2
    e = rh_avg / 100 * 6.105 * np.exp((17.27 * t_avg) / (237.7 + t_avg))
    wbgt_base = 0.567 * t_avg + 0.393 * e + 3.94
    alpha = 1.0  # weight of the solar correction
    beta = 0.5   # weight of the wind correction
    # A plain `is not None` test never fires for a pandas Series, so NaN
    # entries (e.g. rows left unmatched by a left merge) used to turn the
    # whole result into NaN. Missing values now fall back to 0 element-wise.
    solar_factor = _zero_if_missing(avg_solar) / 800
    wind = _zero_if_missing(ws_ins_max)
    return wbgt_base + alpha * solar_factor - beta * wind

In [None]:
# Parse each frame's date column into datetime64 under the common name 'TM'.
call119['TM'] = pd.to_datetime(call119['tm'], format='%Y%m%d')
cat119['TM'] = pd.to_datetime(cat119['cat119_train.tm'], format='%Y%m%d')
solar['TM'] = pd.to_datetime(solar['일시'], format='%Y-%m-%d')
test_call119['TM'] = pd.to_datetime(test_call119['TM'], format='%Y%m%d')

# test_cat119 must be parsed from its OWN date column; copying
# test_call119['TM'] would attach another table's dates by row position.
# The column is matched case-insensitively, ignoring any 'file.' prefix.
test_cat_tm_col = next(
    (col for col in test_cat119.columns if col.lower().split('.')[-1] == 'tm'),
    None,
)
if test_cat_tm_col is None:
    raise KeyError("test_cat119 has no 'tm' date column")
test_cat119['TM'] = pd.to_datetime(test_cat119[test_cat_tm_col], format='%Y%m%d')


# 'avg_solar' is only attached to these frames by a later merge with `solar`.
# DataFrame.get returns None while the column is absent, and calculate_wbgt
# treats None as "no solar contribution", so this cell no longer raises
# KeyError when run before the merge.
call119['WBGT'] = calculate_wbgt(
    call119['ta_max'], call119['ta_min'],
    call119['hm_max'], call119['hm_min'],
    call119['ws_ins_max'], call119.get('avg_solar')
)

# Apply the same computation to the test set
test_call119['WBGT'] = calculate_wbgt(
    test_call119['ta_max'], test_call119['ta_min'],
    test_call119['hm_max'], test_call119['hm_min'],
    test_call119['ws_ins_max'], test_call119.get('avg_solar')
)


# --- Seasonality features (month, weekday) ---
call119['month'] = call119['TM'].dt.month
call119['weekday'] = call119['TM'].dt.weekday
test_call119['month'] = test_call119['TM'].dt.month
test_call119['weekday'] = test_call119['TM'].dt.weekday



# Solar preprocessing: convert the daily total (MJ/m2) to a mean flux in W/m2
# (1e6 J / 86400 s ~= 11.574) and keep only the join keys plus that value.
solar['avg_solar'] = solar['합계 일사량(MJ/m2)'] * 11.574
solar = solar[['TM', '지점', 'avg_solar']].rename(columns={'지점': 'stn'})

In [None]:
# One-hot encode sub_address and the calendar variables (month, weekday)
encode_spec = {'sub_address': 'addr', 'month': 'month', 'weekday': 'weekday'}
call119 = pd.get_dummies(
    call119, columns=list(encode_spec), prefix=list(encode_spec.values())
)
test_call119 = pd.get_dummies(
    test_call119, columns=list(encode_spec), prefix=list(encode_spec.values())
)

# Give train and test the same column set: a column present in only one
# frame is added to the other, filled with 0.
train_cols, test_cols = set(call119.columns), set(test_call119.columns)
test_call119 = test_call119.assign(**{name: 0 for name in train_cols - test_cols})
call119 = call119.assign(**{name: 0 for name in test_cols - train_cols})

# Sort the columns by name so both frames share the same column order
call119, test_call119 = (
    frame.sort_index(axis=1) for frame in (call119, test_call119)
)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 6. Features and target: every column except the target and the date columns
excluded_cols = ('call_count', 'TM', 'tm')
features = [name for name in call119.columns if name not in excluded_cols]
X = call119[features]
y = np.log1p(call119['call_count'])  # fit on log(1 + count)

# 7. Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 8. Fit the XGBoost regressor with the tuned hyper-parameters
xgb_params = {
    'n_estimators': 300,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# 9. Evaluate after undoing the log transform on predictions and targets
val_pred = model.predict(X_val)
val_pred_real, y_val_real = np.expm1(val_pred), np.expm1(y_val)

r2 = r2_score(y_val_real, val_pred_real)
rmse = np.sqrt(mean_squared_error(y_val_real, val_pred_real))

print(f"✅ R² Score: {r2:.4f}")
print(f"✅ RMSE: {rmse:.4f}")

In [None]:
solar.columns

In [None]:
call119.columns


In [None]:
# Remove the 'cat119_train.' prefix from the cat119 columns that carry it,
# then display the resulting column index.
_cat_prefix = 'cat119_train.'
cat119 = cat119.rename(columns={
    name: name.replace(_cat_prefix, '')
    for name in cat119.columns
    if name.startswith(_cat_prefix)
})
cat119.columns

In [None]:
test_call119.columns

In [None]:
# Print the dtype of each frame's TM column before merging on it
print(test_call119['TM'].dtype)
print(solar['TM'].dtype)
print(test_cat119['TM'].dtype)

### 변경 후 학습/예측 코드 수정

In [None]:
# Recompute WBGT for the training set. DataFrame.get returns None while
# 'avg_solar' has not been merged in yet; calculate_wbgt treats None as
# "no solar contribution", so this no longer raises KeyError.
call119['WBGT'] = calculate_wbgt(
    call119['ta_max'], call119['ta_min'],
    call119['hm_max'], call119['hm_min'],
    call119['ws_ins_max'], call119.get('avg_solar')
)

# Apply the same computation to the test set
test_call119['WBGT'] = calculate_wbgt(
    test_call119['ta_max'], test_call119['ta_min'],
    test_call119['hm_max'], test_call119['hm_min'],
    test_call119['ws_ins_max'], test_call119.get('avg_solar')
)

In [None]:
X = call119[features]

In [None]:
# Collect the one-hot encoded columns. The encoder above uses the prefixes
# 'addr', 'month' and 'weekday', so those are the prefixes to look for
# (the old 'sub_addr_' prefix never matched any column).
encoded_cols = [
    col for col in call119.columns
    if col.startswith(('addr_', 'month_', 'weekday_'))
]
# The raw 'month' / 'weekday' columns only exist if the one-hot encoding cell
# has not been run; include them only when present to avoid a KeyError.
calendar_cols = [col for col in ('month', 'weekday') if col in call119.columns]
features = ['ta_max', 'ta_min', 'ta_max_min', 'hm_max', 'hm_min',
            'ws_max', 'ws_ins_max', 'rn_day', 'WBGT'] + calendar_cols + encoded_cols

target = 'call_count'

X = call119[features]
y = call119[target]

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1)
model.fit(X_train, y_train)

# Predict on the validation split and report the metrics
val_pred = model.predict(X_val)
print("R² Score:", round(r2_score(y_val, val_pred), 4))
print("RMSE:", round(np.sqrt(mean_squared_error(y_val, val_pred)), 4))

In [None]:
# Build 'avg_solar' from the raw daily total only while the raw column is
# still present. Once `solar` has been reduced to (TM, stn, avg_solar),
# multiplying by 11.574 again would double-convert the value.
raw_solar_col = '합계 일사량(MJ/m2)'
if raw_solar_col in solar.columns:
    solar['avg_solar'] = solar[raw_solar_col] * 11.574
# Rename BEFORE selecting, so 'stn' exists whichever state `solar` is in.
solar = solar.rename(columns={'지점': 'stn'})[['TM', 'stn', 'avg_solar']]

# Left-merge the solar value onto the call data by date and station.
# Any 'avg_solar' left by a previous run is dropped first, so re-running this
# cell does not produce avg_solar_x / avg_solar_y columns.
call119 = (
    call119.drop(columns='avg_solar', errors='ignore')
    .merge(solar, on=['TM', 'stn'], how='left')
)
test_call119 = (
    test_call119.drop(columns='avg_solar', errors='ignore')
    .merge(solar, on=['TM', 'stn'], how='left')
)

In [None]:
import numpy as np
# Compute WBGT for train and test from the same ordered list of input columns
wbgt_inputs = ['ta_max', 'ta_min', 'hm_max', 'hm_min', 'ws_ins_max', 'avg_solar']
call119['WBGT'] = calculate_wbgt(*(call119[name] for name in wbgt_inputs))
test_call119['WBGT'] = calculate_wbgt(*(test_call119[name] for name in wbgt_inputs))

In [None]:
# 경로확인
# import sys
# print(sys.executable)

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Weather features plus the derived WBGT index; the target is the raw call count
target = 'call_count'
features = [
    'ta_max', 'ta_min', 'ta_max_min',
    'hm_max', 'hm_min',
    'ws_max', 'ws_ins_max',
    'rn_day', 'WBGT',
]

X, y = call119[features], call119[target]

# Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = xgb.XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1)
model.fit(X_train, y_train)

# Predict on the validation split
val_pred = model.predict(X_val)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Validation metrics: coefficient of determination and root-mean-squared error
mse = mean_squared_error(y_val, val_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, val_pred)

print(f'R² Score: {r2:.4f}')
print(f'RMSE: {rmse:.4f}')

### 함수 선언 전에 컬럼명을 정리한 뒤 다시 실행

In [None]:
# def merge_and_predict(call119, solar):
#     import pandas as pd
#     from sklearn.model_selection import train_test_split
#     from sklearn.linear_model import LinearRegression
#     from sklearn.metrics import mean_squared_error, r2_score

#     # 컬럼명 정리
#     call119 = call119.rename(columns=lambda x: x.replace('call119_train.', '') if x.startswith('call119_train.') else x)

#     # 날짜 타입 변환
#     if 'TM' not in call119.columns:
#         call119['TM'] = pd.to_datetime(call119['tm'].astype(str), format='%Y%m%d')
#     else:
#         call119['TM'] = pd.to_datetime(call119['TM'])
#     solar['TM'] = pd.to_datetime(solar['TM'])

#     # 병합
#     merged = call119.merge(solar[['TM', 'avg_solar']], on='TM', how='left')

#     # 범주형 컬럼 자동 인코딩 (문자열 전부)
#     cat_cols = merged.select_dtypes(include=['object']).columns.tolist()
#     if 'TM' in cat_cols:
#         cat_cols.remove('TM')
#     merged = pd.get_dummies(merged, columns=cat_cols, drop_first=True)

#     # 결측치 제거(특히 타겟)
#     merged = merged.dropna(subset=['call_count'])

#     # 숫자형 피처만 추출
#     feature_cols = merged.select_dtypes(include=['float64', 'int64']).columns.tolist()
#     feature_cols = [c for c in feature_cols if c not in ['call_count', 'TM', 'stn', 'tm']]

#     # 결측치 보간
#     merged[feature_cols] = merged[feature_cols].interpolate(method='linear', limit_direction='both')

#     X = merged[feature_cols]
#     y = merged['call_count']

#     # 학습/평가
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#     model = LinearRegression()
#     model.fit(X_train, y_train)

#     y_pred = model.predict(X_test)
#     rmse = mean_squared_error(y_test, y_pred, squared=False)
#     r2 = r2_score(y_test, y_pred)

#     print(f"✅ RMSE: {rmse:.4f}")
#     print(f"✅ R^2: {r2:.4f}")

#     return merged, model, feature_cols


In [None]:
def merge_and_predict(call119, solar):
    """Merge solar data into the call data, fit a linear model and report metrics.

    Args:
        call119: Training frame with a 'TM' date column and a 'call_count'
            target column.
        solar: Frame with 'TM' and 'avg_solar' columns. It may hold several
            rows per date (e.g. one per station).

    Returns:
        A tuple ``(merged, model, feature_cols)``: the merged and encoded
        frame, the fitted ``LinearRegression``, and the list of feature
        column names used for training.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    import numpy as np

    # Merge on date only. `solar` is first collapsed to one (mean) value per
    # date: merging the raw table on 'TM' alone is a many-to-many join that
    # duplicates every call119 row once per solar row for that date. With a
    # single solar row per date the result is unchanged.
    daily_solar = solar.groupby('TM', as_index=False)['avg_solar'].mean()
    merged = call119.merge(daily_solar, on='TM', how='left')

    # One-hot encode every string (object) column
    cat_cols = merged.select_dtypes(include=['object']).columns.tolist()
    if 'TM' in cat_cols:
        cat_cols.remove('TM')
    merged = pd.get_dummies(merged, columns=cat_cols, drop_first=True)

    # Drop rows without a target value
    merged = merged.dropna(subset=['call_count'])

    # Keep float64 / int64 columns as features.
    # NOTE(review): get_dummies emits bool (uint8 on older pandas) columns, so
    # the dummies created above are not selected here -- confirm that
    # excluding them from the model is intended.
    feature_cols = merged.select_dtypes(include=['float64', 'int64']).columns.tolist()
    feature_cols = [c for c in feature_cols if c not in ['call_count', 'TM', 'stn', 'tm']]

    # Fill missing feature values by linear interpolation along the rows
    merged[feature_cols] = merged[feature_cols].interpolate(method='linear', limit_direction='both')

    X = merged[feature_cols]
    y = merged['call_count']

    # Fit on 80% of the rows and evaluate on the remaining 20%
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"✅ RMSE: {rmse:.4f}")
    print(f"✅ R^2: {r2:.4f}")

    return merged, model, feature_cols

In [None]:
# def preprocess_and_predict_test(test_call119, solar, model, train_columns):
#     import pandas as pd

#     # 1. 날짜 처리
#     test_call119['TM'] = pd.to_datetime(test_call119['TM'].astype(str), format='%Y%m%d')
#     solar['TM'] = pd.to_datetime(solar['TM'])

#     # 2. 컬럼명 정리
#     test_call119 = test_call119.rename(columns={
#         'stn': 'STN',
#         'ta_max': 'ta_max',
#         'ta_min': 'ta_min',
#         'ta_max_min': 'ta_max_min',
#         'hm_min': 'hm_min',
#         'hm_max': 'hm_max',
#         'ws_max': 'ws_max',
#         'ws_ins_max': 'ws_ins_max',
#         'rn_day': 'rn_day',
#         'sub_address': 'sub_address'
#     })

#     # 3. 병합
#     test_merged = test_call119.merge(solar, on=['TM', 'stn'], how='left')

#     # 4. 지역 더미 변수 (prefix 없이)
#     test_merged = pd.get_dummies(test_merged, columns=['sub_address'], drop_first=True)

#     # 5. 누락된 학습 컬럼 보정
#     for col in train_columns:
#         if col not in test_merged.columns:
#             test_merged[col] = 0

#     # 6. 컬럼 순서 정렬
#     test_merged = test_merged[train_columns]

#     # 7. 결측치 보간
#     numeric_cols = test_merged.select_dtypes(include=['float64', 'int64']).columns
#     test_merged[numeric_cols] = test_merged[numeric_cols].interpolate(method='linear', limit_direction='both')

#     # 8. 예측
#     y_pred = model.predict(test_merged)

#     return y_pred

In [None]:
def preprocess_and_predict_test(test_call119, solar, model, train_columns):
    """Prepare the test frame like the training frame and return predictions.

    Args:
        test_call119: Test frame with a 'TM' date column.
        solar: Frame with 'TM' and 'avg_solar' columns. It may hold several
            rows per date (e.g. one per station).
        model: Fitted estimator exposing ``predict``.
        train_columns: Feature column names, in the order used for training.

    Returns:
        Whatever ``model.predict`` returns: one prediction per row of
        ``test_call119``.
    """
    import pandas as pd

    # Merge on date only. `solar` is collapsed to one (mean) value per date
    # first: merging the raw table on 'TM' alone duplicates test rows whenever
    # a date has several solar rows, so the number of predictions would no
    # longer match the number of test rows.
    daily_solar = solar.groupby('TM', as_index=False)['avg_solar'].mean()
    test_merged = test_call119.merge(daily_solar, on='TM', how='left')

    # One-hot encode the region column when it is still present; a frame that
    # was already encoded upstream no longer has it and used to raise KeyError.
    if 'sub_address' in test_merged.columns:
        test_merged = pd.get_dummies(test_merged, columns=['sub_address'], drop_first=True)

    # Match the training layout in one step: same columns, same order,
    # columns missing from the test data filled with 0.
    test_merged = test_merged.reindex(columns=train_columns, fill_value=0)

    # Fill missing numeric values by linear interpolation along the rows
    numeric_cols = test_merged.select_dtypes(include=['float64', 'int64']).columns
    test_merged[numeric_cols] = test_merged[numeric_cols].interpolate(method='linear', limit_direction='both')

    y_pred = model.predict(test_merged)

    return y_pred

In [None]:
# 이상치(극단값)는 거의 없음 → 좋은 모델입니다.

# RMSE와 MAE의 차이는 자연스러운 현상이며, 예측이 어려운 일부 사례가 RMSE에 조금 더 영향을 준 정도입니다.

# R² = 0.9956, RMSE / 평균 = 1.43% → 실제로는 매우 안정적이고 정확한 모델이므로 걱정할 필요는 전혀 없습니다.