In [36]:
import pandas as pd
import numpy as np 
import os
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer, roc_auc_score, mean_squared_error,mean_absolute_error
from sklearn.cluster import KMeans
from scipy import stats
from sklearn.model_selection import GridSearchCV
from fbprophet.diagnostics import cross_validation
import joblib
# import optuna
# import xgboost as xgb
import catboost
# import lightgbm as lgb
from skopt import BayesSearchCV
from fbprophet import Prophet
from prophet.diagnostics import performance_metrics

In [123]:
# Load the prepared history and rename the target column to "y"; Prophet
# expects the timestamp in "ds" and the target in "y".
data = pd.read_csv('./prep_data.csv').rename(columns={"평균기온": "y"})
data["ds"] = pd.to_datetime(data["ds"])

# Calendar features derived from the timestamp (used by the XGBoost cells).
data["days"] = data["ds"].dt.day
data["years"] = data["ds"].dt.year

# Submission template; its prediction column is filled in further down.
summit_form = pd.read_csv('./sample_submission.csv')

# Every column except the target and the timestamp.
regressors = [col for col in data.columns if col not in ("y", "ds")]

In [124]:
# Hold out the last 365 rows for validation; everything before them is
# training data (presumably one year of daily rows — confirm against the CSV).
train = data.iloc[:-365]
test = data.iloc[-365:]

# Prophet-only view of the training data: timestamp and target, no features.
train_nofeature = train.loc[:, ["ds", "y"]]

# Hold-out inputs (timestamp only) and ground truth.
test_x = test.loc[:, ["ds"]]
test_y = test["y"]

In [109]:
## Validation run (fit on the training split only)

# Work on a copy so the split itself is never modified.
df = train_nofeature.copy()

# Prophet with daily seasonality switched on and a seasonality prior scale
# of 0.01.
model = Prophet(seasonality_prior_scale=0.01, daily_seasonality=True)

# Fit on the timestamp/target pairs.
model.fit(df)


<fbprophet.forecaster.Prophet at 0x18cff353348>

In [110]:
# Reference MAE hard-coded from an earlier run (its origin is not shown in
# this notebook — TODO confirm which configuration produced it).
default = 2.469308583208345

# Predict the hold-out dates and score them against the ground truth.
forecast = model.predict(test_x)
answer = forecast.yhat
# Computed once; the original evaluated the metric twice and discarded the
# first result.
mae = mean_absolute_error(test_y, answer)

# Positive => this run has a lower MAE than the reference.
print(default - mae)

-0.008745691311886628


In [85]:
## Submission run (refit on the full history)

df = data[["ds", "y"]].copy()

# A Prophet object can only be fitted once, and `model` was already fitted on
# the training split above, so build a fresh instance with the same settings
# before fitting on the full history.
model = Prophet(daily_seasonality=True, seasonality_prior_scale=0.01)
model.fit(df)

# Future dates: make_future_dataframe returns history + future, so keep only
# the trailing 358 rows. The horizon is 358 days (not 365) — presumably the
# number of rows in the submission template; confirm against sample_submission.
future = model.make_future_dataframe(periods=358)[-358:]

# Predict once and reuse the result (the original predicted twice).
forecast = model.predict(future)

# Assign by position so the values cannot be misaligned by index labels.
summit_form['평균기온'] = forecast.yhat.values

# index=False keeps the row index out of the file, matching the other
# submission writes in this notebook. `mae` is the validation MAE computed in
# an earlier cell.
summit_form.to_csv(f"./answer/prophet_mae_{mae}.csv", index=False)

<fbprophet.forecaster.Prophet at 0x18cff380308>

In [113]:
# Display the submission frame to eyeball the predicted values.
summit_form

Unnamed: 0,일시,평균기온
0,2023-01-01,-2.397883
1,2023-01-02,-2.507636
2,2023-01-03,-2.635723
3,2023-01-04,-2.706753
4,2023-01-05,-2.747967
...,...,...
353,2023-12-20,-0.918986
354,2023-12-21,-1.032896
355,2023-12-22,-1.172703
356,2023-12-23,-1.241481


In [114]:
import xgboost as xgb

In [144]:
# Calendar-only feature set (year, month, day-of-month) for both splits.
# NOTE(review): the following cell rebinds the same four names with a wider
# feature set, so on a top-to-bottom run these assignments are overwritten.
X_train = train.loc[:, ["years", "월", "days"]]
y_train = train["y"]
X_test = test.loc[:, ["years", "월", "days"]]
y_test = test["y"]

In [126]:
# Feature set made of every column except the target and the timestamp.
# NOTE(review): this rebinds X_train/X_test from the previous cell. On a
# top-to-bottom run the models below train on this wider set, while the
# recorded execution counts (In [126] here, In [144] above) suggest the saved
# outputs were produced with the calendar-only set — decide which is intended.
X_train = train.drop(columns=["y", "ds"])
y_train = train["y"]
X_test = test.drop(columns=["y", "ds"])
y_test = test["y"]

In [145]:
# Baseline XGBoost regressor: squared-error objective, fixed seed, all other
# hyperparameters left at the library defaults.
model = xgb.XGBRegressor(random_state=42, objective="reg:squarederror")
model.fit(X_train, y_train)

# Predict the hold-out split with the fitted model.
y_pred = model.predict(X_test)

# Score with mean absolute error.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 2.840700222278294


In [146]:
# Unfitted estimator handed to the grid search.
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Hyperparameter grid: 4 * 5 * 6 * 3 = 360 combinations.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.07, 0.1],
    max_depth=[3, 4, 6, 8, 10],
    n_estimators=[50, 100, 200, 300, 400, 500],
    subsample=[0.8, 0.9, 0.7],
)

# Exhaustive search scored by negative MAE with 3-fold cross-validation
# (360 combinations * 3 folds = 1080 fits, plus the final refit).
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
grid_result = grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best Parameters:", grid_result.best_params_)

# Predict the hold-out split with the refitted best estimator.
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)

# Score with mean absolute error.
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on Test Set: {mae}")

Best Parameters: {'learning_rate': 0.07, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.9}
Mean Absolute Error on Test Set: 2.5338632114292823


In [147]:
# Rebuild an unfitted regressor from the grid-search winner so it can be
# refitted on a different dataset further down.
params = grid_result.best_params_
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, **params)

In [156]:
# Full-history training frame for the submission model: three calendar
# features plus the target.
data_for_summit = data[["years", "월", "days", "y"]]

# Build the matching feature matrix for the submission dates from the
# template's date column.
forms = summit_form.copy()
forms["일시"] = pd.to_datetime(forms["일시"])
forms["years"] = forms["일시"].dt.year
forms["월"] = forms["일시"].dt.month
forms["days"] = forms["일시"].dt.day

# Keep only the feature columns, in the same order as the training frame
# (years, 월, days). The original left them as (days, 월, years); feature
# names/order that differ from training make XGBoost either reject the input
# or read the wrong column for each feature.
forms = forms[["years", "월", "days"]]

In [157]:
# Refit the tuned regressor on the full history.
train_features = data_for_summit.drop(columns=["y"])
model.fit(train_features, data_for_summit["y"])

# Align the prediction frame to the training columns (names and order) so
# each feature is read from the column it was trained on.
summit_pred = model.predict(forms[train_features.columns])

# Bracket assignment always writes a column; attribute-style assignment can
# silently create a plain attribute when the column is absent, and warnings
# are suppressed in this notebook.
summit_form["평균기온"] = summit_pred

# These are XGBoost predictions, so label the file accordingly instead of
# reusing the Prophet filename pattern. `mae` is whatever validation MAE was
# computed last before this cell runs.
summit_form.to_csv(f"./answer/xgb_mae_{mae}.csv", index=False)

In [133]:
# Predictions of the grid-search winner on both splits; they are fed to
# Prophet as an extra regressor below. The training-split predictions are
# in-sample (best_model was fitted on X_train).
y_pred_train, y_pred_test = (
    best_model.predict(features) for features in (X_train, X_test)
)

In [134]:
## Validation run (Prophet with the XGBoost prediction as an extra regressor)

# Timestamp/target pairs plus the XGBoost predictions as column "xgb";
# assign() returns a new frame, so the split itself is untouched.
df = train_nofeature.assign(xgb=y_pred_train)

# Prophet with daily seasonality switched on.
model = Prophet(daily_seasonality=True)

# Register "xgb" so Prophet uses that column during fit and predict.
model.add_regressor('xgb')

# Fit on the training split.
model.fit(df)

<fbprophet.forecaster.Prophet at 0x18d04732b48>

In [135]:
# Hold-out timestamps plus the XGBoost predictions for the same rows, under
# the regressor name the Prophet model was given.
test_x_xgb = test_x.assign(xgb=y_pred_test)

In [140]:
# Reference MAE hard-coded from an earlier run (its origin is not shown in
# this notebook — TODO confirm which configuration produced it).
default = 2.469308583208345

# Predict the hold-out rows (timestamp + "xgb" regressor) and score them.
forecast = model.predict(test_x_xgb)
answer = forecast.yhat
# Computed once; the original evaluated the metric twice and discarded the
# first result.
mae = mean_absolute_error(test_y, answer)

# Positive => this run has a lower MAE than the reference.
print(default - mae)

2.1061942857975446


In [139]:
# Keep the first 358 hold-out predictions (358 is the horizon used by the
# submission cells in this notebook).
answer = answer.head(358)

In [None]:
## Submission run (Prophet + XGBoost regressor, refit on the full history)
#
# The original cell called model.fit() on the already-fitted Prophet object
# (Prophet allows a single fit per instance) and passed frames without the
# registered "xgb" regressor column, so it could not run. This version
# rebuilds the whole stack on the full history.
# NOTE(review): the future rows only have dates, so the XGBoost stage here
# uses calendar features only; that may differ from the feature set behind the
# validation `mae` used in the filename — confirm this is the intended stack.

date_features = ["years", "월", "days"]

# Stage 1: tuned XGBoost on calendar features, fitted on the full history.
xgb_full = xgb.XGBRegressor(**params, objective="reg:squarederror", random_state=42)
xgb_full.fit(data[date_features], data["y"])

# Stage 2 training frame: timestamp, target and the stage-1 predictions.
df = data[["ds", "y"]].copy()
df["xgb"] = xgb_full.predict(data[date_features])

# Fresh Prophet instance with the same settings as the validation run.
model = Prophet(daily_seasonality=True)
model.add_regressor('xgb')
model.fit(df)

# Future dates: make_future_dataframe returns history + future, so keep only
# the trailing 358 rows (358-day horizon, not 365).
future = model.make_future_dataframe(periods=358)[-358:].copy()

# Calendar features for the future dates, in training column order.
# Assumes the "월" column of `data` holds the calendar month number, as the
# submission-feature cell above does — TODO confirm.
future_features = pd.DataFrame(
    {
        "years": future["ds"].dt.year,
        "월": future["ds"].dt.month,
        "days": future["ds"].dt.day,
    }
)[date_features]
future["xgb"] = xgb_full.predict(future_features)

# Predict once and reuse the result (the original predicted twice).
forecast = model.predict(future)

# Assign by position so the values cannot be misaligned by index labels.
summit_form['평균기온'] = forecast.yhat.values

# index=False keeps the row index out of the file, matching the other
# submission writes in this notebook.
summit_form.to_csv(f"./answer/prophet_mae_{mae}.csv", index=False)

In [142]:
# Write `answer` into the submission template and save it without the index.
# NOTE(review): `answer` comes from predicting the hold-out rows of `data`
# (truncated to 358 values), not from forecasting the submission dates —
# confirm that submitting these values is intended.
submission_path = f"./answer/prophet_mae_{mae}.csv"
summit_form['평균기온'] = answer
summit_form.to_csv(submission_path, index=False)