In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [24]:
### input ###

# Read the hackathon dataset from a CSV file located relative to the working directory.
input_data = pd.read_csv(filepath_or_buffer='2023_smartFarm_AI_hackathon_dataset.csv')

# Show the first five rows as plain text, then let the cell echo the (rows, columns) tuple.
print(input_data.head(5))
input_data.shape

    frmDist      date  inTp  inHd  otmsuplyqy  acSlrdQy  cunt   ph  outTp  \
0  DBSF1059  20220926   0.0   0.0         0.0       0.0   0.0  0.0    0.0   
1  DBSF1059  20221001   0.0   0.0         0.0       0.0   0.0  0.0    0.0   
2  DBSF1059  20221002   0.0   0.0         0.0       0.0   0.0  0.0    0.0   
3  DBSF1059  20221003   0.0   0.0         0.0       0.0   0.0  0.0    0.0   
4  DBSF1059  20221004   0.0   0.0         0.0       0.0   0.0  0.0    0.0   

   outWs  ...  outtrn_cumsum   WaterUsage   WaterCost  FertilizerUsage  \
0    0.0  ...            0.0     0.000000    0.000000         0.000000   
1    0.0  ...            0.0  1347.554178  202.133127      4711.308821   
2    0.0  ...            0.0    69.899353   10.484903       271.379890   
3    0.0  ...            0.0   412.225065   61.833760      1539.505534   
4    0.0  ...            0.0   590.608634   88.591295      2614.973165   

   FertilizerCost     CO2Usage    CO2Cost  MistUsageTime    Mist Cost  \
0        0.000000  

(84840, 47)

In [25]:
# Keep only the last four characters of every frmDist value, discarding whatever precedes them.
# NOTE(review): presumably this strips an alphabetic prefix such as "DBSF" — confirm all ids share that layout.
input_data["frmDist"] = input_data["frmDist"].str.slice(start=-4)

# Preview the result as text and echo the frame's shape as the cell value.
print(input_data.head(5))
input_data.shape

  frmDist      date  inTp  inHd  otmsuplyqy  acSlrdQy  cunt   ph  outTp  \
0    1059  20220926   0.0   0.0         0.0       0.0   0.0  0.0    0.0   
1    1059  20221001   0.0   0.0         0.0       0.0   0.0  0.0    0.0   
2    1059  20221002   0.0   0.0         0.0       0.0   0.0  0.0    0.0   
3    1059  20221003   0.0   0.0         0.0       0.0   0.0  0.0    0.0   
4    1059  20221004   0.0   0.0         0.0       0.0   0.0  0.0    0.0   

   outWs  ...  outtrn_cumsum   WaterUsage   WaterCost  FertilizerUsage  \
0    0.0  ...            0.0     0.000000    0.000000         0.000000   
1    0.0  ...            0.0  1347.554178  202.133127      4711.308821   
2    0.0  ...            0.0    69.899353   10.484903       271.379890   
3    0.0  ...            0.0   412.225065   61.833760      1539.505534   
4    0.0  ...            0.0   590.608634   88.591295      2614.973165   

   FertilizerCost     CO2Usage    CO2Cost  MistUsageTime    Mist Cost  \
0        0.000000     0.000000 

(84840, 47)

In [26]:
# One-hot encode frmDist: the column is replaced by one frmDist_<value> indicator column per distinct value.
input_data = pd.get_dummies(data=input_data, columns=["frmDist"])

# Preview the widened frame as text and echo its shape as the cell value.
print(input_data.head(5))
input_data.shape

       date  inTp  inHd  otmsuplyqy  acSlrdQy  cunt   ph  outTp  outWs  \
0  20220926   0.0   0.0         0.0       0.0   0.0  0.0    0.0    0.0   
1  20221001   0.0   0.0         0.0       0.0   0.0  0.0    0.0    0.0   
2  20221002   0.0   0.0         0.0       0.0   0.0  0.0    0.0    0.0   
3  20221003   0.0   0.0         0.0       0.0   0.0  0.0    0.0    0.0   
4  20221004   0.0   0.0         0.0       0.0   0.0  0.0    0.0    0.0   

   daysuplyqy  ...  frmDist_9761  frmDist_9777  frmDist_9793  frmDist_9838  \
0         0.0  ...             0             0             0             0   
1         0.0  ...             0             0             0             0   
2         0.0  ...             0             0             0             0   
3         0.0  ...             0             0             0             0   
4         0.0  ...             0             0             0             0   

   frmDist_9894  frmDist_9922  frmDist_9942  frmDist_9948  frmDist_9961  \
0          

(84840, 406)

In [27]:
# Targets: the two cumulative columns. Features: every remaining column of input_data.
y = input_data[['outtrn_cumsum', 'HeatingEnergyUsage_cumsum']]
X = input_data.drop(columns=list(y.columns))

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Build an XGBoost regression model; 'reg:squarederror' is the squared-error objective for regression.
model = xgb.XGBRegressor(objective='reg:squarederror')

# Fit on the training split, then predict for the held-out rows.
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# NOTE(review): y_test is expected to hold two target columns, so with scikit-learn's default
# multioutput averaging each score below is one number combined across both targets —
# confirm that a combined score is the intended summary.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

### output ###
print("RMSE:", rmse)
print("R2:", r2)

RMSE: 57214.45201949978
R2: 0.9364296255028436


In [14]:
### output ###
# Re-print the evaluation metrics; relies on rmse and r2 already being defined in the kernel.
print("RMSE:", rmse)
print("R2:", r2)

RMSE: 57214.45201949978
R2: 0.9364296255028436
