In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

In [None]:
###  INPUT ###
# Read the raw hackathon dataset from a CSV file located in the working directory.
input_data = pd.read_csv(filepath_or_buffer='2023_smartFarm_AI_hackathon_dataset.csv')


In [None]:
# Remove the 'frmDist' column. errors='ignore' keeps this cell re-runnable:
# input_data is rebound to its own trimmed copy, so a second execution would
# otherwise raise KeyError because the column is already gone.
input_data = input_data.drop(columns=['frmDist'], errors='ignore')

# Separate the feature columns (X) from the two regression targets (Y).
# The train/test split itself is done later with train_test_split.
X = input_data.drop(columns=['outtrn_cumsum','HeatingEnergyUsage_cumsum'])
Y = input_data[['outtrn_cumsum','HeatingEnergyUsage_cumsum']]

In [None]:
# Discard columns that never vary: nunique() counts distinct non-null values,
# and only columns with more than one such value are kept.
value_counts_per_column = X.nunique()
multi_value_columns = value_counts_per_column.index[(value_counts_per_column > 1).to_numpy()]
X = X.loc[:, multi_value_columns]

In [None]:
# Remove the 'frmYear' and 'frmWeek' columns from the feature frame.
# errors='ignore' makes the cell safe to execute again (X is rebound to its own
# trimmed copy) and also tolerates the case where the constant-column filter
# above has already removed one of them.
X = X.drop(columns=['frmYear', 'frmWeek'], errors='ignore')

In [None]:
# Every remaining column except 'date' is treated as a feature to be scaled.
num_features = X.columns[~X.columns.isin(['date'])]

In [None]:
# Min-max scale every non-date feature column of X.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed further down, so held-out rows contribute to the min/max.
scaler = MinMaxScaler()
X[num_features] = scaler.fit(X[num_features]).transform(X[num_features])

In [None]:
# Feature set X_1 (later paired with the 'outtrn_cumsum' target):
# all of X except the columns listed below.
X_1 = X.drop(
    labels=[
        'tcdmt',
        'WaterUsage',
        'WaterCost',
        'FertilizerUsage',
        'FertilizerCost',
        'CO2Usage',
        'CO2Cost',
        'MistUsageTime',
        'Mist Cost',
    ],
    axis='columns',
)

In [None]:
# Feature set X_2 (later paired with the 'HeatingEnergyUsage_cumsum' target):
# all of X except the columns listed below.
X_2 = X.drop(
    labels=[
        'WaterCost',
        'frmDov',
        'FertilizerCost',
        'frmAr',
        'stemThck',
        'CO2Cost',
        'inTp',
        'hvstCo',
        'inHd',
        'hvstGrupp',
        'frmhsFclu',
        'flanGrupp',
        'frtstGrupp',
        'inCo2',
        'outWs',
        'outTp',
        'cunt',
        'acSlrdQy',
        'otmsuplyqy',
        'Mist Cost',
    ],
    axis='columns',
)


In [None]:
# Parse X_1's yyyymmdd 'date' column and derive calendar features from it.
date = pd.to_datetime(X_1['date'], format='%Y%m%d')
X_1 = X_1.assign(
    year=date.dt.year,
    month=date.dt.month,
    week=date.dt.isocalendar().week.astype(np.int32),  # ISO week number as int32
    day=date.dt.weekday,  # day of the week (pandas weekday), not day of the month
)

In [None]:
# Parse X_2's yyyymmdd 'date' column and derive calendar features from it.
date = pd.to_datetime(X_2['date'], format='%Y%m%d')
X_2 = X_2.assign(
    year=date.dt.year,
    month=date.dt.month,
    week=date.dt.isocalendar().week.astype(np.int32),  # ISO week number as int32
    day=date.dt.weekday,  # day of the week (pandas weekday), not day of the month
)

In [None]:
# The derived calendar columns now stand in for the raw 'date' column, so remove
# it from both feature sets. errors='ignore' lets this cell be executed again
# without a KeyError once the column has already been dropped.
X_1 = X_1.drop(columns='date', errors='ignore')
X_2 = X_2.drop(columns='date', errors='ignore')

In [None]:
# Hold out 10% of the rows for each target. Both calls use the same test_size
# and random_state; X_1 and X_2 are column subsets of the same frame X.
X_train_01, X_test_01, y_train_01, y_test_01 = train_test_split(
    X_1,
    Y['outtrn_cumsum'],
    test_size=0.1,
    random_state=42,
)
X_train_02, X_test_02, y_train_02, y_test_02 = train_test_split(
    X_2,
    Y['HeatingEnergyUsage_cumsum'],
    test_size=0.1,
    random_state=42,
)

In [None]:
# One instance of each candidate regressor per target (suffix _01 / _02).
# The seeded estimators all use random_state=42.
linear_model_01, linear_model_02 = LinearRegression(), LinearRegression()
decision_tree_model_01, decision_tree_model_02 = (
    DecisionTreeRegressor(random_state=42) for _ in range(2)
)
random_forest_model_01, random_forest_model_02 = (
    RandomForestRegressor(random_state=42) for _ in range(2)
)
gradient_boosting_model_01, gradient_boosting_model_02 = (
    GradientBoostingRegressor(random_state=42) for _ in range(2)
)

In [None]:
# The models actually trained below: one 100-tree random forest per target,
# exposed under the ensemble_model_* names used by the fit/predict cell.
# RandomForestRegressor comes from the notebook's import cell; the duplicate
# import that used to sit here has been removed.

# ensemble_model_01
random_forest_model_01 = RandomForestRegressor(n_estimators=100, random_state=1)
ensemble_model_01 = random_forest_model_01

# ensemble_model_02
random_forest_model_02 = RandomForestRegressor(n_estimators=100, random_state=2)
ensemble_model_02 = random_forest_model_02


In [None]:
# Train each model on its training split, then predict its held-out split.
# fit() returns the fitted estimator, so predict() can be chained onto it.
y_pred_01 = ensemble_model_01.fit(X_train_01, y_train_01).predict(X_test_01)
y_pred_02 = ensemble_model_02.fit(X_train_02, y_train_02).predict(X_test_02)

In [None]:
# Hold-out RMSE and R-squared, computed separately for each target.
rmse_01, r2_01 = (
    np.sqrt(mean_squared_error(y_test_01, y_pred_01)),
    r2_score(y_test_01, y_pred_01),
)
rmse_02, r2_02 = (
    np.sqrt(mean_squared_error(y_test_02, y_pred_02)),
    r2_score(y_test_02, y_pred_02),
)

In [None]:
# Print the per-target metrics, one "label value" line each.
# The Korean labels read "dataset N ensemble model <metric>".
for metric_label, metric_value in (
    ("데이터셋 1 앙상블 모델 RMSE:", rmse_01),
    ("데이터셋 1 앙상블 모델 R-squared:", r2_01),
    ("데이터셋 2 앙상블 모델 RMSE:", rmse_02),
    ("데이터셋 2 앙상블 모델 R-squared:", r2_02),
):
    print(metric_label, metric_value)

데이터셋 1 앙상블 모델 RMSE: 1461.0234393376536
데이터셋 1 앙상블 모델 R-squared: 0.9985620731277804
데이터셋 2 앙상블 모델 RMSE: 38803.46034106135
데이터셋 2 앙상블 모델 R-squared: 0.9946655836239302


In [None]:
# Put the two held-out target Series side by side as a two-column frame.
# pd.concat aligns them on their row index.
y_test = pd.concat([y_test_01, y_test_02], axis='columns')

In [None]:
# Arrange the two prediction vectors as the columns of one 2-D array, then wrap
# it in a DataFrame so it lines up column-for-column with y_test.
y_pred_concatenated = np.stack([y_pred_01, y_pred_02], axis=1)
y_pred = pd.DataFrame(data=y_pred_concatenated, columns=['y_pred_01', 'y_pred_02'])

In [None]:
# Root mean squared error between the actual 'y' values and the predictions
def calculate_rmse(targets, predictions):
    """
    Return the square root of scikit-learn's mean squared error.

    Both arguments are handed to ``sklearn.metrics.mean_squared_error`` with
    its default settings, and the square root of that value is returned.

    :param targets: Target (ground-truth) values.
    :type targets: array-like
    :param predictions: Predicted values.
    :type predictions: array-like
    :return: RMSE value.
    :rtype: float
    """
    # Imported locally so the helper does not rely on the notebook's import cell.
    from sklearn.metrics import mean_squared_error

    mse = mean_squared_error(y_true=targets, y_pred=predictions)
    return np.sqrt(mse)


# R2 score between the actual 'y' values and the predictions
def calculate_R2_score(y_test,y_pred):
    """
    Return scikit-learn's R2 score for the given targets and predictions.

    :param y_test: Target (ground-truth) values.
    :type y_test: array-like
    :param y_pred: Predicted values.
    :type y_pred: array-like
    :return: Value returned by ``sklearn.metrics.r2_score`` with default settings.
    """
    # Imported locally so the helper does not rely on the notebook's import cell.
    from sklearn.metrics import r2_score

    score = r2_score(y_true=y_test, y_pred=y_pred)
    return score


# Score both targets together: the two-column truth frame against the
# two-column prediction frame.
rmse, r2score = (
    calculate_rmse(y_test, y_pred),
    calculate_R2_score(y_test, y_pred),
)

In [None]:
# Combined metrics over both targets, passed by parameter name.
# NOTE(review): the cell that defines the two helpers already performs these
# same two calls; this cell recomputes identical values.
rmse = calculate_rmse(targets=y_test, predictions=y_pred)
r2score = calculate_R2_score(y_test=y_test, y_pred=y_pred)

In [None]:
### OUTPUT ###
# Print the combined scores, one "label value" line per metric.
for score_label, score_value in (("RMSE:", rmse), ("R2_score:", r2score)):
    print(score_label, score_value)

RMSE: 27457.63212597379
R2_score: 0.9966138283758553
