## Import & Install

In [None]:
#!pip install -r requirements.txt

In [None]:
import pandas as pd
import random
import os
import numpy as np
import warnings
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from datetime import datetime

from sklearn.preprocessing import RobustScaler, PowerTransformer

from tqdm import tqdm



## Data Load

In [None]:
### INPUT ###
# Read the hackathon dataset; the CSV is looked up relative to the working directory.
input_data = pd.read_csv(filepath_or_buffer='2023_smartFarm_AI_hackathon_dataset.csv')

## Data Pre-processing

In [None]:
# Discard the 'frmDist' column before building the modelling frames.
input_data = input_data.drop('frmDist', axis='columns')

# Separate the two regression targets (Y) from the remaining feature columns (X).
target_columns = ['outtrn_cumsum', 'HeatingEnergyUsage_cumsum']
feature_columns = [name for name in input_data.columns if name not in target_columns]
X = input_data[feature_columns]
Y = input_data[target_columns]

In [None]:
# Replace every space in the feature column names with an underscore.
X.columns = [name.replace(' ', '_') for name in X.columns]

In [None]:
# Count the distinct values of each column, then keep only the columns that
# hold more than one distinct value (constant columns carry no signal).
value_counts_per_column = X.nunique()
multi_value_columns = value_counts_per_column.loc[lambda counts: counts > 1].index
X = X[multi_value_columns]

In [None]:
# Display the distinct-value count of every column; the counts were computed
# before the single-valued columns were filtered out of X.
value_counts_per_column

date                 771
inTp               72631
inHd               72631
otmsuplyqy         63091
acSlrdQy           53071
cunt               64561
ph                 66481
outTp              72631
outWs              38671
daysuplyqy             1
inCo2              72631
ec                 66481
frmYear                4
frmWeek               53
frtstGrupp         47461
lefstalklt             1
frtstSetCo             1
pllnLt                 1
flanGrupp          46621
frtstCo            51841
flanJnt                1
tcdmt               1441
frmhsFclu          47881
hvstGrupp          43051
hvstJnt                1
grwtLt             43861
fcluHg             42061
lefLt              43111
flwrCo                 1
hvstCo              6091
lefCunt            51511
frtstJnt               1
lefBt              43111
stemThck           41791
frmAr                  7
frmDov                 9
WaterUsage          8401
WaterCost           8401
FertilizerUsage     8371
FertilizerCost      8371


In [None]:
# Remove the 'frmYear' and 'frmWeek' columns from the feature frame.
X = X.drop(['frmYear', 'frmWeek'], axis='columns')

## Scaling

In [None]:
# Every column except 'date' is treated as a numeric feature to be scaled.
num_features = X.columns[~X.columns.isin(['date'])]

In [None]:
# Fit a power transform on the numeric columns and overwrite them in X with
# the transformed values.
# NOTE(review): the transformer is fitted on the whole dataset, i.e. before the
# train/test split performed further down - confirm that this is intended.
scaler = PowerTransformer()
transformed = scaler.fit_transform(X[num_features])
X[num_features] = transformed

In [None]:
"""
for column in X[num_features].columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(X[f'{column}'], kde=True)
    plt.title('Distribution of inTp')
    plt.xlabel(f'{column}')
    plt.ylabel('Frequency')
    plt.show()
"""

## Feature Selection

In [None]:
import shap

In [None]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [None]:
# Rank the numeric features by mean absolute SHAP value for 'outtrn_cumsum'.
# The model is fitted on the same frame that is explained afterwards.
X_importance = X[num_features]

# Explain the predictions of a LightGBM regressor with the shap library.
model = LGBMRegressor(random_state=42).fit(X_importance, Y['outtrn_cumsum'])
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_importance)

# Global importance as a bar plot.
shap.summary_plot(shap_values, X_importance, plot_type='bar')

# Mean |SHAP| per feature. The table is built from a dict so that
# 'shap_importance' keeps a float dtype; building it from a list of lists and
# transposing leaves both columns as generic Python objects.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({
    'column_name': X_importance.columns.tolist(),
    'shap_importance': shap_sum,
}).sort_values('shap_importance', ascending=False)
importance_df

In [None]:
# Rank the numeric features by mean absolute SHAP value for
# 'HeatingEnergyUsage_cumsum'. The model is fitted on the same frame that is
# explained afterwards.
X_importance = X[num_features]

# Explain the predictions of a CatBoost regressor with the shap library.
model = CatBoostRegressor(random_state=42).fit(X_importance, Y['HeatingEnergyUsage_cumsum'])
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_importance)

# Global importance as a bar plot.
shap.summary_plot(shap_values, X_importance, plot_type='bar')

# Mean |SHAP| per feature. The table is built from a dict so that
# 'shap_importance' keeps a float dtype; building it from a list of lists and
# transposing leaves both columns as generic Python objects.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({
    'column_name': X_importance.columns.tolist(),
    'shap_importance': shap_sum,
}).sort_values('shap_importance', ascending=False)
importance_df

In [None]:
# Correlation of each column of X with the heating-energy target, ranked by
# absolute value (strongest first).
correlations = X.corrwith(Y['HeatingEnergyUsage_cumsum'])
sorted_correlations = correlations.abs().sort_values(ascending=False)

# Horizontal bar chart drawn through the Axes object.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(sorted_correlations.index, sorted_correlations.values)
ax.set_xlabel('Absolute Correlation with HeatingEnergyUsage_cumsum')
ax.set_ylabel('Features')
ax.set_title('Feature Importance based on Correlation')
ax.invert_yaxis()  # strongest correlation ends up at the top of the chart
plt.show()

In [None]:
# Feature frame for the 'outtrn_cumsum' model: X without the columns below.
columns_excluded_for_outtrn = [
    'tcdmt',
    'WaterUsage',
    'WaterCost',
    'FertilizerUsage',
    'FertilizerCost',
    'CO2Usage',
    'CO2Cost',
    'MistUsageTime',
    'Mist_Cost',
]
X_1 = X.drop(columns=columns_excluded_for_outtrn)

In [None]:
# Feature frame for the 'HeatingEnergyUsage_cumsum' model: X without the
# columns below.
columns_excluded_for_heating = [
    'WaterCost',
    'frmDov',
    'FertilizerCost',
    'frmAr',
    'stemThck',
    'CO2Cost',
    'inTp',
    'hvstCo',
    'inHd',
    'hvstGrupp',
    'frmhsFclu',
    'flanGrupp',
    'frtstGrupp',
    'inCo2',
    'outWs',
    'outTp',
    'cunt',
    'acSlrdQy',
    'otmsuplyqy',
    'Mist_Cost',
]
X_2 = X.drop(columns=columns_excluded_for_heating)

## Add time variable

In [None]:
# Parse the YYYYMMDD 'date' column of X_1 and derive calendar features from it:
# year, month, ISO week number (as int32) and day of the week.
date = pd.to_datetime(X_1['date'], format='%Y%m%d')
X_1 = X_1.assign(
    year=date.dt.year,
    month=date.dt.month,
    week=date.dt.isocalendar().week.astype(np.int32),
    day=date.dt.weekday,
)

In [None]:
# Parse the YYYYMMDD 'date' column of X_2 and derive calendar features from it:
# year, month, ISO week number (as int32) and day of the week.
date = pd.to_datetime(X_2['date'], format='%Y%m%d')
X_2 = X_2.assign(
    year=date.dt.year,
    month=date.dt.month,
    week=date.dt.isocalendar().week.astype(np.int32),
    day=date.dt.weekday,
)

In [None]:
# The raw 'date' column is no longer needed once the calendar features exist.
X_1, X_2 = (frame.drop(columns='date') for frame in (X_1, X_2))

## Train_test_split

In [None]:
# One 80/20 split per target, both made with the same test_size and random_state.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train_01, X_test_01, y_train_01, y_test_01 = train_test_split(
    X_1, Y['outtrn_cumsum'], **split_options
)
X_train_02, X_test_02, y_train_02, y_test_02 = train_test_split(
    X_2, Y['HeatingEnergyUsage_cumsum'], **split_options
)

In [None]:
# Give every piece of the 'outtrn_cumsum' split a fresh 0..n-1 index.
X_train_01, X_test_01, y_train_01, y_test_01 = (
    part.reset_index(drop=True)
    for part in (X_train_01, X_test_01, y_train_01, y_test_01)
)

In [None]:
# Give every piece of the 'HeatingEnergyUsage_cumsum' split a fresh 0..n-1 index.
X_train_02, X_test_02, y_train_02, y_test_02 = (
    part.reset_index(drop=True)
    for part in (X_train_02, X_test_02, y_train_02, y_test_02)
)

## Modeling

In [None]:
# RMSE between the actual 'y' values and the predictions
def calculate_rmse(targets, predictions):
    """
    Return the root mean squared error of predictions against targets.

    :param targets: Target (ground-truth) values.
    :type targets: array-like
    :param predictions: Predicted values.
    :type predictions: array-like
    :return: Square root of scikit-learn's mean squared error.
    :rtype: float
    """
    from sklearn.metrics import mean_squared_error

    mse = mean_squared_error(targets, predictions)
    return np.sqrt(mse)


# R2 score between the actual 'y' values and the predictions
def calculate_R2_score(y_test, y_pred):
    """
    Return the coefficient of determination (R2) of predictions against targets.

    :param y_test: Target (ground-truth) values.
    :type y_test: array-like
    :param y_pred: Predicted values.
    :type y_pred: array-like
    :return: R2 score as computed by scikit-learn's ``r2_score``.
    :rtype: float
    """
    from sklearn.metrics import r2_score

    score = r2_score(y_test, y_pred)
    return score

## Baseline models (LightGBM, CatBoost, XGBoost) and tuning

In [None]:
# Rebuild the heating-energy feature frame from every column of X and split it
# again.
# NOTE(review): this replaces the reduced X_2 and the *_02 split created in the
# earlier cells - confirm that discarding that feature selection is intended.

# Work on a copy: a plain `X_2 = X` only aliases X, so the calendar columns
# written below would also be added to X itself.
X_2 = X.copy()
date = pd.to_datetime(X_2['date'], format='%Y%m%d')
X_2.loc[:, 'year'] = date.dt.year
X_2.loc[:, 'month'] = date.dt.month
X_2.loc[:, 'week'] = date.dt.isocalendar().week.astype(np.int32)
X_2.loc[:, 'day'] = date.dt.weekday

# The raw date is not used as a feature once the calendar columns exist.
X_2 = X_2.drop(columns='date')

X_train_02, X_test_02, y_train_02, y_test_02 = train_test_split(X_2, Y['HeatingEnergyUsage_cumsum'], test_size=0.2, random_state=42)

# Fresh 0..n-1 index on every piece of the split.
X_train_02 = X_train_02.reset_index(drop=True)
X_test_02 = X_test_02.reset_index(drop=True)
y_train_02 = y_train_02.reset_index(drop=True)
y_test_02 = y_test_02.reset_index(drop=True)

In [None]:
# LightGBM baseline with default hyper-parameters and a fixed seed:
# fit on the training split, predict the test split.
model = LGBMRegressor(random_state=42).fit(X_train_02, y_train_02)
y_pred = model.predict(X_test_02)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7245
[LightGBM] [Info] Number of data points in the train set: 67872, number of used features: 37
[LightGBM] [Info] Start training from score 135668.753849


In [None]:
# Score the current predictions (`y_pred`) against the heating-energy test targets.
rmse = calculate_rmse(targets=y_test_02, predictions=y_pred)
r2score = calculate_R2_score(y_test=y_test_02, y_pred=y_pred)

### OUTPUT ###
print("RMSE:", rmse)
print("R2_score:", r2score)

RMSE: 56542.9560601135
R2_score: 0.9897099672894213


In [None]:
# CatBoost baseline with a fixed seed: fit on the training split, predict the
# test split.
# NOTE(review): task_type="GPU" is hard-coded, so this cell presumably needs a
# GPU-enabled CatBoost environment - confirm before running elsewhere.
model = CatBoostRegressor(task_type="GPU", random_state=42).fit(X_train_02, y_train_02)
y_pred = model.predict(X_test_02)

In [None]:
# Score the current predictions (`y_pred`) against the heating-energy test targets.
rmse = calculate_rmse(targets=y_test_02, predictions=y_pred)
r2score = calculate_R2_score(y_test=y_test_02, y_pred=y_pred)

### OUTPUT ###
print("RMSE:", rmse)
print("R2_score:", r2score)

In [None]:
# XGBoost baseline with default hyper-parameters and a fixed seed:
# fit on the training split, predict the test split.
model = XGBRegressor(random_state=42).fit(X_train_02, y_train_02)
y_pred = model.predict(X_test_02)

In [None]:
# Score the current predictions (`y_pred`) against the heating-energy test targets.
rmse = calculate_rmse(targets=y_test_02, predictions=y_pred)
r2score = calculate_R2_score(y_test=y_test_02, y_pred=y_pred)

### OUTPUT ###
print("RMSE:", rmse)
print("R2_score:", r2score)

RMSE: 57708.2601513269
R2_score: 0.9892814583966929


In [None]:
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

In [None]:
import optuna

In [None]:
def objective(trial):
    """
    Optuna objective for the 'outtrn_cumsum' LightGBM model.

    Samples one hyper-parameter configuration, trains it on part of the
    training split and returns the RMSE on a validation part carved out of
    that same training split. The test split (X_test_01 / y_test_01) is
    deliberately not used here: scoring trials on it tunes the
    hyper-parameters to the test data and makes the final test metrics
    optimistic.

    :param trial: Optuna trial used to sample the hyper-parameters.
    :return: Validation RMSE of the sampled configuration (lower is better).
    :rtype: float
    """
    # NOTE(review): per the LightGBM docs, bagging (`subsample`) only takes
    # effect when `subsample_freq` is non-zero - confirm whether it should be
    # enabled for this search.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 16, 200),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 42
    }

    # Fixed seed: every trial is scored on the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_01, y_train_01, test_size=0.2, random_state=42
    )

    model = LGBMRegressor(**params)
    model.fit(X_fit, y_fit)
    y_val_pred = model.predict(X_val)

    return calculate_rmse(y_val, y_val_pred)

In [None]:
# Minimise the objective over 100 trials. The sampler is seeded so the search
# is repeatable, in line with the random_state=42 used elsewhere in the notebook.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
def objective(trial):
    """
    Optuna objective for the 'HeatingEnergyUsage_cumsum' LightGBM model.

    Samples one hyper-parameter configuration, trains it on part of the
    training split and returns the RMSE on a validation part carved out of
    that same training split. The test split (X_test_02 / y_test_02) is
    deliberately not used here: scoring trials on it tunes the
    hyper-parameters to the test data and makes the final test metrics
    optimistic.

    :param trial: Optuna trial used to sample the hyper-parameters.
    :return: Validation RMSE of the sampled configuration (lower is better).
    :rtype: float
    """
    # NOTE(review): per the LightGBM docs, bagging (`subsample`) only takes
    # effect when `subsample_freq` is non-zero - confirm whether it should be
    # enabled for this search.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 16, 200),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 42
    }

    # Fixed seed: every trial is scored on the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_02, y_train_02, test_size=0.2, random_state=42
    )

    model = LGBMRegressor(**params)
    model.fit(X_fit, y_fit)
    y_val_pred = model.predict(X_val)

    return calculate_rmse(y_val, y_val_pred)

In [None]:
# Minimise the objective over 100 trials. The sampler is seeded so the search
# is repeatable, in line with the random_state=42 used elsewhere in the notebook.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
# Put the test targets of both models side by side, one column per target.
y_test = pd.concat(objs=[y_test_01, y_test_02], axis='columns')

In [None]:
# y_pred_01 / y_pred_02 were used here without ever being assigned, which
# raised a NameError. Fit one final LightGBM model per target on its own
# training split and predict the matching test split.
# NOTE(review): both Optuna searches are bound to the same name `study`, so
# only the most recent one (HeatingEnergyUsage_cumsum) is still available at
# this point; the outtrn_cumsum model therefore uses default hyper-parameters
# until the first search result is kept under its own name.
y_pred_01 = LGBMRegressor(random_state=42).fit(X_train_01, y_train_01).predict(X_test_01)
y_pred_02 = (
    LGBMRegressor(random_state=42, **study.best_params)
    .fit(X_train_02, y_train_02)
    .predict(X_test_02)
)

# One column of predictions per target, in the same order as y_test.
y_pred_concatenated = np.column_stack((y_pred_01, y_pred_02))
y_pred = pd.DataFrame(y_pred_concatenated, columns=['y_pred_01', 'y_pred_02'])

In [None]:
# Metrics computed over both target columns at once.
rmse = calculate_rmse(targets=y_test, predictions=y_pred)
r2score = calculate_R2_score(y_test=y_test, y_pred=y_pred)

In [None]:
### OUTPUT ###
# Print each metric on its own line, label first.
for label, score in (("RMSE:", rmse), ("R2_score:", r2score)):
    print(label, score)

In [None]:
# Display the prediction frame (one column per target model).
y_pred