# This notebook uses the original train and test datasets

In [2]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import holidays
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import seaborn as sns
import datetime
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [3]:
# load the three competition CSVs; the `date` column is parsed to datetimes on read

train, test, sample = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in ('./train_E1GspfA.csv', './test_6QvDdzb.csv', './sample_4E0BhPN.csv')
)

In [4]:
# index the training rows by full timestamp: calendar date plus the `hour` offset

train.set_index(
    pd.to_datetime(train['date']) + pd.to_timedelta(train['hour'], unit='h'),
    inplace=True,
)

In [5]:
# index the test rows by full timestamp: calendar date plus the `hour` offset

test.set_index(
    pd.to_datetime(test['date']) + pd.to_timedelta(test['hour'], unit='h'),
    inplace=True,
)

# Feature Engineering

In [6]:
# convert datetime to day, month, year

def convert_datetime_to_day_month_year(df):
    """Add `day`, `month` and `year` columns derived from the datetime index.

    The frame is modified in place and also returned so calls can be chained.
    """
    timestamps = df.index

    # each column is named after the index attribute it is read from
    for part in ('day', 'month', 'year'):
        df[part] = getattr(timestamps, part)

    return df
    

In [7]:
# get day of week

def day_of_week(df):
    """Add a `day_of_week` column taken from the datetime index (Monday=0 ... Sunday=6).

    The frame is modified in place and also returned.
    """
    df['day_of_week'] = df.index.dayofweek
    return df

In [8]:
# if weekend or weekday

def is_weekend(df):
    """Add an `is_weekend` flag: 1 for Saturday/Sunday, 0 otherwise.

    Reads the existing `day_of_week` column (Monday=0 ... Sunday=6); the index
    is not used. The frame is modified in place and also returned.
    """
    # days 0-4 are Monday-Friday; everything else is flagged as weekend
    df['is_weekend'] = np.where(df['day_of_week'] < 5, 0, 1)
    return df

In [9]:
# week of year

def week_of_year(df):
    """Add a `week_of_year` column holding the ISO week number of each index timestamp.

    The frame is modified in place and also returned.
    """
    iso_calendar = df.index.isocalendar()
    df['week_of_year'] = iso_calendar['week'].astype(int)
    return df

In [10]:
# Add holidays

def is_holiday(df):
    """Add an `is_holiday` flag based on the calendar date of each index entry.

    A row gets 1 when its date is contained in either `holidays.India()` or
    `holidays.US()`, and 0 otherwise. The frame is modified in place and also
    returned.
    """
    india_calendar = holidays.India()
    us_calendar = holidays.US()

    flags = []
    for day in df.index.date:
        # both calendars are consulted for every date
        observed = (day in india_calendar) | (day in us_calendar)
        flags.append(1 if observed else 0)

    df['is_holiday'] = flags
    return df



In [11]:
# whether it is day or night

def is_day(df):
    """Add an `is_day` flag: 1 when `hour` is between 6 and 18 inclusive, else 0.

    Computed with a vectorised comparison instead of a row-wise `apply`, so it
    scales to large frames and also works on an empty frame. The frame is
    modified in place and also returned.
    """
    # Series.between is inclusive on both ends by default, matching 6 <= hour <= 18
    df['is_day'] = df['hour'].between(6, 18).astype(int)

    return df
    

In [12]:
# seasons

def which_season(df):
    """Add a `season` column (1-4) computed from the `month` column.

    Months 12, 1, 2 map to 1; 3-5 to 2; 6-8 to 3; 9-11 to 4. The frame is
    modified in place and also returned.
    """
    # wrap December to 0 so that Dec/Jan/Feb fall into the same 3-month bucket
    wrapped_month = df['month'] % 12
    df['season'] = wrapped_month // 3 + 1

    return df

In [13]:
# convert hour to sin and cosine components

def cyclic_hour(df):
    """Encode `hour` on the unit circle as `sin_hour` / `cos_hour` (period 24).

    The frame is modified in place and also returned.
    """
    hours_in_day = 24

    # angle of each hour on a 24-hour clock face, in radians
    angle = 2. * np.pi * df['hour'] / hours_in_day

    df['sin_hour'] = np.sin(angle)
    df['cos_hour'] = np.cos(angle)

    return df

In [14]:
# convert days of week into sin and cosine component

def cyclic_week(df):
    """Encode `day_of_week` on the unit circle as `sin_week` / `cos_week`.

    `day_of_week` takes seven distinct values (Monday=0 ... Sunday=6), so the
    period of the encoding must be 7. With a period of 6, Monday (0) and
    Sunday (6) would land on the same point of the circle and become
    indistinguishable. The frame is modified in place and also returned.
    """
    # monday = 0, sunday = 6 -> seven positions around the circle
    days_in_week = 7

    angle = 2. * np.pi * df['day_of_week'] / days_in_week

    df['sin_week'] = np.sin(angle)
    df['cos_week'] = np.cos(angle)

    return df

In [15]:
# convert months into sin and cosine components

def cyclic_month(df):
    """Encode `month` on the unit circle as `sin_month` / `cos_month` (period 12).

    The frame is modified in place and also returned.
    """
    months_in_year = 12

    # angle of each month on a 12-position circle, in radians
    angle = 2. * np.pi * df['month'] / months_in_year

    df['sin_month'] = np.sin(angle)
    df['cos_month'] = np.cos(angle)

    return df

In [16]:
# Add new features

def add_features(df):
    """Run every feature-engineering step on `df` and return the enriched frame.

    Each step adds its columns to the frame it receives and returns it, so the
    input frame itself ends up carrying the new columns. The order matters:
    `is_weekend` and `cyclic_week` read `day_of_week`, while `which_season`
    and `cyclic_month` read `month`.
    """
    feature_steps = (
        convert_datetime_to_day_month_year,  # day, month, year
        day_of_week,                         # day_of_week
        is_weekend,                          # is_weekend
        week_of_year,                        # week_of_year
        is_holiday,                          # is_holiday
        is_day,                              # is_day
        which_season,                        # season
        cyclic_hour,                         # sin_hour, cos_hour
        cyclic_month,                        # sin_month, cos_month
        cyclic_week,                         # sin_week, cos_week
    )

    for step in feature_steps:
        df = step(df)

    return df

In [17]:
# engineered training frame; add_features writes the new columns onto `train` itself and returns it
train_with_features = add_features(train)

In [18]:
# engineered test frame; add_features writes the new columns onto `test` itself and returns it
test_with_features = add_features(test)

# Modelling

In [19]:
# prepare train and test

def prepare_df(train):
    """Split a training frame into a feature matrix and a target array.

    Returns
    -------
    X_train : DataFrame
        Copy of `train` without the `date` and `demand` columns.
    y_train : array
        Values of the `demand` column.
    """
    features = train.drop(['date', 'demand'], axis=1).copy()
    target = train['demand'].copy()

    return features, target.values

In [43]:
# feature matrix (everything except `date`/`demand`) and target array for fitting
X_train, y_train = prepare_df(train_with_features)

In [44]:
# test feature matrix: only `date` is dropped here, mirroring the columns removed in prepare_df minus the target
X_test = test_with_features.drop(columns=['date']).copy()

In [57]:
def model_fit_predict(model, X_train, y_train, X_test):
    """Fit `model` on the training data and return the test frame with predictions.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
    X_train, y_train : training features and target
    X_test : test features

    Returns
    -------
    DataFrame
        A copy of `X_test` with an extra `demand` column holding the
        predictions cast to int (the cast truncates toward zero). `X_test`
        itself is left untouched.
    """
    model.fit(X_train, y_train)
    predicted_demand = model.predict(X_test)

    # work on a copy so the caller's test features are not modified
    scored = X_test.copy()
    scored.loc[:, 'demand'] = predicted_demand.astype(int)

    return scored
    

## LightGBM

In [58]:
# hyperparameters found with optuna
# NOTE(review): LightGBM documents `subsample` as an alias of `bagging_fraction`;
# both are set below with different values (0.77 vs 0.5) — confirm which is intended.
# NOTE(review): with max_depth=8 a tree has at most 2**8 = 256 leaves, so
# num_leaves=2180 presumably never binds — confirm.

params = dict(
    bagging_fraction=0.5,
    bagging_freq=1,
    feature_fraction=0.6,
    lambda_l1=5,
    lambda_l2=20,
    learning_rate=0.095,
    max_depth=8,
    min_child_weight=28.3,
    min_data_in_leaf=100,
    n_estimators=1000,
    num_leaves=2180,
    subsample=0.77,
)

model_lgbm = LGBMRegressor(**params)
  

In [59]:
# fit LightGBM on the full training set; result is a copy of X_test with an integer `demand` column
X_test_with_prediction_lgbm = model_fit_predict(model_lgbm, X_train, y_train, X_test)

## XGB

In [None]:
# hyperparameters found with optuna
# NOTE(review): `early_stopping_rounds` is set on the estimator; XGBoost's early
# stopping needs a validation set passed as `fit(..., eval_set=[...])` — confirm
# that the fit call for this model supplies one.
params = dict(
    max_depth=6,
    learning_rate=0.093,
    min_child_weight=92.68,
    subsample=0.7,
    n_estimators=10000,
)

model_xgb = XGBRegressor(
    objective='reg:squarederror',
    early_stopping_rounds=100,
    **params,
)

In [None]:
# `model_xgb` is configured with `early_stopping_rounds`, and XGBoost can only
# early-stop against a validation set supplied through `fit(eval_set=...)`.
# `model_fit_predict` calls `fit(X_train, y_train)` without one, so the model
# is fitted here directly with the last 10% of the training rows held out.
# NOTE(review): assumes the training rows are in chronological order, so the
# held-out tail is the most recent period — confirm.
xgb_valid_rows = max(1, len(X_train) // 10)

model_xgb.fit(
    X_train.iloc[:-xgb_valid_rows],
    y_train[:-xgb_valid_rows],
    eval_set=[(X_train.iloc[-xgb_valid_rows:], y_train[-xgb_valid_rows:])],
    verbose=False,
)

# same result shape as model_fit_predict: a copy of X_test plus an integer `demand` column
X_test_with_prediction_xgb = X_test.copy()
X_test_with_prediction_xgb.loc[:, 'demand'] = model_xgb.predict(X_test).astype(int)

## Linear Regression

In [None]:
# ordinary least-squares baseline with scikit-learn's default settings
model_linear = LinearRegression()

In [None]:
# fit the linear baseline; result is a copy of X_test with an integer `demand` column
X_test_with_prediction_linear = model_fit_predict(model_linear, X_train, y_train, X_test)

## Random Forest

In [None]:
# random_state is pinned so the forest's random sampling — and therefore the
# predictions — are identical on every Restart & Run All
model_RF = RandomForestRegressor(
    n_estimators=700,
    max_depth=6,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)

In [None]:
# fit the random forest; result is a copy of X_test with an integer `demand` column
X_test_with_prediction_RF =  model_fit_predict(model_RF,X_train, y_train, X_test)

# Submission

In [None]:
# index the sample-submission rows by full timestamp (date plus `hour` offset),
# the same way train and test were indexed

sample.set_index(
    pd.to_datetime(sample['date']) + pd.to_timedelta(sample['hour'], unit='h'),
    inplace=True,
)

In [63]:
# merge sample and prediction

# lightgbm: attach the predicted demand to the sample rows that share the same timestamp index
submission_lgbm = sample[['date', 'hour']].merge(
    X_test_with_prediction_lgbm[['demand']],
    right_index=True,
    left_index=True,
    how='inner',
)

# # xgb
# submission_xgb = sample[['date','hour']].merge(X_test_with_prediction_xgb[['demand']], \
#                                                right_index=True, left_index=True, how='inner')

# # linear
# submission_linear = sample[['date','hour']].merge(X_test_with_prediction_linear[['demand']], \
#                                                right_index=True, left_index=True, how='inner')

# # RF
# submission_RF = sample[['date','hour']].merge(X_test_with_prediction_RF[['demand']], \
#                                                   right_index=True, left_index=True, how='inner')

In [64]:
# write the LightGBM submission; the timestamp index is not written (the `date`
# and `hour` columns already carry it). `index` is documented as a bool, so
# pass False explicitly rather than relying on None being falsy.

submission_lgbm.to_csv('./fine_tuned_lgbm.csv', index=False)