# Data Challenge : Historical consumption regression for electricity supply pricing

### Importings

See the other notebook for data preprocessing details and visualisation

In [41]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import datetime
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
import calendar
import math
import holidays



plt.style.use('fivethirtyeight')
#Import training data
inputFilePath = "./input_training_ssnsrY0.csv"
data_train = pd.read_csv(inputFilePath, delimiter = ',', skiprows = 0, index_col=[0])

#Import testing data
inputFilePathTest = "./input_test_cdKcI0e.csv"
data_test = pd.read_csv(inputFilePathTest, delimiter = ',', skiprows = 0, index_col=[0])
id_test = data_test.index
# import targets
outputFilePath = "./output_training_Uf11I9I.csv"
targets = pd.read_csv(outputFilePath, delimiter = ',', skiprows = 0, index_col=[0])

#Remove useless datas
data_train = data_train.drop(["loc_1", "loc_2", "loc_secondary_1", "loc_secondary_2", "loc_secondary_3"], axis = 1)
data_test = data_test.drop(["loc_1", "loc_2", "loc_secondary_1", "loc_secondary_2", "loc_secondary_3"], axis = 1)

data_train.timestamp = pd.to_datetime(data_train.timestamp)
data_test.timestamp = pd.to_datetime(data_test.timestamp)

fr_holidays = holidays.France()
data_train['isHoliday'] = data_train.timestamp.apply(lambda x:1 if x in fr_holidays else 0)
data_test['isHoliday'] = data_test.timestamp.apply(lambda x:1 if x in fr_holidays else 0)


# indexing with timestamp
data_test = data_test.set_index('timestamp')
data_train = data_train.set_index('timestamp')

# time features
def timefeatures(df):
    df['hour'] = df.index.hour
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofmonth'] = df.index.day
    df['quarter'] = df.index.quarter
    df['weekofyear'] = df.index.weekofyear
    df['dayofweek'] = df.index.dayofweek

    return df

data_train = timefeatures(data_train)
data_test = timefeatures(data_test)

# isWeekend feature
data_train['isWeekend'] = data_train['dayofweek'].apply(lambda x: 1 if x in [5,6] else 0)
data_test['isWeekend'] = data_test['dayofweek'].apply(lambda x: 1 if x in [5,6] else 0)

# smoothing temp and humidity
data_train['temp_1_smooth7D'] = data_train['temp_1'].interpolate().rolling(24*7).mean().fillna(method='bfill').round(decimals=1)
data_train['temp_2_smooth7D'] = data_train['temp_2'].interpolate().rolling(24*7).mean().fillna(method='bfill').round(decimals=1)
data_test['temp_1_smooth7D'] = data_test['temp_1'].interpolate().rolling(24*7).mean().fillna(method='bfill').round(decimals=1)
data_test['temp_2_smooth7D'] = data_test['temp_2'].interpolate().rolling(24*7).mean().fillna(method='bfill').round(decimals=1)

data_train['humidity_1_smooth7D'] = data_train['humidity_1'].interpolate().rolling(24*7).mean().fillna(method='bfill').round()
data_train['humidity_2_smooth7D'] = data_train['humidity_2'].interpolate().rolling(24*7).mean().fillna(method='bfill').round()
data_test['humidity_1_smooth7D'] = data_test['humidity_1'].interpolate().rolling(24*7).mean().fillna(method='bfill').round()
data_test['humidity_2_smooth7D'] = data_test['humidity_2'].interpolate().rolling(24*7).mean().fillna(method='bfill').round()

# concatenate features and targets
data_train = pd.concat([targets.set_index(data_train.index), data_train],axis=1)

  df['weekofyear'] = df.index.weekofyear


In [42]:
# data preparation
features_lm_loc1 = ['hour','isHoliday','weekofyear', 'month','consumption_secondary_1', 'consumption_secondary_2',
       'consumption_secondary_3', 'temp_1_smooth7D','humidity_1_smooth7D']

features_lm_loc2 = features_lm_loc1

# training data
X_train1 = data_train[features_lm_loc1]
X_train2 = data_train[features_lm_loc2]
# training labels
y_train1 = data_train['consumption_1']
y_train2 = data_train['consumption_2']

# test data
X_test1 = data_test[features_lm_loc1]
X_test2 = data_test[features_lm_loc2]

# XGBoost Regressor

In [43]:
hyper_params_1={'n_estimators': 100,
 'min_child_weight': 7,
 'metric': 'l2',
 'max_depth': 8,
 'learning_rate': 0.05,
 'gamma': 0.2,
 'colsample_bytree': 0.4}
#Estimated from random search 

In [44]:
# Initialisation of the model with the optimal parameters
reg = xgb.XGBRegressor(**hyper_params_1)

In [45]:
## prediction for Lille (1)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)
y_pred1_df = pd.DataFrame(y_pred1, index=data_test.index, columns=['pred1']) # save data in a DataFrame format

Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [46]:
# Calculat the mean squared error between training and prediction data
y_train_pred1 = reg.predict(X_train1)
rmse = np.sqrt(mean_squared_error(y_train1, y_train_pred1)) ## mse for the training
print("RMSE: %f" % (rmse))

RMSE: 9.847181


In [47]:
## prediction for Aix (2)
reg.fit(X_train2, y_train2)
y_pred2 = reg.predict(X_test2)
y_pred2_df = pd.DataFrame(y_pred2, index=data_test.index, columns=['pred2'])

Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [48]:
y_train_pred2 = reg.predict(X_train2)
rmse = np.sqrt(mean_squared_error(y_train2, y_train_pred2)) ## mse for the training
print("RMSE: %f" % (rmse))

RMSE: 10.032691


In [49]:
## create the submission csv file
sub2 = pd.concat([y_pred1_df, y_pred2_df], axis=1).set_index(id_test)
sub2.set_index(data_test.index, inplace=True )
sub2.to_csv('submission_xgb.csv')
## resulting accuracy in the data challenge 19.45

## Parameter tunning for XGBoost

For localisation 1

In [50]:
## Hyper Parameter Optimization

params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "n_estimators"     : range(50,300,50),
 "max_depth"        : [ 2,3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ],
 'metric': ['l2', 'auc']}

# We have used random search instead of grid search to optimise calculating time 
random_search=RandomizedSearchCV(reg,
                                param_distributions=params,
                               n_iter=5,                           
                               n_jobs=-1, 
                               cv=5,
                               random_state=1)


In [51]:
random_search.fit(X_train1, y_train1)
random_search.best_params_

Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




{'n_estimators': 200,
 'min_child_weight': 1,
 'metric': 'l2',
 'max_depth': 10,
 'learning_rate': 0.1,
 'gamma': 0.0,
 'colsample_bytree': 0.7}

In [52]:
reg = xgb.XGBRegressor(**random_search.best_params_)

In [53]:
## prediction for Lille (1)
reg.fit(X_train1, y_train1)
y_pred1 = reg.predict(X_test1)
y_pred1_df = pd.DataFrame(y_pred1, index=data_test.index, columns=['pred1'])

Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [54]:
# the metric used to evaluate the model
y_train_pred1 = reg.predict(X_train1)
rmse = np.sqrt(mean_squared_error(y_train1, y_train_pred1)) ## mse of the training
print("RMSE: %f" % (rmse))

RMSE: 1.646539


For localisation 2

In [55]:
random_search.fit(X_train2, y_train2)
random_search.best_params_

Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




{'n_estimators': 200,
 'min_child_weight': 1,
 'metric': 'l2',
 'max_depth': 10,
 'learning_rate': 0.1,
 'gamma': 0.0,
 'colsample_bytree': 0.7}

In [56]:
reg = xgb.XGBRegressor(**random_search.best_params_)

In [57]:
## prediction for Aix (2)
reg.fit(X_train2, y_train2)
y_pred2 = reg.predict(X_test2)
y_pred2_df = pd.DataFrame(y_pred2, index=data_test.index, columns=['pred2'])

Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [58]:
# the metric used to evaluate the model
y_train_pred2 = reg.predict(X_train2)
rmse = np.sqrt(mean_squared_error(y_train2, y_train_pred2)) ## mse for the training
print("RMSE: %f" % (rmse))

RMSE: 1.679853


In [59]:
## create the submission csv file
sub3 = pd.concat([y_pred1_df, y_pred2_df], axis=1).set_index(id_test)
sub3.to_csv('submission_xgb_2.csv')