## Dataset preprocessing



In [1]:
import numpy as nb
import pandas as pd

from sklearn import preprocessing
from sklearn import decomposition
from sklearn.preprocessing import MinMaxScaler

from matplotlib import pyplot as plt

%matplotlib inline

import datetime

### Timestamp extraction

Because of the unusual events of the past year, we verified that some specific dates had significantly higher electricity prices. We therefore extract the weekday, the month, and the hour of day from the timestamp.



In [2]:
def extract_weekday(dataset):
    """Add a ``weekday`` column derived from the ``date`` column, in place.

    Every ``date`` value is rendered as text, split on ``-`` into year,
    month and day, and passed through ``datetime.date(...).weekday()``
    (Monday is 0, Sunday is 6).
    """
    weekdays = []
    for date_text in dataset['date'].astype(str):
        year, month, day = date_text.split('-')
        weekdays.append(datetime.date(int(year), int(month), int(day)).weekday())
    dataset['weekday'] = weekdays

In [3]:
def extract_month(dataset):
    """Add an integer ``month`` column derived from the ``date`` column, in place.

    Every ``date`` value is rendered as text and split on ``-`` into
    year, month and day. The month part is stored as an ``int`` (for
    example ``9`` rather than the zero-padded string ``'09'``), so the
    column is numeric instead of object-typed.

    Raises:
        ValueError: if a date does not split into exactly three parts or
            its month part is not an integer.
    """
    months = []
    for date_text in dataset['date'].astype(str):
        _, month, _ = date_text.split('-')
        months.append(int(month))
    dataset['month'] = months

In [4]:
def extract_datetime(dataset):
    """Parse the ``time`` column and add ``date`` and ``hour`` columns, in place.

    ``time`` is parsed with the format ``%Y-%m-%d %H:%M:%S`` and
    ``utc=True``. ``date`` then holds the calendar date and ``hour`` the
    hour of day of each parsed timestamp.
    """
    # Plain column assignment replaces the column, so it takes the datetime
    # dtype. ``dataset.loc[:, 'time'] = ...`` writes into the existing column
    # under pandas >= 2.0, which leaves an object-dtype column and makes the
    # ``.dt`` accessor below raise.
    dataset['time'] = pd.to_datetime(dataset['time'], format="%Y-%m-%d %H:%M:%S", utc=True)
    dataset['date'] = dataset['time'].dt.date
    dataset['hour'] = dataset['time'].dt.hour

In [5]:
def one_hot_encode(dataset, columns, encoder = None) -> "preprocessing.OneHotEncoder":
    """One-hot encode ``columns`` of ``dataset`` in place and return the encoder.

    Args:
        dataset: DataFrame to modify. The encoded columns are appended and
            the original ``columns`` are dropped.
        columns: list of column names to encode.
        encoder: an already fitted encoder to reuse (e.g. the one fitted on
            the training data). When ``None`` a new ``OneHotEncoder`` is
            fitted on ``dataset[columns]``.

    Returns:
        The encoder that was used, so it can be reused on another dataset.

    Each new column is named ``<source column><category>``.
    """
    # Compare against None explicitly: an estimator may define __len__ or
    # __bool__, and a falsy encoder must not be silently replaced.
    if encoder is None:
        try:
            # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``.
            encoder = preprocessing.OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older releases only accept the original ``sparse`` keyword.
            encoder = preprocessing.OneHotEncoder(sparse=False)
        transformed = encoder.fit_transform(dataset[columns])
    else:
        transformed = encoder.transform(dataset[columns])

    # A caller-supplied encoder may still be configured for sparse output.
    if hasattr(transformed, 'toarray'):
        transformed = transformed.toarray()

    new_columns = [
        column + str(category)
        for column, categories in zip(encoder.feature_names_in_, encoder.categories_)
        for category in categories
    ]

    # Reuse the dataset's index so the assignment aligns row by row.
    encoder_df = pd.DataFrame(transformed, index=dataset.index, columns=new_columns)
    dataset[new_columns] = encoder_df
    dataset.drop(columns=columns, inplace=True)
    return encoder

In [6]:
def extract_features(dataset):
    """Run the timestamp feature extractors on ``dataset`` in place.

    The order matters: ``extract_datetime`` creates the ``date`` column
    that ``extract_month`` and ``extract_weekday`` read.
    """
    for add_feature in (extract_datetime, extract_month, extract_weekday):
        add_feature(dataset)


### Feature dropping

Estonia receives approximately 500–800 mm of rain per year on average. Our dataset records only about 140 mm of rain, which cannot be correct, so we drop the precipitation feature (`prcp`). The recorded amount of snow is implausible for the same reason, so `snow` is dropped as well.


In [7]:
def drop_features(dataset):
    """Remove the ``snow``, ``prcp``, ``time`` and ``date`` columns in place."""
    unwanted_columns = ['snow', 'prcp', 'time', 'date']
    dataset.drop(labels=unwanted_columns, axis=1, inplace=True)


In [8]:
def drop_rows(dataset, max_el_price=1):
    """Drop incomplete rows and electricity-price outliers from ``dataset`` in place.

    Args:
        dataset: DataFrame with an ``el_price`` column.
        max_el_price: rows whose ``el_price`` is strictly greater than this
            value are treated as outliers and removed. Defaults to 1.

    Prints how many rows were removed because of NaN values, if any.
    """
    # Deal with NaN values
    initial_len = len(dataset)
    dataset.dropna(inplace=True)
    dropped = initial_len - len(dataset)
    if dropped:
        print(f'Dropped {dropped} row')

    # Deal with outliers
    is_outlier = dataset['el_price'] > max_el_price
    dataset.drop(dataset.loc[is_outlier].index, inplace=True)

In [9]:
def normalize(dataset, scaler = None) -> tuple:
    """Scale ``dataset`` and return ``(scaled_data, scaler)``.

    Args:
        dataset: the feature matrix to scale.
        scaler: an already fitted scaler to reuse (e.g. the one fitted on
            the training data). When ``None`` a new ``MinMaxScaler`` is
            fitted on ``dataset``.

    Returns:
        A 2-tuple of the scaler's output for ``dataset`` and the scaler
        that produced it. The first element is whatever the scaler's
        ``transform``/``fit_transform`` returns, not necessarily a DataFrame.
    """
    # Compare against None explicitly so a supplied scaler is always reused,
    # even if the object happens to be falsy.
    if scaler is None:
        scaler = preprocessing.MinMaxScaler()
        return (scaler.fit_transform(dataset), scaler)
    return (scaler.transform(dataset), scaler)

In [10]:
def reduce_dimensions(dataset, pca = None, n_components = 0.9) -> tuple:
    """Project ``dataset`` with PCA and return ``(reduced_data, pca)``.

    Args:
        dataset: the feature matrix to reduce.
        pca: an already fitted PCA to reuse (e.g. the one fitted on the
            training data). When ``None`` a new ``PCA`` is fitted on
            ``dataset``.
        n_components: passed to ``PCA`` when a new one is fitted; ignored
            when ``pca`` is supplied. Defaults to 0.9.

    Returns:
        A 2-tuple of the PCA output for ``dataset`` and the PCA object that
        produced it.
    """
    # Compare against None explicitly so a supplied PCA is always reused,
    # even if the object happens to be falsy.
    if pca is None:
        pca = decomposition.PCA(n_components=n_components)
        return (pca.fit_transform(dataset), pca)
    return (pca.transform(dataset), pca)

In [11]:
def preprocess(dataset, encoder=None) -> preprocessing.OneHotEncoder:
    """Run the full in-place preprocessing pipeline and return the encoder.

    Steps, in order: timestamp feature extraction, dropping of unused
    columns, one-hot encoding of ``coco`` and ``weekday``, and row
    filtering. Pass the encoder returned for the training data to reuse
    its categories on another dataset.
    """
    categorical_columns = ['coco', 'weekday']
    extract_features(dataset)
    drop_features(dataset)
    fitted_encoder = one_hot_encode(dataset, categorical_columns, encoder)
    drop_rows(dataset)
    return fitted_encoder


### Import dataset

Here we import the dataset, do the initial processing, and split it into training and validation sets.

In [14]:
def read_dataset(file_name) -> pd.DataFrame:
    """Load the CSV at ``file_name`` into a DataFrame with pandas' default parsing."""
    frame = pd.read_csv(file_name)
    return frame

In [15]:
def extract_labels(dataset) -> (pd.DataFrame, pd.Series):
    """Split ``dataset`` into features and the ``consumption`` target.

    Returns a ``(features, target)`` tuple: ``features`` holds every
    column except ``consumption``; ``target`` is the ``consumption`` column.
    """
    target = dataset['consumption']
    is_target_column = dataset.columns.isin(['consumption'])
    features = dataset.loc[:, ~is_target_column]
    return (features, target)

In [24]:
# Load the training data and preprocess it in place; the encoder fitted here
# is reused for the test set later on.
train_df = read_dataset('train.csv')
encoder = preprocess(train_df)


# Separate the `consumption` target from the feature columns.
X_train, y_train = extract_labels(train_df)

# Fit the scaler and the PCA on the training features.
# NOTE(review): both are fitted before the train/validation split further
# down, so the validation rows influence them — confirm this is acceptable.
X_train_norm, scaler = normalize(X_train)
X_train_reduced, pca = reduce_dimensions(X_train_norm)


Dropped 2 row


In [25]:
# Sanity check: shape of the scaled training features, before PCA.
X_train_norm.shape

(8588, 41)

In [26]:
# Apply the same preprocessing to the test set, reusing the encoder, scaler
# and PCA fitted on the training data.
X_test = read_dataset('test.csv')
# NOTE(review): preprocess() also runs drop_rows(), which removes rows with
# NaN values or el_price > 1 — confirm no test rows are dropped, since the
# predictions are later paired with the timestamps of the raw test file.
preprocess(X_test, encoder)

X_test_norm, _ = normalize(X_test, scaler)
print(X_test_norm.shape)
X_test_reduced, _ = reduce_dimensions(X_test_norm, pca)

(168, 41)


In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation. The fixed seed makes the
# split identical on every run, so the validation score no longer varies with
# the shuffle.
# NOTE: y_train is rebound to the training part of the split here, so this
# cell can only be re-run after re-running the cell that extracts the labels.
X_train, X_val, y_train, y_val = train_test_split(X_train_reduced, y_train, test_size=0.2, random_state=42)

In [28]:
# Sanity check: shape of the PCA-reduced training split.
X_train.shape

(6870, 16)

In [29]:
import itertools

# Numpy
import numpy as np

# XGBoost
import xgboost as xgb

# Pandas
import pandas as pd

# Sklearn
import sklearn.preprocessing
import sklearn.utils
from sklearn.metrics import mean_squared_error

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
#reg = xgb.XGBRegressor(tree_method="gpu_hist")
# Fit the model using predictor X and response y.

In [31]:
# XGBoost wants data to be wrapped in its own DMatrix format
#dtrain = xgb.DMatrix(X_train,label=y_train)
#dval = xgb.DMatrix(X_val,label=y_val)
#dtest = xgb.DMatrix(X_test_reduced)

In [35]:
# Gradient-boosted tree regressor with library-default hyperparameters;
# verbosity=0 silences XGBoost's log output.
xgbr = xgb.XGBRegressor(verbosity=0)

In [36]:
# Train on the PCA-reduced training split produced by train_test_split.
xgbr.fit(X_train, y_train)

XGBRegressor(verbosity=0)

In [37]:
# Score the fitted model on the held-out validation split using mean squared error.
yvalpred = xgbr.predict(X_val)
mse = mean_squared_error(y_val, yvalpred)
print("MSE: %.2f" % mse)

MSE: 0.98


In [39]:
# Predict consumption for the scaled and PCA-reduced test features.
prediction = xgbr.predict(X_test_reduced)

In [40]:
# Display the raw test-set predictions for a quick visual check.
prediction

array([0.62187433, 0.75440955, 0.62187433, 0.66751933, 0.6425308 ,
       0.71292907, 0.6462215 , 0.52454984, 0.4854585 , 0.7375344 ,
       0.65172505, 0.46956855, 0.46956855, 0.6344147 , 0.78201   ,
       0.93963426, 0.6133261 , 0.812935  , 0.91216516, 0.75446105,
       0.81961656, 0.9321583 , 0.7963561 , 0.7827731 , 0.7443634 ,
       0.6932913 , 0.6932913 , 0.65665776, 0.6044201 , 0.6044201 ,
       0.5616169 , 0.63849294, 0.6188985 , 0.58194673, 0.69731253,
       0.6666155 , 0.598039  , 0.72141546, 0.54217297, 0.8621993 ,
       0.54217297, 0.54217297, 0.6915774 , 0.7110124 , 0.6073897 ,
       0.63762325, 0.6825634 , 0.7646626 , 0.67939097, 0.5506639 ,
       0.5369478 , 0.8143346 , 0.65855944, 0.65855944, 0.6716348 ,
       0.65543866, 0.6602229 , 0.764709  , 0.6200744 , 0.97568333,
       0.87496567, 0.87496567, 0.9688145 , 1.0166875 , 0.80761695,
       0.77413523, 0.9235763 , 1.1156735 , 1.1156735 , 1.0596904 ,
       0.98011816, 0.64114666, 0.5733334 , 0.5685491 , 0.66234

In [41]:
# Reload the raw test file: preprocess() dropped the `time` column from the
# earlier X_test in place, and the submission needs it. This rebinds X_test
# to the unprocessed frame.
X_test = read_dataset('test.csv')

In [43]:
# Pair each prediction with the timestamp from the reloaded test file and
# write the submission CSV without the index column.
# NOTE(review): this assumes `prediction` has one entry per row of X_test, in
# the same order — confirm preprocessing dropped no test rows.
predictions_dict = {'time':X_test.time,'consumption':prediction}
pred_df = pd.DataFrame(predictions_dict)
pred_df.to_csv('submission_Xgboost_v1.csv',index=False)