<a href="https://colab.research.google.com/github/KotkaZ/journey-to-zero/blob/master/lstm-test1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset preprocessing



In [None]:
import numpy as nb
import pandas as pd

from sklearn import preprocessing
from sklearn import decomposition
from sklearn.preprocessing import MinMaxScaler

from matplotlib import pyplot as plt

%matplotlib inline

import datetime

### Timestamp extraction

Because unusual events occurred during the past year, we verified that some specific dates had significantly higher electricity prices. Therefore, we extract the weekday, month, and time of day from the timestamp.



In [None]:
def extract_weekday(dataset):
    """Add a 'weekday' column and reduce 'date' to its first dash-separated field.

    Each 'date' value is stringified and split on '-' into exactly three
    parts (year, month, day). 'weekday' receives ``date.weekday()`` of those
    parts (Monday is 0), and 'date' is overwritten with the first part.
    The frame is modified in place; nothing is returned.
    """
    date_parts = dataset['date'].astype(str).str.split('-')

    weekdays = []
    leading_fields = []
    for year, month, day in date_parts:
        weekdays.append(datetime.date(int(year), int(month), int(day)).weekday())
        leading_fields.append(year)

    dataset['weekday'] = weekdays
    dataset['date'] = leading_fields

In [None]:
def extract_month(dataset):
    """Add a 'month' column holding the middle field of each 'date' value.

    'date' values are stringified and split on '-'; every value must yield
    exactly three parts. The month is stored as the raw string piece
    (e.g. '08'), not as an integer. The frame is modified in place.
    """
    date_parts = dataset['date'].astype(str).str.split('-')

    months = []
    for _year, month, _day in date_parts:
        months.append(month)

    dataset['month'] = months

In [None]:
def extract_datetime(dataset):
    """Parse the 'time' column and derive 'date' and 'hour' columns in place.

    'time' is parsed with the format ``%Y-%m-%d %H:%M:%S`` and converted to
    UTC. 'date' receives the calendar date of each timestamp and 'hour' the
    hour of day. Nothing is returned.
    """
    # Replace the column outright instead of writing through .loc[:, 'time']:
    # on pandas >= 2.0 the .loc form sets values in place and can keep the
    # column's original (non-datetime) dtype, which makes the .dt accessor
    # below fail.
    dataset['time'] = pd.to_datetime(dataset['time'], format="%Y-%m-%d %H:%M:%S", utc=True)
    dataset['date'] = dataset['time'].dt.date
    dataset['hour'] = dataset['time'].dt.hour

In [None]:
def one_hot_encode(dataset, columns, encoder = None) -> preprocessing.OneHotEncoder:
    """One-hot encode ``columns`` of ``dataset`` in place and return the encoder.

    Args:
        dataset: DataFrame to modify. The encoded indicator columns are added
            and the source ``columns`` are dropped.
        columns: list of column names to encode.
        encoder: an already fitted encoder to reuse (e.g. for the test set).
            When omitted, a new dense-output encoder is fitted on ``dataset``.

    Returns:
        The encoder that was used, so callers can pass it to later calls.

    New column names are ``<source column><category>``, e.g. ``coco2.0``.
    """
    if encoder is not None:
        transformed = encoder.transform(dataset[columns])
    else:
        # The dense-output switch is `sparse_output` in newer scikit-learn
        # releases and `sparse` in older ones; try the new name first.
        try:
            encoder = preprocessing.OneHotEncoder(sparse_output=False)
        except TypeError:
            encoder = preprocessing.OneHotEncoder(sparse=False)
        transformed = encoder.fit_transform(dataset[columns])

    new_columns = [
        column + str(category)
        for column, categories in zip(encoder.feature_names_in_, encoder.categories_)
        for category in categories
    ]

    # Reuse the dataset's index so the encoded rows align with existing rows.
    encoder_df = pd.DataFrame(transformed, index=dataset.index)
    dataset[new_columns] = encoder_df
    dataset.drop(columns=columns, inplace=True)
    return encoder

In [None]:
def extract_features(dataset):
    """Run the timestamp feature extractors on ``dataset`` in place.

    Order matters: the datetime step creates the 'date' column, the month
    step reads it, and the weekday step finally overwrites it.
    """
    for extractor in (extract_datetime, extract_month, extract_weekday):
        extractor(dataset)


### Feature dropping

In Estonia, there are approximately 500–800 millimeters of rain on average. Our dataset contained only about 140 mm of rain, which is clearly not correct. The amount of snow was implausible for the same reason, so both features are dropped.


In [None]:
def drop_features(dataset):
    """Remove the 'snow', 'prcp' and 'time' columns from ``dataset`` in place.

    Raises KeyError (from pandas) if any of the three columns is missing.
    """
    unwanted = ['snow', 'prcp', 'time']
    dataset.drop(unwanted, axis=1, inplace=True)


In [None]:
def drop_rows(dataset):
    """Drop incomplete rows and price outliers from ``dataset`` in place.

    Steps:
      1. Show the first rows of the frame (for inspection).
      2. Remove every row containing a NaN and report how many were removed.
      3. Remove rows whose 'el_price' is greater than 1.

    Nothing is returned. Raises KeyError if 'el_price' is missing.
    """
    # `display` is only defined inside IPython/Jupyter; fall back to print so
    # the function also works when run as a plain script.
    try:
        show = display
    except NameError:
        show = print
    show(dataset.head())

    # Deal with NaN values
    initial_len = len(dataset)
    dataset.dropna(inplace=True)
    dropped = initial_len - len(dataset)
    if dropped:
        print(f'Dropped {dropped} row')

    # Deal with outliers
    outlier_index = dataset.index[dataset['el_price'] > 1]
    dataset.drop(outlier_index, inplace=True)

In [None]:
def normalize(dataset, scaler = None) -> (pd.DataFrame, preprocessing.MinMaxScaler):
    """Scale ``dataset`` with a MinMaxScaler and return ``(scaled, scaler)``.

    When ``scaler`` is supplied it is only applied (``transform``); otherwise
    a fresh MinMaxScaler is fitted on ``dataset`` first. The scaled data is
    returned exactly as the scaler produces it.
    """
    if not scaler:
        scaler = preprocessing.MinMaxScaler()
        scaled = scaler.fit_transform(dataset)
    else:
        scaled = scaler.transform(dataset)
    return (scaled, scaler)

In [None]:
def reduce_dimensions(dataset, pca = None) -> (pd.DataFrame, decomposition.PCA):
    """Project ``dataset`` with PCA and return ``(reduced, pca)``.

    When ``pca`` is supplied it is only applied (``transform``); otherwise a
    new PCA configured with ``n_components=0.9`` is fitted on ``dataset``.
    The reduced data is returned exactly as the PCA object produces it.
    """
    if not pca:
        pca = decomposition.PCA(n_components=0.9)
        reduced = pca.fit_transform(dataset)
    else:
        reduced = pca.transform(dataset)
    return (reduced, pca)

In [None]:
def make3d(dataset):
    """Split the time-derived columns off ``dataset``.

    Returns a frame with the 'date', 'hour', 'month' and 'weekday' columns
    and removes those same columns from ``dataset`` in place.
    """
    time_columns = ['date', 'hour', 'month', 'weekday']
    timeseries = dataset[time_columns]
    dataset.drop(columns=time_columns, inplace=True)
    return timeseries


In [None]:
def preprocess(dataset, encoder=None):
    """Run the full in-place preprocessing pipeline on ``dataset``.

    Extracts timestamp features, drops unused columns and unwanted rows,
    separates the time-derived columns, and one-hot encodes 'coco'.

    Args:
        dataset: raw DataFrame; it is modified in place.
        encoder: optional fitted one-hot encoder to reuse.

    Returns:
        ``(encoder, timeseries)``: the encoder that was used and the frame of
        time-derived columns removed from ``dataset``.
    """
    for step in (extract_features, drop_features, drop_rows):
        step(dataset)

    timeseries = make3d(dataset)
    fitted_encoder = one_hot_encode(dataset, ['coco'], encoder)
    return fitted_encoder, timeseries


### Import dataset

Here we import the dataset, do initial processing, and split it into training and validation sets.

In [None]:
def read_dataset(file_name) -> pd.DataFrame:
    """Load a CSV into a DataFrame using pandas' default parsing options."""
    frame = pd.read_csv(file_name)
    return frame

In [None]:
def extract_labels(dataset) -> (pd.DataFrame, pd.Series):
    """Split ``dataset`` into features and the 'consumption' target.

    Returns ``(features, labels)`` where ``features`` holds every column
    except 'consumption' and ``labels`` is the 'consumption' column.
    """
    label_column = 'consumption'
    is_feature = ~dataset.columns.isin([label_column])
    features = dataset.loc[:, is_feature]
    labels = dataset[label_column]
    return (features, labels)

In [None]:
# Load the training data and run the shared in-place preprocessing.
train_df = read_dataset('train.csv')
encoder, timeseries = preprocess(train_df)

X_train, y_train = extract_labels(train_df)

# Fit one scaler per input group on the training data; the fitted objects
# are kept so the same scaling can be applied to other data later.
X_train_norm, scaler_data = normalize(X_train)
timeseries_norm, scaler_time = normalize(timeseries)

X_train_reduced, pca = reduce_dimensions(X_train_norm)
# A dead `np.ndarray(...)` pre-allocation used to sit here: its result was
# overwritten by the next statement, and it referenced `np` although numpy is
# imported as `nb` at the top of the notebook.
X_train = [X_train_reduced, timeseries_norm]


Unnamed: 0,temp,dwpt,rhum,wdir,wspd,wpgt,pres,coco,el_price,consumption,date,hour,month,weekday
0,11.2,10.3,94.0,320.0,7.2,16.7,1012.6,2.0,0.09016,0.577,2021,21,8,1
1,10.7,9.6,93.0,320.0,7.2,13.0,1012.6,2.0,0.09251,0.594,2021,22,8,1
2,9.9,9.0,94.0,320.0,7.2,13.0,1012.2,2.0,0.0889,0.685,2021,23,8,1
3,10.0,8.4,90.0,330.0,7.2,13.0,1011.9,1.0,0.08735,1.016,2021,0,9,2
4,9.0,8.1,94.0,300.0,3.6,13.0,1011.4,2.0,0.08688,0.677,2021,1,9,2


Dropped 198 row


In [None]:
# X_train is the two-element list [X_train_reduced, timeseries_norm] at this
# point, so this is the list length, not the number of samples.
len(X_train)

2

In [None]:
# Shape of the scaled time-feature array returned by normalize(timeseries).
timeseries_norm.shape

(8392, 4)

In [None]:
# Apply the training-time preprocessing to the test set, reusing the fitted
# encoder, scalers and PCA so both sets share the same feature space.
X_test = read_dataset('test.csv')
# NOTE(review): preprocess() also drops NaN rows and rows with el_price > 1
# from the test frame; confirm no test rows are removed, since predictions
# are later paired with the full 'time' column of a freshly read test.csv.
_, timeseries_test = preprocess(X_test, encoder)

X_test_norm, _ = normalize(X_test, scaler_data)
# normalize() returns (scaled, scaler); unpack it like the other call sites
# so this name holds the scaled array rather than the whole tuple.
timeseries_test_norm, _ = normalize(timeseries_test, scaler_time)
print(X_test_norm.shape)
X_test_reduced, _ = reduce_dimensions(X_test_norm, pca)

Unnamed: 0,temp,dwpt,rhum,wdir,wspd,wpgt,pres,coco,el_price,date,hour,month,weekday
0,21.1,15.2,69.0,340.0,9.0,9.3,1022.0,2.0,0.25533,2022,21,8,2
1,20.1,15.1,73.0,30.0,6.0,14.8,1022.0,2.0,0.19492,2022,22,8,2
2,20.1,15.1,73.0,320.0,7.0,13.0,1022.0,2.0,0.18853,2022,23,8,2
3,18.7,17.0,90.0,0.0,4.0,11.1,1022.4,4.0,0.19947,2022,0,8,3
4,18.1,17.1,94.0,280.0,7.0,11.1,1022.0,3.0,0.21192,2022,1,8,3


(168, 31)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the PCA-reduced training features (and matching targets)
# for validation. X_train is rebound here from the earlier two-element list
# to the training part of the split.
# NOTE(review): no random_state is passed, so the split is presumably not
# reproducible between runs — confirm whether a fixed seed is wanted.
X_train, X_val, y_train, y_val = train_test_split(X_train_reduced, y_train, test_size=0.2)

In [None]:
# Inspect the first training sample after the split (one row of PCA output).
X_train[0]

array([ 0.91217738,  0.29231251,  0.26357358,  0.32136907,  0.17356219,
        0.57092074, -0.40074392, -0.10835966,  0.12000969, -0.12569246,
        0.18129354, -0.09592695,  0.07441223,  0.05265869, -0.53121764,
       -0.00556582])

In [None]:
import itertools

# Numpy
import numpy as np

# Keras
from keras.layers import Dense, Input, BatchNormalization
from keras.models import Sequential
from tensorflow.keras.optimizers import SGD

# Pandas
import pandas as pd

# Sklearn
import sklearn.preprocessing
import sklearn.utils

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Number of columns (PCA components) in the training split.
X_train.shape[1]

16

In [None]:
# X_train is 2-D (rows, columns) here, so there is no third axis and this
# lookup raises IndexError.
X_train.shape[2]

IndexError: ignored

In [None]:
from tensorflow.keras.layers import Input, Conv2D, Activation, Flatten, Dense, MaxPooling2D, BatchNormalization, Dropout, LSTM, Reshape
# Take the model class from the same namespace as the layers.
from tensorflow.keras.models import Sequential

# Width of one sample, taken from the training split instead of hard-coding it.
n_features = X_train.shape[1]

mdl = Sequential()

# The training data is 2-D (samples, features), but an LSTM needs 3-D input
# (samples, timesteps, features). The previous input_shape=(16, 0) declared
# zero features per step and made fit() fail. Accept the 2-D rows as they are
# and present each one to the LSTM as a single timestep of n_features values.
mdl.add(Input(shape=(n_features,)))
mdl.add(Reshape((1, n_features)))
mdl.add(LSTM(512, activation="linear"))
mdl.add(BatchNormalization())
mdl.add(Dense(256, activation="linear"))
mdl.add(BatchNormalization())
# Single linear output unit for the regression target.
mdl.add(Dense(1, activation="linear"))

mdl.compile(loss='mae', optimizer="adam")




In [None]:
# Print the shapes of the train/validation splits and the reduced test
# features, one per line, as a sanity check before fitting.
for array in (X_train, y_train, X_val, y_val, X_test_reduced):
    print(array.shape)

(6870, 16)
(6870,)
(1718, 16)
(1718,)
(168, 16)


In [None]:
# Train for 10 epochs in batches of 32, evaluating on the validation split
# after every epoch; the returned history is used for plotting.
history = mdl.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=True,
)

Epoch 1/10




ValueError: ignored

In [None]:
# Plot training and validation loss per epoch from the fit history.
plt.figure(figsize=(16, 6))

# Left panel of a 1x2 grid. The right panel stays empty: an accuracy plot was
# sketched for it earlier but is disabled.
plt.subplot(1, 2, 1)
for series_name in ('loss', 'val_loss'):
    plt.plot(history.history[series_name])
plt.legend(['Training', 'Validation'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss')

In [None]:
# Run the trained model on the PCA-reduced test features.
prediction = mdl.predict(X_test_reduced)

In [None]:
# Display the raw model output for inspection.
prediction

In [None]:
# Re-read the raw test file: preprocessing removed the 'time' column from the
# earlier X_test in place, and the submission below needs it.
X_test = read_dataset('test.csv')

In [None]:
# Build the submission: one row per test timestamp with the first (only)
# output value of each prediction, written without the index column.
consumption_values = [row[0] for row in prediction]
predictions_dict = {
    'time': X_test.time,
    'consumption': consumption_values,
}
pred_df = pd.DataFrame(predictions_dict)
pred_df.to_csv('submission_291022_v1.csv', index=False)