In [27]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [28]:
# Read the taxi series from disk; the `timestamp` column is parsed and used as the index.
url = '../data/default_nyc_taxi.csv'
data = pd.read_csv(url, index_col='timestamp', parse_dates=['timestamp'])

# Keep only the `value` column (as a one-column DataFrame, not a Series).
data = data.loc[:, ["value"]]
data.head()

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2014-07-01 00:00:00,10844
2014-07-01 00:30:00,8127
2014-07-01 01:00:00,6210
2014-07-01 01:30:00,4656
2014-07-01 02:00:00,3820


In [29]:
# Number of samples per day: the index is spaced 30 minutes apart, so 48 rows span 24 hours.
period = 48
# Show the timestamps of the first full day.
print(data.index[:period])

DatetimeIndex(['2014-07-01 00:00:00', '2014-07-01 00:30:00',
               '2014-07-01 01:00:00', '2014-07-01 01:30:00',
               '2014-07-01 02:00:00', '2014-07-01 02:30:00',
               '2014-07-01 03:00:00', '2014-07-01 03:30:00',
               '2014-07-01 04:00:00', '2014-07-01 04:30:00',
               '2014-07-01 05:00:00', '2014-07-01 05:30:00',
               '2014-07-01 06:00:00', '2014-07-01 06:30:00',
               '2014-07-01 07:00:00', '2014-07-01 07:30:00',
               '2014-07-01 08:00:00', '2014-07-01 08:30:00',
               '2014-07-01 09:00:00', '2014-07-01 09:30:00',
               '2014-07-01 10:00:00', '2014-07-01 10:30:00',
               '2014-07-01 11:00:00', '2014-07-01 11:30:00',
               '2014-07-01 12:00:00', '2014-07-01 12:30:00',
               '2014-07-01 13:00:00', '2014-07-01 13:30:00',
               '2014-07-01 14:00:00', '2014-07-01 14:30:00',
               '2014-07-01 15:00:00', '2014-07-01 15:30:00',
               '2014-07-

In [30]:
# Earliest and latest timestamp present in the index.
print(f'From  {data.index.min()}  to  {data.index.max()}')

From  2014-07-01 00:00:00  to  2015-01-31 23:30:00


In [31]:
# Row count, samples per day, and the number of whole days those rows cover.
print(f'Data size: {len(data):d} \nNumber of data per day: {period:d} \nNumber of days: {int(len(data) / period):d}')

Data size: 10320 
Number of data per day: 48 
Number of days: 215


In [32]:
# Total number of null cells across the whole frame.
print('Missing value: ', data.isna().sum().sum())

Missing value:  0


In [33]:
# Chronological split: the first `ratio` share of the rows becomes the training
# set and the remaining rows become the test set (no shuffling).
ratio = 0.55
train_size = int(data.shape[0] * ratio)

train, test = data.iloc[:train_size], data.iloc[train_size:]
dates_train, dates_test = train.index, test.index

print('Train size: ', ratio)

print(f'\n\nTRAIN SET:  from  {dates_train.min()}  to  {dates_train.max()}')
print('Data size: ', train.shape[0])
print('Number of days: ', train.shape[0] // period)


print(f'\n\nTEST SET:  from  {dates_test.min()}  to  {dates_test.max()}')
print('Data size: ', test.shape[0])
print('Number of days: ', test.shape[0] // period)

Train size:  0.55


TRAIN SET:  from  2014-07-01 00:00:00  to  2014-10-27 05:30:00
Data size:  5676
Number of days:  118


TEST SET:  from  2014-10-27 06:00:00  to  2015-01-31 23:30:00
Data size:  4644
Number of days:  96


In [34]:
# Standardise with statistics learned from the training rows only, so nothing
# about the test period influences the scaling.
#
# The raw arrays are rebuilt from `data` and `train_size` instead of reading
# the current `train` / `test`: this cell rebinds those two names to NumPy
# arrays, so reading `train.values` on a second run would raise AttributeError.
train_values = data.iloc[:train_size].to_numpy(dtype=float)
test_values = data.iloc[train_size:].to_numpy(dtype=float)

scaler = StandardScaler().fit(train_values)

# From here on `train` and `test` are scaled arrays of shape (n_rows, 1).
train = scaler.transform(train_values)
test = scaler.transform(test_values)

In [35]:
def split_into_windows(X, length):
    """Return every contiguous window of `length` rows of `X`, advancing one row at a time.

    Parameters
    ----------
    X : array-like
        Data indexed along its first axis, e.g. shape (n_samples, n_features).
    length : int
        Number of consecutive rows in each window; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_windows, length, *X.shape[1:]) with
        n_windows = max(len(X) - length + 1, 0). When `X` is shorter than
        `length` the result is empty but keeps its window dimensions, so
        callers can still unpack its shape.

    Raises
    ------
    ValueError
        If `length` is smaller than 1.
    """
    if length < 1:
        raise ValueError(f"length must be >= 1, got {length}")

    X = np.asarray(X)
    n_windows = max(len(X) - length + 1, 0)

    # Row i holds the positions i, i + 1, ..., i + length - 1. Indexing X with
    # this 2-D integer array gathers all windows in one vectorised step.
    positions = np.arange(n_windows)[:, None] + np.arange(length)
    return X[positions]

In [36]:
# Each window holds 48 consecutive samples.
window_length = 48

# Overlapping windows (one-row step) for both splits.
X_train, X_test = (split_into_windows(part, window_length) for part in (train, test))

# Steps per window and features per step; the model's input layer is sized from these.
input_length, input_dim = X_train.shape[1:]

In [37]:
# Sanity check: expected layout is (n_windows, window_length, n_features).
X_train.shape

(5629, 48, 1)

In [38]:
# Shape of a single window: (window_length, n_features).
X_train[0].shape

(48, 1)

In [39]:
# params
SEED = 42        # arbitrary value, fixed so repeated runs start from the same random state
dim_hidden1 = 24 # 1/2 the length of one window
dim_hidden2 = 12 # 1/4 the length of one window

# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation and dropout are repeatable between runs (as far as the
# backend allows).
np.random.seed(SEED)
tf.random.set_seed(SEED)


# model
# Reconstruction network: the hidden width narrows (24 -> 12) and widens again
# (12 -> 24). Every LSTM returns the full sequence, so the final Dense(1)
# produces one reconstructed value per time step.
model = Sequential()
model.add(LSTM(units=dim_hidden1, activation='relu', input_shape=(input_length, input_dim), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=dim_hidden2, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=dim_hidden2, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=dim_hidden1, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(1))

# compile, then print the layer summary
model.compile(loss='mse', optimizer='adam')
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 48, 24)            2496      
                                                                 
 dropout_4 (Dropout)         (None, 48, 24)            0         
                                                                 
 lstm_5 (LSTM)               (None, 48, 12)            1776      
                                                                 
 dropout_5 (Dropout)         (None, 48, 12)            0         
                                                                 
 lstm_6 (LSTM)               (None, 48, 12)            1200      
                                                                 
 dropout_6 (Dropout)         (None, 48, 12)            0         
                                                                 
 lstm_7 (LSTM)               (None, 48, 24)           

In [40]:
# params
batch_size = 48 # 24 hours == length of the window
epochs = 50
validation_split = 0.1
# Stop once val_loss has not improved for 5 epochs and roll the model back to
# the best epoch; without restore_best_weights the model would keep the weights
# from the last (worse) epoch that triggered the stop.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


# fitting
# Autoencoder training: the input windows are also the targets.
history = model.fit(X_train, X_train,
                batch_size = batch_size,
                epochs = epochs,
                validation_split = validation_split,
                callbacks = [es])

Epoch 1/50


KeyboardInterrupt: 

In [None]:
# Reconstruct both window sets with the fitted model; the difference between a
# window and its reconstruction is what the following cells score.
pred_x_train = model.predict(X_train)
pred_x_test = model.predict(X_test)



In [None]:
def mse(x, x_pred):
    """Mean squared difference between `x` and `x_pred`, averaged over axis 1."""
    return np.mean(np.square(x - x_pred), axis=1)


# NOTE: despite the `_mae_` in these names, the stored values are mean *squared* errors.
train_mae_loss = pd.DataFrame(mse(X_train, pred_x_train), columns=['Error'])
test_mae_loss = pd.DataFrame(mse(X_test, pred_x_test), columns=['Error'])

### Selection Threshold

threshold = mean of the test reconstruction loss + 2 standard deviations of that loss

In [None]:
# One error value per test window, as a flat 1-D NumPy array.
test_mae_loss_array = test_mae_loss.to_numpy().flatten()

In [None]:
# Anomaly cut-off: mean + 2 population standard deviations (ddof=0) of the
# per-window test error.
# The statistics are taken on a plain NumPy array: np.mean(DataFrame)[0]
# depends on how pandas reduces with axis=None and on positional indexing of a
# label-indexed Series, both of which differ between pandas versions.
test_errors = test_mae_loss["Error"].to_numpy()
threshold = test_errors.mean() + 2 * test_errors.std()

print(threshold)

0.04718840143485599


In [None]:
# NOTE(review): this repeats the assignment already made before the threshold cell;
# it recomputes the same flat per-window error array.
test_mae_loss_array = test_mae_loss.to_numpy().flatten()

### Dates where anomalies have been detected

In [None]:
# Boolean mask over the test windows: True where the error reaches the threshold.
y_pred = np.greater_equal(test_mae_loss_array, threshold)
y_pred.shape

(4597,)

In [None]:
# Timestamps at which a full window can start (the last window_length - 1
# timestamps are dropped); the length should match y_pred.
dates_test[:-(window_length-1)].shape

(4597,)

In [None]:
# Every window is identified by its first timestamp, so the first len(y_pred)
# test timestamps line up one-to-one with the mask.
# Slicing by len(y_pred) gives the same result as [:-(window_length - 1)] but
# does not collapse to an empty index when window_length == 1 (where the
# original slice becomes [:-0]).
window_start_dates = dates_test[:len(y_pred)]
dates_with_hours_anomalies = window_start_dates[y_pred]

print('Dates where anomalies have been detected:')
for d in dates_with_hours_anomalies:
    print(d)

Dates where anomalies have been detected:
2014-11-01 01:30:00
2014-11-01 02:00:00
2014-11-01 02:30:00
2014-11-01 03:00:00
2014-11-01 03:30:00
2014-11-01 04:00:00
2014-11-01 04:30:00
2014-11-01 05:00:00
2014-11-01 05:30:00
2014-11-01 06:00:00
2014-11-01 06:30:00
2014-11-01 07:00:00
2014-11-01 07:30:00
2014-11-01 08:00:00
2014-11-01 08:30:00
2014-11-01 09:00:00
2014-11-01 09:30:00
2014-11-01 10:00:00
2014-11-01 10:30:00
2014-11-01 11:00:00
2014-11-01 11:30:00
2014-11-01 12:00:00
2014-11-01 12:30:00
2014-11-01 13:00:00
2014-11-01 13:30:00
2014-11-01 14:00:00
2014-11-01 14:30:00
2014-11-01 15:00:00
2014-11-01 15:30:00
2014-11-01 16:00:00
2014-11-01 16:30:00
2014-11-01 17:00:00
2014-11-01 17:30:00
2014-11-01 18:00:00
2014-11-01 18:30:00
2014-11-01 19:00:00
2014-11-01 19:30:00
2014-11-01 20:00:00
2014-11-01 20:30:00
2014-11-01 21:00:00
2014-11-01 21:30:00
2014-11-01 22:00:00
2014-11-01 22:30:00
2014-11-01 23:00:00
2014-11-01 23:30:00
2014-11-02 00:00:00
2014-11-02 00:30:00
2014-11-02 01:00:0

In [None]:
# Flagged timestamps as a DatetimeIndex; its `.date` values are used to mark whole days.
# No `format` is passed: the values are already datetimes, and the previous
# '%Y/%m/%d' pattern did not match the 'YYYY-MM-DD HH:MM:SS' layout of the data.
ref = pd.to_datetime(dates_with_hours_anomalies)

ref

DatetimeIndex(['2014-11-01 01:30:00', '2014-11-01 02:00:00',
               '2014-11-01 02:30:00', '2014-11-01 03:00:00',
               '2014-11-01 03:30:00', '2014-11-01 04:00:00',
               '2014-11-01 04:30:00', '2014-11-01 05:00:00',
               '2014-11-01 05:30:00', '2014-11-01 06:00:00',
               ...
               '2015-01-27 05:30:00', '2015-01-27 06:00:00',
               '2015-01-27 06:30:00', '2015-01-27 07:00:00',
               '2015-01-27 07:30:00', '2015-01-27 08:00:00',
               '2015-01-27 08:30:00', '2015-01-27 09:00:00',
               '2015-01-27 09:30:00', '2015-01-27 10:00:00'],
              dtype='datetime64[ns]', name='timestamp', length=171, freq=None)

In [None]:
# Work on a copy of the series with `timestamp` restored as a regular column.
lstm_data = data.reset_index().copy()

# Calendar day of every row, kept in a local Series instead of a temporary column.
row_days = pd.to_datetime(lstm_data['timestamp']).dt.date

# Default every row to 0, then set `pred` to 1 for all rows whose calendar day
# contains at least one flagged timestamp from `ref`.
lstm_data["pred"] = 0
lstm_data.loc[row_days.isin(ref.date), 'pred'] = 1

lstm_data.to_csv("../data/lstm_nyc_taxi.csv")

In [None]:
# Add the LSTM flags as the `lstm_pred` column of the combined results file
# (values are aligned on the row index) and write the file back in place.
total = pd.read_csv("../data/total_nyc_taxi.csv", index_col=0).assign(lstm_pred=lstm_data["pred"])

total.to_csv("../data/total_nyc_taxi.csv")