In [154]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense

In [155]:
url = '../data/default_nyc_taxi.csv'
data = pd.read_csv(url, parse_dates=['timestamp'], index_col='timestamp')
data = data[["value"]]

data.head()

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2014-07-01 00:00:00,10844
2014-07-01 00:30:00,8127
2014-07-01 01:00:00,6210
2014-07-01 01:30:00,4656
2014-07-01 02:00:00,3820


In [156]:
period = 24 * 2
data.index

DatetimeIndex(['2014-07-01 00:00:00', '2014-07-01 00:30:00',
               '2014-07-01 01:00:00', '2014-07-01 01:30:00',
               '2014-07-01 02:00:00', '2014-07-01 02:30:00',
               '2014-07-01 03:00:00', '2014-07-01 03:30:00',
               '2014-07-01 04:00:00', '2014-07-01 04:30:00',
               ...
               '2015-01-31 19:00:00', '2015-01-31 19:30:00',
               '2015-01-31 20:00:00', '2015-01-31 20:30:00',
               '2015-01-31 21:00:00', '2015-01-31 21:30:00',
               '2015-01-31 22:00:00', '2015-01-31 22:30:00',
               '2015-01-31 23:00:00', '2015-01-31 23:30:00'],
              dtype='datetime64[ns]', name='timestamp', length=10320, freq=None)

In [157]:
print('From  ' + str(np.min(data.index)) + '  to  ' +str(np.max(data.index)))

From  2014-07-01 00:00:00  to  2015-01-31 23:30:00


In [158]:
print('Data size: %d \nNumber of data per day: %d \nNumber of days: %d' %(data.shape[0], period, data.shape[0] / period))

Data size: 10320 
Number of data per day: 48 
Number of days: 215


In [159]:
print('Missing value: ', data.isnull().to_numpy().sum())

Missing value:  0


In [160]:
# splitting data into days - function
def create_dataset(X, dates, period=1):
    Xs = []
    indexes = []
    for i in range(int(len(X) / period)):
        v = X.iloc[i*period: (i + 1)*period].values
        indexes.append(dates[period*i])
        Xs.append(v)        
    return np.array(Xs), np.array(indexes)

In [161]:
# new data frame
df, dates = create_dataset(data.value, data.index, period)
df = pd.DataFrame(df, dates)

print('df.shape: ', df.shape)
df.head()

df.shape:  (215, 48)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47
2014-07-01,10844,8127,6210,4656,3820,2873,2369,2064,2221,2158,2515,4364,6526,11039,13857,15865,17920,20346,19539,20107,18984,17720,17249,18463,18908,18886,18178,19459,19546,20591,19380,18544,16228,15013,17203,19525,22966,27598,26827,24904,22875,20394,23401,24439,23318,21733,20104,16111
2014-07-02,13370,9945,7571,5917,4820,3634,2993,2535,2570,2485,2868,4482,6788,11078,13729,16700,19156,19953,19502,18994,17311,17904,17133,18589,19134,19259,18667,19078,18546,18593,17967,16624,14634,13888,17430,21919,23633,24512,24887,26872,22009,18259,20844,22576,22401,19056,17518,15307
2014-07-03,12646,10562,8416,7098,5826,4383,3270,2948,3146,3077,3000,4592,6486,10113,12240,14574,16778,18910,18350,17218,16097,16409,15893,16778,17604,18665,19045,19261,19363,19078,18193,16635,14615,13759,17008,19595,21328,22661,29985,21501,22684,22188,22663,19573,17136,16606,16166,16020
2014-07-04,15591,14395,12535,11341,9980,8404,7200,6578,5657,4474,3459,3276,3595,4240,4828,4926,5165,5776,7338,7839,8623,9731,11024,13231,13613,13737,15574,14226,18480,18265,16575,16417,14703,13469,12105,11676,15487,15077,14999,14487,14415,13796,14036,14021,15593,16589,17984,18035
2014-07-05,17576,16189,14441,12535,11006,9151,8010,7096,6407,4421,3126,2514,2550,3148,3658,4345,4682,6248,7454,9010,10280,11488,11595,13098,12623,13031,13263,13349,13822,13716,13919,14203,13179,13708,13897,14740,14575,16085,18182,16861,14140,14477,15293,15457,16048,17477,16391,17006


In [162]:
# splitting
ratio = 0.55
train_size = int(df.shape[0] * ratio)
X_train = df[:train_size]
X_test = df[train_size:]

dates_train = np.array(df.index[:train_size], dtype='datetime64[D]')
dates_test = np.array(df.index[train_size:], dtype='datetime64[D]')


# info
print('Train size: ', ratio)
print('\n\nTRAIN SET:  from  ' + str(np.min(dates_train)) + '  to  ' +str(np.max(dates_train)))
print('Data size: ', X_train.shape[0])
print('Number of days: ', int(X_train.shape[0] / period))
print('\n\nTEST SET:  from  ' + str(np.min(dates_test)) + '  to  ' +str(np.max(dates_test)))
print('Data size: ', X_test.shape[0])
print('Number of days: ', int(X_test.shape[0] / period))

Train size:  0.55


TRAIN SET:  from  2014-07-01  to  2014-10-26
Data size:  118
Number of days:  2


TEST SET:  from  2014-10-27  to  2015-01-31
Data size:  97
Number of days:  2


In [163]:
scaler = StandardScaler()
scaler = scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [164]:
# params
dim_hidden1 = 32
dim_hidden2 = 16
dim_hidden3 = 8


# model
class Autoencoder(Model):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = Sequential([
          Dense(dim_hidden1, activation="relu"),
          Dense(dim_hidden2, activation="relu"),
          Dense(dim_hidden3, activation="relu")])

        self.decoder = Sequential([
          Dense(dim_hidden2, activation="relu"),
          Dense(dim_hidden1, activation="relu"),
          Dense(period, activation="sigmoid")])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [165]:
# params
epochs = 100
batch_size = 20
validation_split = 0.1
shuffle = False


# fitting model
autoencoder = Autoencoder()
autoencoder.compile(optimizer='adam', loss='mse')

In [166]:
history = autoencoder.fit(X_train, X_train, 
                          epochs = epochs, 
                          batch_size = batch_size, 
                          validation_split = validation_split, 
                          shuffle = shuffle)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [167]:
encoded_data = autoencoder.encoder(X_train).numpy()
decoded_data = autoencoder.decoder(encoded_data).numpy()

In [168]:
encoded_data = autoencoder.encoder(X_test).numpy()
decoded_data = autoencoder.decoder(encoded_data).numpy()

In [169]:
reconstruction = autoencoder.predict(X_train)
loss_train = tf.keras.losses.mae(reconstruction, X_train)



In [170]:
reconstruction_test = autoencoder.predict(X_test)
loss_test = tf.keras.losses.mae(reconstruction_test, X_test)



In [171]:
threshold = np.mean(loss_train) + 1.75*np.std(loss_train)
print(threshold)

1.1298712800324364


In [172]:
results = pd.DataFrame({'date': np.array(dates_test, dtype='datetime64[D]'), 
                        'loss': loss_test})

results = results.set_index('date')

In [173]:
scaled_loss = (loss_test - np.min(loss_test)) / (np.max(loss_test) - np.min(loss_test)) * 0.4

In [174]:
y_pred = loss_test.numpy() >= threshold

In [175]:
ref = np.array(dates_test[y_pred])

ref = pd.to_datetime(ref, format='%Y/%m/%d')

print(np.array(dates_test[y_pred]))

['2014-11-01' '2014-11-27' '2014-12-07' '2014-12-14' '2014-12-24'
 '2014-12-25' '2014-12-26' '2014-12-27' '2014-12-28' '2015-01-01'
 '2015-01-04' '2015-01-11' '2015-01-18' '2015-01-25' '2015-01-26'
 '2015-01-27']


In [176]:
autoencoder_data = data.reset_index().copy()

autoencoder_data['date'] = pd.to_datetime(autoencoder_data['timestamp']).dt.date

# Modifier les valeurs dans 'pred' où la date est dans 'ref'
autoencoder_data["pred"] = 0 
autoencoder_data.loc[autoencoder_data['date'].isin(ref.date), 'pred'] = 1

# Supprimer la colonne 'date' si elle n'est plus nécessaire
autoencoder_data = autoencoder_data.drop('date', axis=1)

autoencoder_data.to_csv("../data/autoencoder_nyc_taxi.csv")

In [177]:
total = pd.read_csv("../data/total_nyc_taxi.csv",index_col=0)
total["autoencoder_pred"] = autoencoder_data["pred"]

total.to_csv("../data/total_nyc_taxi.csv")