# Transform demand data so it can be used by a neural network

In [1]:
import datetime
import pandas as pd
import numpy as np

from scipy import spatial
from typing import Tuple

## Generate Demand Data and Passenger objects

In [2]:
from src.robin.demand.entities import Demand, Passenger

path_config_demand = '../configs/test_case/demand_data.yml'

demand = Demand.from_yaml(path_config_demand)

passengers = demand.generate_passengers()

## Get passenger relevant information and save it in a dataframe

In [3]:
def get_passenger_info(passenger: Passenger) -> Tuple[str, str, str, datetime.datetime, float]:
    """
    Flatten the fields of a passenger that the model needs into a tuple.

    Args:
        passenger: Passenger object

    Returns:
        Tuple with, in this order: the lower-cased user pattern name, the departure
        station of the passenger's market, the arrival station of the passenger's market,
        the arrival day as a datetime.datetime at midnight, and the arrival time
        converted to a float32 NumPy value.
    """
    pattern_name = passenger.user_pattern.name
    origin = passenger.market.departure_station
    destination = passenger.market.arrival_station

    # Promote the arrival date to a full datetime by pinning it to 00:00:00
    midnight = datetime.datetime.min.time()
    arrival_day = datetime.datetime.combine(passenger.arrival_day.date, midnight)

    arrival_time = np.asarray(passenger.arrival_time).astype(np.float32)

    return (
        pattern_name.lower(),
        origin,
        destination,
        arrival_day,
        arrival_time,
    )

# Build one information tuple per Passenger object
passengers_info = [get_passenger_info(passenger) for passenger in passengers]

# Collect the tuples in a dataframe, one row per passenger
df = pd.DataFrame(
    passengers_info,
    columns=['user_pattern', 'origin', 'destination', 'arrival_day', 'arrival_time'],
)

def elapsed_days(date: datetime.datetime) -> np.ndarray:
    """
    Returns the number of whole days between 1970-01-01 and the given date.

    Args:
        date (datetime.datetime): Datetime object.

    Returns:
        np.ndarray: Zero-dimensional float32 array with the day count
            (negative for dates before 1970-01-01).
    """
    epoch = datetime.datetime(1970, 1, 1)
    whole_days = (date - epoch).days
    return np.asarray(float(whole_days)).astype(np.float32)


# Replace every arrival_day entry with the value elapsed_days returns for it.
# NOTE(review): the column is overwritten in place, so re-running this cell would pass the
# already-converted values back into elapsed_days — presumably that fails; consider
# writing to a new column instead.
df['arrival_day'] = df['arrival_day'].apply(elapsed_days)
# Show the first rows as a quick sanity check of the conversion
print(df.head())

  user_pattern origin destination  arrival_day arrival_time
0     business  60000       71801      19509.0     7.175927
1     business  60000       71801      19509.0     8.415422
2     business  04040       71801      19509.0     7.544066
3      student  04040       71801      19509.0    20.271843
4     business  60000       71801      19509.0     8.302775


## Map station IDs to station names

In [4]:
# Path of the stations CSV file (plain literal: there is nothing to interpolate)
stations_csv_path = '../data/renfe/renfe_stations.csv'

def get_renfe_station_id(adif_id: str, stations_df: pd.DataFrame) -> str:
    """
    Returns a short, lower-cased station label for the given Adif station id.

    The label is the first word of the matching ``stop_name`` after hyphens have been
    replaced by spaces, e.g. a stop name "Madrid-Puerta de Atocha" becomes "madrid".
    When several rows share the id, the first one is used.

    Args:
        adif_id (str): Adif station id, compared against the 'stop_id' column.
        stations_df (pd.DataFrame): Dataframe with the stations' information. Must have
            the columns 'stop_id' and 'stop_name'.

    Returns:
        str: First word of the station name, in lower case.

    Raises:
        IndexError: If no row of ``stations_df`` has a 'stop_id' equal to ``adif_id``.
    """
    matching_names = stations_df.loc[stations_df['stop_id'] == adif_id, 'stop_name']
    if matching_names.empty:
        # Same exception type as the bare positional lookup would raise, but with a
        # message that tells the reader which id is missing.
        raise IndexError(f"Adif station id {adif_id!r} not found in column 'stop_id'")

    station_name = matching_names.iloc[0]
    station_name = station_name.replace("-", " ").split(" ")[0].lower()
    return station_name

# Read the ids as strings so that leading zeros are preserved
stations_df = pd.read_csv(
    stations_csv_path,
    dtype={'stop_id': str, 'renfe_id': str},
)

# Translate both station columns from ids to short station names
for station_column in ('origin', 'destination'):
    df[station_column] = df[station_column].apply(get_renfe_station_id, args=(stations_df,))

print(df.head())

  user_pattern    origin destination  arrival_day arrival_time
0     business    madrid   barcelona      19509.0     7.175927
1     business    madrid   barcelona      19509.0     8.415422
2     business  zaragoza   barcelona      19509.0     7.544066
3      student  zaragoza   barcelona      19509.0    20.271843
4     business    madrid   barcelona      19509.0     8.302775


## Import pre-trained GloVe embeddings

In [5]:
# Vocabulary: every distinct value found in the three categorical columns
categorical_values = df[['user_pattern', 'origin', 'destination']].values
words_set = set(categorical_values.flatten())

print(words_set)

{'madrid', 'zaragoza', 'business', 'student', 'barcelona'}


In [6]:
# Import GloVe embeddings 50D
import os
import numpy as np

# Word -> row index of the embedding matrix. The set is sorted first because the iteration
# order of a set of strings can change between interpreter runs (hash randomisation),
# which would make the row layout of the matrix irreproducible.
word_index = {word: index for index, word in enumerate(sorted(words_set))}

# Parse the GloVe text file: every line holds a word followed by its coefficients.
# The context manager closes the file even if a line fails to parse, and the explicit
# encoding avoids depending on the platform default.
embeddings_index = {}
with open('../data/pretrained/glove6B/glove.6B.50d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype=np.float32)
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

EMBEDDING_DIM = 50

# NOTE(review): the matrix has one row more than there are words; with the 0-based indices
# above the last row is never written and stays all-zero — confirm whether index 0 was
# meant to be reserved (e.g. for padding).
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
embedding_dict = {}
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    print(word, embedding_vector)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        embedding_dict[word] = embedding_vector

Found 400000 word vectors.
madrid [ 1.3315     0.72181   -0.060088   0.43948    0.18419   -1.5083
 -0.48125    0.46037   -1.4088     1.2701     0.68031   -0.59232
 -1.6325    -0.30376    0.87685   -0.75531   -0.37583   -0.5363
 -1.0669     0.45537   -0.66694    0.43001   -0.69525    0.67518
 -0.93783   -0.67933    1.1104     0.37576   -0.36894   -0.083185
  2.0346     0.96286   -0.56629   -0.7787    -0.10705   -0.14102
  0.07384    0.62338    0.20366    0.0076751  0.71088    0.01501
  0.53186   -0.82256   -0.35087    0.30876   -0.065328   0.23722
  1.4692    -0.93469  ]
zaragoza [ 1.0642    0.089939 -0.28715   0.82471   0.31063  -1.4798    0.12028
  0.7449   -1.2919    0.39737   0.4715   -0.53483  -0.48049  -1.2998
  0.32826  -1.3085   -0.67916  -0.20625  -0.7232    0.41638  -1.2895
 -0.69963  -0.23631   0.70175  -0.61498   0.20193   1.329     0.25294
 -0.093715 -0.16535   0.99605   1.2007   -0.26729  -0.42035  -0.15881
  0.63906  -0.73977   1.3119    0.61136  -0.59197   1.3032    0.16

In [7]:
# One [user_pattern, origin, destination] list of words per passenger
input_data = df[['user_pattern', 'origin', 'destination']].values.tolist()

# Fail early and explicitly on out-of-vocabulary words: a silent None from dict.get would
# otherwise end up inside the array and only surface later as an obscure shape/dtype error.
missing_words = sorted(
    {word for row in input_data for word in row if word not in embeddings_index},
    key=str,
)
if missing_words:
    raise KeyError(f"No GloVe embedding found for: {missing_words}")

# Replace every word by its embedding vector: one stacked block of vectors per passenger
input_vectors = np.array([[embeddings_index[word] for word in row] for row in input_data])

print(input_vectors[0])
print(input_vectors.shape)

[[ 0.023693   0.13316    0.023131   0.49833    0.026874  -0.43252
  -1.1364    -0.82001    0.22388   -0.032119  -0.069651   0.39857
  -0.58275    0.095008  -0.023643   0.23237   -0.42441    0.65709
   0.57802   -0.51602    1.8253     0.12951   -0.61773    0.39281
  -0.35754   -1.6778    -0.45201   -0.47075    0.19487    0.35828
   3.6034     0.32865    0.47288   -0.33787   -0.46234   -0.51628
  -1.3755     0.70789    0.4648    -0.16186   -0.0961    -0.28523
   0.30047    0.50902    0.081356  -0.015639  -0.51021    0.34585
   0.24201    0.82237  ]
 [ 1.3315     0.72181   -0.060088   0.43948    0.18419   -1.5083
  -0.48125    0.46037   -1.4088     1.2701     0.68031   -0.59232
  -1.6325    -0.30376    0.87685   -0.75531   -0.37583   -0.5363
  -1.0669     0.45537   -0.66694    0.43001   -0.69525    0.67518
  -0.93783   -0.67933    1.1104     0.37576   -0.36894   -0.083185
   2.0346     0.96286   -0.56629   -0.7787    -0.10705   -0.14102
   0.07384    0.62338    0.20366    0.0076751  0.710

## Embedding Model

In [8]:
from keras.models import Sequential
from keras.layers import Input, Embedding, Flatten

vocab_size = len(words_set)

# Embedding layer initialised with the pre-trained matrix and left trainable
words_embedding_layer = Embedding(
    input_dim=vocab_size + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    name='words_embedding',
    trainable=True,
)

# Each sample is a sequence of three word indices
embedding_model = Sequential()
embedding_model.add(Input(shape=(3,)))
embedding_model.add(words_embedding_layer)

embedding_model.compile(loss='mse', optimizer='adam')
embedding_model.summary()

2023-05-05 10:27:23.853575: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 words_embedding (Embedding)  (None, 3, 50)            300       
                                                                 
Total params: 300
Trainable params: 300
Non-trainable params: 0
_________________________________________________________________


2023-05-05 10:27:27.794209: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Autoencoder model

In [9]:
import tensorflow as tf

from keras.layers import Input, GRU, Dense, RepeatVector, TimeDistributed
from keras.models import Model

# Shape of one sample on the way in and on the way out of the autoencoder.
# NOTE(review): presumably 3 words per passenger x 50 embedding dimensions, matching the
# vectors built earlier in the notebook — confirm and consider reusing EMBEDDING_DIM.
input_shape = (3, 50)
output_shape = (3, 50)

# Size of the bottleneck representation
latent_dim = 4

inputs = Input(shape=input_shape)

# Encoder: two stacked GRUs; the second one returns only its final output
encoder = GRU(64, return_sequences=True)(inputs)
encoder = GRU(32)(encoder)

# Dense projection down to the latent vector
latent = Dense(latent_dim)(encoder)

# Decoder: has its own Input so it can also be used as a stand-alone model.
# The latent vector is repeated once per sequence position before the GRUs.
decoder_inputs = Input(shape=(latent_dim,))
decoder = RepeatVector(input_shape[0])(decoder_inputs)
decoder = GRU(32, return_sequences=True)(decoder)
decoder = GRU(64, return_sequences=True)(decoder)
# The same Dense layer is applied at every sequence position
decoder_outputs = TimeDistributed(Dense(output_shape[1]))(decoder)

# Stand-alone encoder and decoder models
encoder_model = Model(inputs, latent)
decoder_model = Model(decoder_inputs, decoder_outputs)

# Full autoencoder: the decoder model applied to the encoder's latent output
model = Model(inputs, decoder_model(latent))

# The metric reports the same quantity as the 'mse' loss
model.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.MeanSquaredError()])
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 3, 50)]           0         
                                                                 
 gru (GRU)                   (None, 3, 64)             22272     
                                                                 
 gru_1 (GRU)                 (None, 32)                9408      
                                                                 
 dense (Dense)               (None, 4)                 132       
                                                                 
 model_1 (Functional)        (None, 3, 50)             25714     
                                                                 
Total params: 57,526
Trainable params: 57,526
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train the autoencoder to reproduce its input: the targets are the inputs themselves
history = model.fit(
    x=input_vectors,
    y=input_vectors,
    batch_size=32,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Make predictions

In [11]:
def find_closest_embeddings(embedding: np.ndarray) -> str:
    """
    Returns the word of ``embedding_dict`` whose vector is closest to the given one.

    Closeness is the Euclidean distance. On a tie the word that comes first in
    ``embedding_dict`` wins.

    Args:
        embedding (np.ndarray): Vector to look up; must have the same length as the
            vectors stored in ``embedding_dict``.

    Returns:
        str: The closest word.

    Raises:
        IndexError: If ``embedding_dict`` is empty.
    """
    if not embedding_dict:
        raise IndexError("embedding_dict is empty: there is no word to match against")

    # A single pass with min() is enough; sorting the whole vocabulary is not needed
    return min(
        embedding_dict,
        key=lambda word: spatial.distance.euclidean(embedding_dict[word], embedding),
    )

In [12]:
# Draw one passenger at random
random_index = np.random.randint(0, len(input_vectors))
random_vector = input_vectors[random_index]

print("Input - Random passenger data: ")
print(input_data[random_index])

# The model expects a batch, so add a leading axis of size one and take the single result
single_sample_batch = random_vector[np.newaxis, ...]
prediction = model.predict(single_sample_batch)[0]

print("Output - Passenger reconstruction: ")
# Translate every reconstructed vector back to the nearest known word
decoded_prediction = list(map(find_closest_embeddings, prediction))
print(decoded_prediction)

Input - Random passenger data: 
['business', 'madrid', 'barcelona']
Output - Passenger reconstruction: 
['business', 'madrid', 'barcelona']
