# Transform demand data so it can be used by a neural network

In [1]:
import datetime
import pandas as pd
import numpy as np

from scipy import spatial
from typing import Tuple

## Generate Demand Data and Passenger objects

In [2]:
from src.robin.demand.entities import Demand, Passenger

# Path (relative to this notebook) of the YAML file describing the demand.
path_config_demand = '../configs/test_case/demand_data.yml'

# Build the Demand object from the YAML configuration.
demand = Demand.from_yaml(path_config_demand)

# Generate the Passenger objects used by every cell below.
# NOTE(review): presumably a stochastic generation and no seed is set here - confirm if reproducibility matters.
passengers = demand.generate_passengers()

## Get passenger relevant information and save it in a dataframe

In [67]:
from sklearn.preprocessing import MinMaxScaler

def get_passenger_info(passenger: Passenger) -> Tuple[str, str, str, datetime.datetime]:
    """
    Get the information of a passenger and return it as a tuple

    Args:
        passenger (Passenger): Passenger object. Its ``arrival_time`` is read as a
            fractional number of hours counted from midnight of ``arrival_day.date``.

    Returns:
        Tuple[str, str, str, datetime.datetime]: Lower-cased user pattern name, departure
            station, arrival station and arrival datetime truncated to the minute.
    """
    user_pattern = passenger.user_pattern.name
    origin, destination = passenger.market.departure_station, passenger.market.arrival_station
    arrival_day = datetime.datetime.combine(passenger.arrival_day.date, datetime.datetime.min.time())
    # Let timedelta convert the fractional hours: it rounds to the nearest microsecond,
    # so binary float noise (e.g. 7.3 h -> 17.999... minutes) no longer loses a minute
    # the way int((arrival_time - hour) * 60) did. float() also accepts numpy scalars.
    arrival_offset = datetime.timedelta(hours=float(passenger.arrival_time))
    # Keep minute resolution, as before: seconds and microseconds are dropped.
    arrival_datetime = (arrival_day + arrival_offset).replace(second=0, microsecond=0)
    return user_pattern.lower(), origin, destination, arrival_datetime

# One (user_pattern, origin, destination, arrival_datetime) tuple per generated passenger
passengers_info = [get_passenger_info(passenger) for passenger in passengers]

def get_datetime_vector(datetime: datetime.datetime) -> np.array:
    """
    Encode a datetime as a numeric vector

    Args:
        datetime (datetime.datetime): Datetime object (note: the parameter name shadows
            the ``datetime`` module inside this function)

    Returns:
        np.array: Vector [year, month, day, hour, minute]
    """
    components = ('year', 'month', 'day', 'hour', 'minute')
    return np.array([getattr(datetime, component) for component in components])

def _pad_scaled_vector(values: list, width: int = 50) -> np.ndarray:
    """Right-pad a list of floats with 0.0 up to ``width`` entries and return it as an array."""
    return np.array(values + [0.0] * (width - len(values)))


# Dataframe with the passenger information
passenger_columns = ['user_pattern', 'origin', 'destination', 'arrival_datetime']
df = pd.DataFrame(passengers_info, columns=passenger_columns)

# Replace each arrival datetime by its [year, month, day, hour, minute] vector
df['arrival_datetime'] = df['arrival_datetime'].apply(get_datetime_vector)
arrival_dates = np.array(df['arrival_datetime'].tolist())
print(arrival_dates.shape)

# Scale every datetime component to [0, 1] column-wise
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(arrival_dates)
df['scaled_data'] = scaled_data.tolist()

# Pad the scaled data column to 50 values per row
df['scaled_data'] = df['scaled_data'].apply(_pad_scaled_vector)

print(df.head())

(5956, 5)
  user_pattern origin destination     arrival_datetime  \
0      student  60000       04040  [2023, 6, 1, 14, 4]   
1     business  60000       71801  [2023, 6, 1, 7, 48]   
2     business  60000       71801  [2023, 6, 1, 8, 48]   
3     business  60000       71801  [2023, 6, 1, 8, 11]   
4     business  60000       04040  [2023, 6, 1, 8, 37]   

                                         scaled_data  
0  [0.0, 0.0, 0.0, 0.6086956521739131, 0.06779661...  
1  [0.0, 0.0, 0.0, 0.30434782608695654, 0.8135593...  
2  [0.0, 0.0, 0.0, 0.34782608695652173, 0.8135593...  
3  [0.0, 0.0, 0.0, 0.34782608695652173, 0.1864406...  
4  [0.0, 0.0, 0.0, 0.34782608695652173, 0.6271186...  


## Map station IDs to station names

In [68]:
# CSV with the Renfe stations (path relative to this notebook)
stations_csv_path = '../data/renfe/renfe_stations.csv'

def get_renfe_station_id(adif_id: str, stations_df: pd.DataFrame) -> str:
    """
    Returns the short station name given the Adif station id.

    Despite the function name, the value returned is a name, not an id: the first
    word of the matching ``stop_name`` (hyphens count as separators), lower-cased.

    Args:
        adif_id (str): Adif station id, compared against the 'stop_id' column.
        stations_df (pd.DataFrame): Dataframe with the stations' information. Must
            contain the columns 'stop_id' and 'stop_name'.

    Returns:
        str: Lower-cased first word of the station name.

    Raises:
        IndexError: If no row of stations_df has 'stop_id' equal to adif_id.
    """
    matches = stations_df.loc[stations_df['stop_id'] == adif_id, 'stop_name']
    if matches.empty:
        # Same exception type the bare `.values[0]` lookup raised, but naming the id
        raise IndexError(f"Station id {adif_id!r} not found in the 'stop_id' column of stations_df")
    # When several rows share the id, the first one wins (as before)
    station_name = matches.iloc[0]
    return station_name.replace("-", " ").split(" ")[0].lower()

# Read the id columns as strings so ids with leading zeros (e.g. '04040') are preserved
stations_df = pd.read_csv(stations_csv_path, dtype={'stop_id': str, 'renfe_id': str})

# Replace the station ids by short station names in both endpoint columns.
# NOTE(review): this overwrites the id columns in place, so re-running this cell
# without rebuilding `df` will fail to find the (already translated) values.
for station_column in ('origin', 'destination'):
    df[station_column] = df[station_column].apply(
        lambda adif_id: get_renfe_station_id(adif_id, stations_df)
    )

print(df.head())

  user_pattern  origin destination     arrival_datetime  \
0      student  madrid    zaragoza  [2023, 6, 1, 14, 4]   
1     business  madrid   barcelona  [2023, 6, 1, 7, 48]   
2     business  madrid   barcelona  [2023, 6, 1, 8, 48]   
3     business  madrid   barcelona  [2023, 6, 1, 8, 11]   
4     business  madrid    zaragoza  [2023, 6, 1, 8, 37]   

                                         scaled_data  
0  [0.0, 0.0, 0.0, 0.6086956521739131, 0.06779661...  
1  [0.0, 0.0, 0.0, 0.30434782608695654, 0.8135593...  
2  [0.0, 0.0, 0.0, 0.34782608695652173, 0.8135593...  
3  [0.0, 0.0, 0.0, 0.34782608695652173, 0.1864406...  
4  [0.0, 0.0, 0.0, 0.34782608695652173, 0.6271186...  


## Import pre-trained GloVe embeddings

In [69]:
# Vocabulary: every distinct word appearing in the categorical columns
categorical_columns = ['user_pattern', 'origin', 'destination']
words_set = set(df[categorical_columns].to_numpy().ravel())

print(words_set)

{'barcelona', 'business', 'madrid', 'zaragoza', 'student'}


In [70]:
# Import GloVe embeddings 50D
import os
import numpy as np

# Sort the vocabulary before numbering it: set iteration order for strings changes
# between kernel restarts (hash randomisation), which would otherwise shuffle the
# rows of `embedding_matrix` from one run to the next.
word_index = {word: index for index, word in enumerate(sorted(words_set, key=str))}

# Parse the GloVe text file: one "<word> <v1> ... <v50>" entry per line.
embeddings_index = {}
glove_path = '../data/pretrained/glove6B/glove.6B.50d.txt'
# Context manager closes the file even if parsing fails; the encoding is explicit
# so the read does not depend on the platform's default locale.
with open(glove_path, encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype=np.float32)
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

EMBEDDING_DIM = 50

# One row per vocabulary word plus one extra row; rows are addressed through word_index.
# NOTE(review): word_index starts at 0, so the extra (last) row is never filled -
# confirm whether row 0 was meant to be reserved for padding.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
# word -> GloVe vector, only for the vocabulary words that GloVe knows
embedding_dict = {}
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    print(word, embedding_vector)
    # Words not found in the embedding index keep an all-zeros row in the matrix
    # and get no entry in embedding_dict.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        embedding_dict[word] = embedding_vector

Found 400000 word vectors.
barcelona [ 0.68944   1.2217   -0.23655   0.36109  -0.62116  -1.0075   -0.52565
  0.65766  -1.2764    1.1286    1.1386   -0.36088  -1.3849   -0.58442
  0.9772   -0.35103   0.29237  -0.27426  -1.3109   -0.015967 -1.0695
  0.11901  -0.56335   0.49648  -0.44571  -0.47566   0.79045   0.42923
 -0.76743  -0.14029   1.7552    1.3342   -0.42864  -0.29125  -0.2056
  0.21055   0.099324  1.4187    0.34068  -0.4477    1.0795    0.10387
  0.15772  -0.51013  -0.50933   0.25395   0.050859 -0.084172  0.69738
 -1.192   ]
business [ 0.023693  0.13316   0.023131  0.49833   0.026874 -0.43252  -1.1364
 -0.82001   0.22388  -0.032119 -0.069651  0.39857  -0.58275   0.095008
 -0.023643  0.23237  -0.42441   0.65709   0.57802  -0.51602   1.8253
  0.12951  -0.61773   0.39281  -0.35754  -1.6778   -0.45201  -0.47075
  0.19487   0.35828   3.6034    0.32865   0.47288  -0.33787  -0.46234
 -0.51628  -1.3755    0.70789   0.4648   -0.16186  -0.0961   -0.28523
  0.30047   0.50902   0.081356 -0.0

In [71]:
from copy import deepcopy

def _embed_categorical_fields(row: list) -> list:
    """Return a copy of ``row`` whose first three entries (words) are replaced by their embeddings."""
    words, remainder = row[:3], deepcopy(row[3:])
    # NOTE(review): words without a GloVe vector map to None here
    return [embedding_dict.get(word) for word in words] + remainder


# Rows of [user_pattern, origin, destination, scaled_data]
input_data = df[['user_pattern', 'origin', 'destination', 'scaled_data']].values.tolist()

# Same rows with the three words swapped for their embedding vectors; input_data is left untouched
input_vectors = [_embed_categorical_fields(row) for row in input_data]

print(input_vectors[0])

[array([-1.0729  ,  0.94103 ,  0.084904, -1.0766  ,  0.42866 ,  0.099877,
       -0.51081 , -0.24961 , -0.30883 ,  0.19553 ,  0.1965  , -0.73152 ,
        0.096916, -0.062686,  0.12078 , -0.72384 , -0.382   ,  0.6934  ,
        0.32956 ,  0.40244 ,  0.53485 ,  0.91781 , -0.44553 ,  0.71804 ,
       -0.13635 , -1.6906  ,  0.15818 , -1.2367  , -1.2278  , -0.058566,
        2.7544  ,  0.18672 , -0.263   , -1.2792  ,  0.16992 ,  0.40748 ,
        0.12248 ,  0.11211 ,  0.78318 ,  0.036392, -0.40808 , -0.058474,
       -0.27932 ,  0.33035 ,  0.52384 , -1.0487  ,  0.27565 ,  0.0363  ,
        0.048604,  0.28239 ], dtype=float32), array([ 1.3315   ,  0.72181  , -0.060088 ,  0.43948  ,  0.18419  ,
       -1.5083   , -0.48125  ,  0.46037  , -1.4088   ,  1.2701   ,
        0.68031  , -0.59232  , -1.6325   , -0.30376  ,  0.87685  ,
       -0.75531  , -0.37583  , -0.5363   , -1.0669   ,  0.45537  ,
       -0.66694  ,  0.43001  , -0.69525  ,  0.67518  , -0.93783  ,
       -0.67933  ,  1.1104   ,  0.

## Embedding Model

In [43]:
import tensorflow as tf
from keras.layers import Layer


class Time2Vec(Layer):
    """Time2Vector encoding layer

    Maps every input feature to a vector made of one linear component
    (``w * t + b``) followed by ``kernel - 1`` periodic components
    (``activation(freq * t + phase)``). All four weight tensors are trainable.
    """

    def __init__(self, kernel: int = 64, activation: str = "sin", **kwargs) -> None:
        """
        Args:
            kernel (int, optional): length of time vector representation. Defaults to 64
            activation (str, optional): periodic activation for time encoding. Defaults to "sin".
            **kwargs: standard Layer keyword arguments (e.g. ``name``, ``dtype``). Accepted so
                the layer can be rebuilt from ``get_config()``; ``trainable`` defaults to True
                and ``name`` to "Time2VecLayer_<activation>" when not given.
        Raises:
            NotImplementedError: Non-supported activations
        """

        # periodic components
        if activation not in ("sin", "cos"):
            raise NotImplementedError(
                f"'{activation}' is an unsupported periodic activation."
            )
        activation_fn = {"sin": tf.math.sin, "cos": tf.math.cos}[activation]

        kwargs.setdefault("trainable", True)
        kwargs.setdefault("name", "Time2VecLayer_" + activation_fn.__name__)
        super().__init__(**kwargs)

        # Constructor arguments kept verbatim for get_config()
        self._kernel_size = kernel
        self._activation_name = activation

        # Number of periodic components; the remaining slot is the linear one
        self.k = kernel - 1
        self.p_activation = activation_fn

    def build(self, input_shape: tuple) -> None:
        """method for building and initializing the weights for the tensor operations
        Args:
            input_shape (tuple): shape of the incoming tensor; ``input_shape[1]`` is used
                as the number of features
        """
        # Linear component: one weight and one bias per feature
        self.w_b = self.add_weight(
            shape=(1, input_shape[1], 1), initializer="uniform", trainable=True
        )

        self.b_b = self.add_weight(
            shape=(1, input_shape[1], 1), initializer="uniform", trainable=True
        )

        # Periodic components: k frequencies and k phases per feature
        self.freq = self.add_weight(
            shape=(1, input_shape[1], self.k), initializer="uniform", trainable=True
        )

        self.phase = self.add_weight(
            shape=(1, input_shape[1], self.k), initializer="uniform", trainable=True
        )

        super().build(input_shape)

    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:

        """method to perform the layer operation
        Args:
            inputs (tf.Tensor): shape = (batch_size, feature_size)
        Returns:
            tf.Tensor: shape = (batch_size, feature_size, length of time vector representation)
        """

        # (batch, features) -> (batch, features, 1) so it broadcasts against the weights
        inputs = tf.expand_dims(inputs, axis=-1)

        # Linear components
        lin = (
            # Multiply each time dimension with the corresponding linear time component
            tf.multiply(inputs, self.w_b)
            # Bias component for each time dimension
            + self.b_b
        )

        # Periodic components
        # Multiply each time dimension with the corresponding frequency vector.
        # Broadcasting expands the trailing axis from 1 to k, so no explicit tf.tile is needed.
        per = tf.multiply(inputs, self.freq)
        # Phase vector for each time dimension
        per = self.p_activation(per + self.phase)
        return tf.concat([lin, per], -1)

    def compute_output_shape(self, input_shape: tuple) -> tuple:
        """computes the shape of output tensor
        Args:
            input_shape (tuple): shape of incoming tensor
        Returns:
            tuple: shape of outgoing tensor
        """
        return input_shape[0], input_shape[1], self.k + 1

    def get_config(self) -> dict:
        """returns the constructor arguments so the layer can be saved and reloaded
        Returns:
            dict: base Layer config extended with ``kernel`` and ``activation``
        """
        config = super().get_config()
        config.update(
            {"kernel": self._kernel_size, "activation": self._activation_name}
        )
        return config


if __name__ == "__main__":
    # Random batch: 32 samples with 4 time features each
    n_samples, n_time_features = 32, 4
    test_vector = tf.random.uniform(shape=(n_samples, n_time_features), dtype=tf.float32)
    print(test_vector)
    # Disabled example: a 16-dimensional representation for each time feature,
    # i.e. an output of shape (32, 4, 16)
    #xti = Time2Vec(16, "sin")(test_vector)
    #print(xti.shape)

tf.Tensor(
[[0.65050876 0.4465016  0.42221534 0.64011216]
 [0.9662026  0.7904266  0.0613178  0.22272074]
 [0.4362595  0.36590302 0.57302403 0.15569937]
 [0.6882864  0.7655716  0.90022624 0.9065237 ]
 [0.04508305 0.31924665 0.7474377  0.0195899 ]
 [0.20869064 0.4683851  0.63936234 0.99558187]
 [0.37097585 0.8990139  0.08087122 0.85237277]
 [0.79036653 0.8027264  0.2237761  0.6445254 ]
 [0.7936405  0.37244594 0.10876894 0.64354753]
 [0.78340375 0.5202061  0.36396265 0.04890037]
 [0.9221221  0.07489288 0.24913776 0.2682953 ]
 [0.41051102 0.54679406 0.5661305  0.60156333]
 [0.24146652 0.7910892  0.14174306 0.29268622]
 [0.5695696  0.41060877 0.17198431 0.47137773]
 [0.77031446 0.6005255  0.6517365  0.59011984]
 [0.75613904 0.25892174 0.98167837 0.24066949]
 [0.37523973 0.8791685  0.01619399 0.49088526]
 [0.26744783 0.6887306  0.67361367 0.55209637]
 [0.5478947  0.33871293 0.06698322 0.38533616]
 [0.9255959  0.9520111  0.12218833 0.6654377 ]
 [0.83597565 0.9185085  0.5610728  0.7742789 ]
 [

In [28]:
from keras.models import Sequential
from keras.layers import Input, Embedding, Flatten

vocab_size = len(words_set)

# Three word indices in -> three EMBEDDING_DIM-dimensional vectors out,
# with the embedding table initialised from the GloVe-based matrix
embedding_model = Sequential([
    Input(shape=(3,)),
    Embedding(
        input_dim=vocab_size + 1,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        name='words_embedding',
        trainable=True,
    ),
])

embedding_model.compile(optimizer='adam', loss='mse')
embedding_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 words_embedding (Embedding)  (None, 3, 50)            300       
                                                                 
Total params: 300
Trainable params: 300
Non-trainable params: 0
_________________________________________________________________


## Autoencoder model

In [72]:
import tensorflow as tf

from keras.layers import Input, GRU, Dense, RepeatVector, TimeDistributed
from keras.models import Model

# Each sample is a sequence of 4 vectors of 50 values; the autoencoder reconstructs the same shape.
# NOTE(review): presumably the 4 steps are the three word embeddings plus the padded
# scaled-datetime vector built earlier in the notebook - confirm.
input_shape = (4, 50)
output_shape = (4, 50)

# Size of the bottleneck representation
latent_dim = 4

inputs = Input(shape=input_shape)

# Encoder: two stacked GRUs; the second returns only its final state (32 units)
encoder = GRU(64, return_sequences=True)(inputs)
encoder = GRU(32)(encoder)

# Linear projection of the encoder state down to the latent vector
latent = Dense(latent_dim)(encoder)

# Decoder: has its own Input so it can also be used as a stand-alone model.
# The latent vector is repeated once per sequence step, passed through two GRUs,
# and projected to output_shape[1] values at every step.
decoder_inputs = Input(shape=(latent_dim,))
decoder = RepeatVector(input_shape[0])(decoder_inputs)
decoder = GRU(32, return_sequences=True)(decoder)
decoder = GRU(64, return_sequences=True)(decoder)
decoder_outputs = TimeDistributed(Dense(output_shape[1]))(decoder)

# Stand-alone halves of the autoencoder
encoder_model = Model(inputs, latent)
decoder_model = Model(decoder_inputs, decoder_outputs)

# Full autoencoder: encoder graph followed by the decoder model applied to the latent vector
model = Model(inputs, decoder_model(latent))

# The MeanSquaredError metric reports the same quantity as the 'mse' loss
model.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.MeanSquaredError()])
model.summary()

Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 4, 50)]           0         
                                                                 
 gru_12 (GRU)                (None, 4, 64)             22272     
                                                                 
 gru_13 (GRU)                (None, 32)                9408      
                                                                 
 dense_6 (Dense)             (None, 4)                 132       
                                                                 
 model_10 (Functional)       (None, 4, 50)             25714     
                                                                 
Total params: 57,526
Trainable params: 57,526
Non-trainable params: 0
_________________________________________________________________


In [73]:
# Keras interprets a Python list passed as `x` as "one array per model input", so the
# nested list of per-passenger rows must first be stacked into a single float32 array
# of shape (n_samples, 4, 50). This also fails fast if any word had no embedding (None).
training_data = np.asarray(input_vectors, dtype=np.float32)

# Autoencoder: the input is its own reconstruction target
history = model.fit(training_data, training_data, epochs=10, batch_size=32, verbose=1)

KeyboardInterrupt: 

## Make predictions

In [31]:
def find_closest_embeddings(embedding):
    """Return the word in ``embedding_dict`` whose vector has the smallest Euclidean distance to ``embedding``."""

    def distance_to(word):
        return spatial.distance.euclidean(embedding_dict[word], embedding)

    ranked_words = sorted(embedding_dict, key=distance_to)
    return ranked_words[0]

In [34]:
# Pick one passenger at random
random_index = np.random.randint(0, len(input_vectors))
random_vector = input_vectors[random_index]

print("Input - Random passenger data: ")
print(input_data[random_index])

# The model expects a batch, so wrap the single sample and unwrap the single prediction
single_sample_batch = np.array([random_vector])
prediction = model.predict(single_sample_batch)[0]

print("Output - Passenger reconstruction: ")
# First three reconstructed vectors -> nearest known word; last vector -> leading values truncated to int
decoded_prediction = list(map(find_closest_embeddings, prediction[:3]))
leading_time_values = [int(value) for value in prediction[-1][:6]]
print(decoded_prediction, leading_time_values)

Input - Random passenger data: 
['business', 'madrid', 'zaragoza', array([2023,    6,    1,    7,   18,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])]
Output - Passenger reconstruction: 
['business', 'madrid', 'barcelona'] [129, 6, 0, 10, 29, 0]
