# Transform Demand data, so it can be used by a Neural Network

In [8]:
import datetime
import pandas as pd

from typing import Tuple

## Generate Demand Data and Passenger objects

In [9]:
from src.robin.demand.entities import Demand, Passenger

path_config_demand = '../configs/test_case/demand_data.yml'

demand = Demand.from_yaml(path_config_demand)

passengers = demand.generate_passengers()

## Get passenger relevant information and save it in a dataframe

In [11]:
def get_passenger_info(passenger: Passenger) -> Tuple[str, str, str, float]:
    """
    Get the information of a passenger and return it as a tuple

    Args:
        passenger: Passenger object

    Returns:
        Tuple[str, str, str, datetime.date, float]: Tuple with the information of the passenger
    """
    user_pattern = passenger.user_pattern.name
    origin, destination = passenger.market.departure_station, passenger.market.arrival_station
    arrival_day = datetime.datetime.combine(passenger.arrival_day.date, datetime.datetime.min.time())
    #arrival_float = passenger.arrival_time
    #arrival_hour = int(arrival_float)
    #arrival_minute = int((arrival_float - arrival_hour) * 60)
    #arrival_time = arrival_day + datetime.timedelta(hours=arrival_hour, minutes=arrival_minute)
    arrival_time = passenger.arrival_time
    return user_pattern.lower(), origin, destination, arrival_day, arrival_time

# Map a list of Passenger objects to a list of tuples with the passenger information
passengers_info = list(map(get_passenger_info, passengers))

# Dataframe with the passenger information
df = pd.DataFrame(passengers_info, columns=['user_pattern', 'origin', 'destination', 'arrival_day', 'arrival_time'])

print(df.head())

  user_pattern origin destination arrival_day  arrival_time
0     business  60000       04040  2023-06-01      7.481474
1     business  60000       04040  2023-06-01      7.454439
2     business  60000       04040  2023-06-01      8.142726
3     business  60000       04040  2023-06-01      7.103851
4     business  60000       04040  2023-06-01      7.723837


## Map stations IDs to Stations names

In [4]:
print(passengers_info[0])

('business', '60000', '04040', datetime.datetime(2023, 6, 1, 8, 39))


In [5]:
stations_csv_path = f'../data/renfe/renfe_stations.csv'

def get_renfe_station_id(adif_id: str, stations_df: pd.DataFrame) -> str:
    """
    Returns the Station name given the Adif station id.

    Args:
        adif_id (str): Adif station id.
        stations_df (pd.DataFrame): Dataframe with the stations' information.

    Returns:
        str: Station name.
    """
    station_name = stations_df[stations_df['stop_id'] == adif_id]['stop_name'].values[0]
    station_name = station_name.replace("-", " ").split(" ")[0].lower()
    return station_name

def datetime_to_seconds(date: datetime.datetime) -> float:
    """
    Returns the time in seconds of a datetime object.

    Args:
        date (datetime.datetime): Datetime object.

    Returns:
        float: Time in seconds.
    """
    return date.hour * 3600 + date.minute * 60 + date.second

stations_df = pd.read_csv(stations_csv_path, dtype={'stop_id': str, 'renfe_id': str})

df['origin'] = df['origin'].apply(get_renfe_station_id, args=(stations_df,))
df['destination'] = df['destination'].apply(get_renfe_station_id, args=(stations_df,))
df['arrival_time'] = df['arrival_time'].apply(datetime_to_seconds)

print(df.head())

  user_pattern  origin destination  arrival_time
0     business  madrid    zaragoza         31140
1     business  madrid    zaragoza         36600
2     business  madrid    zaragoza         27540
3     business  madrid    zaragoza         26760
4     business  madrid    zaragoza         35220


In [7]:
def seconds_to_datetime(seconds: float) -> datetime.datetime:
    """
    Returns a datetime object given the time in seconds.

    Args:
        seconds (float): Time in seconds.

    Returns:
        datetime.datetime: Datetime object.
    """
    return datetime.datetime.min + datetime.timedelta(seconds=seconds)

df['arrival_time'] = df['arrival_time'].apply(seconds_to_datetime)

print(df.head())

  user_pattern  origin destination         arrival_time
0     business  madrid    zaragoza  0001-01-01 08:39:00
1     business  madrid    zaragoza  0001-01-01 10:10:00
2     business  madrid    zaragoza  0001-01-01 07:39:00
3     business  madrid    zaragoza  0001-01-01 07:26:00
4     business  madrid    zaragoza  0001-01-01 09:47:00


## Define Autoencoder Model

## NOT FUNCTIONAL YET!

## Import pre-trained GloVe embeddings

In [6]:
# Get bag of words
words_set = set(df[['user_pattern', 'origin', 'destination']].values.flatten())

print(words_set)

{'barcelona', 'madrid', 'student', 'zaragoza', 'business'}


In [61]:
# Import GloVe embeddings 50D
import os
import numpy as np

word_index = {word: index for index, word in enumerate(words_set)}

embeddings_index = {}
f = open(os.path.join('../data/pretrained/glove6B/glove.6B.50d.txt'))

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype=np.float32)
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

EMBEDDING_DIM = 50

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    print(word, embedding_vector)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.
student [-1.0729    0.94103   0.084904 -1.0766    0.42866   0.099877 -0.51081
 -0.24961  -0.30883   0.19553   0.1965   -0.73152   0.096916 -0.062686
  0.12078  -0.72384  -0.382     0.6934    0.32956   0.40244   0.53485
  0.91781  -0.44553   0.71804  -0.13635  -1.6906    0.15818  -1.2367
 -1.2278   -0.058566  2.7544    0.18672  -0.263    -1.2792    0.16992
  0.40748   0.12248   0.11211   0.78318   0.036392 -0.40808  -0.058474
 -0.27932   0.33035   0.52384  -1.0487    0.27565   0.0363    0.048604
  0.28239 ]
madrid [ 1.3315     0.72181   -0.060088   0.43948    0.18419   -1.5083
 -0.48125    0.46037   -1.4088     1.2701     0.68031   -0.59232
 -1.6325    -0.30376    0.87685   -0.75531   -0.37583   -0.5363
 -1.0669     0.45537   -0.66694    0.43001   -0.69525    0.67518
 -0.93783   -0.67933    1.1104     0.37576   -0.36894   -0.083185
  2.0346     0.96286   -0.56629   -0.7787    -0.10705   -0.14102
  0.07384    0.62338    0.20366    0.0076751  0.71088    0.01501


In [62]:
# Keras Time2Vec custom layer - From: https://towardsdatascience.com/time2vec-for-time-series-features-encoding-a03a4f3f937e

"""module for implementing time to vector encoding"""

import tensorflow as tf
from keras.layers import Layer

class Time2Vec(Layer):
    """Time to vector encoding layer."""

    def __init__(self, kernel: int = 64, activation: str = "sin") -> None:
        """
        Args:
            kernel (int, optional): length of time vector representation. Defaults to 64
            activation (str, optional): periodic activation for time encoding. Defaults to "sin".
        Raises:
            NotImplementedError: Non-supported activations
        """

        # periodic components
        if activation in ["sin", "cos"]:
            activation = {"sin": tf.math.sin, "cos": tf.math.cos}[activation]
        else:
            raise NotImplementedError(
                f"'{activation}' is an unsupported periodic activation."
            )

        super().__init__(trainable=True, name="Time2VecLayer_" + activation.__name__)

        self.k = kernel - 1
        self.p_activation = activation

    def build(self, input_shape: tuple) -> None:
        """method for building and initializing the weights for the tensor operations
        Args:
            input_shape (tuple): shape of the incoming tensor
        """
        # Linear component
        self.w_b = self.add_weight(
            shape=(1, input_shape[1], 1), initializer="uniform", trainable=True
        )

        self.b_b = self.add_weight(
            shape=(1, input_shape[1], 1), initializer="uniform", trainable=True
        )

        # Periodic components
        self.freq = self.add_weight(
            shape=(1, input_shape[1], self.k), initializer="uniform", trainable=True
        )

        self.phase = self.add_weight(
            shape=(1, input_shape[1], self.k), initializer="uniform", trainable=True
        )

        super().build(input_shape)

    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:

        """method to perform the layer operation
        Args:
            inputs (tf.Tensor): shape = (batch_size, feature_size)
        Returns:
            tf.Tensor: shape = (batch_size, feature_size, length of time vector representation)
        """

        inputs = tf.expand_dims(inputs, axis=-1)

        # Linear components
        lin = (
            # Multiply each time dimension with the corresponding linear time component
            tf.multiply(inputs, self.w_b)
            # Bias component for each time dimension
            + self.b_b
        )

        # Periodic components
        # Multiply each time dimension (M, D, H, mins, etc.) with the corresponding frequency vector
        per = tf.multiply(tf.tile(inputs, multiples=[1, 1, self.k]), self.freq)
        # Phase vector for each time dimension
        per = self.p_activation(per + self.phase)
        return tf.concat([lin, per], -1)

    def compute_output_shape(self, input_shape: tuple) -> tuple:
        """computes the shape of output tensor
        Args:
            input_shape (tuple): shape of incoming tensor
        Returns:
            tuple: shape of outgoing tensor
        """
        return input_shape[0], input_shape[1], self.k + 1


In [63]:
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Concatenate
from keras.layers import Flatten, Dropout

vocab_size = len(words_set)

# Encoder inputs
input_words = Input(shape=(3,), name='input_words')
input_date = Input(shape=(1,), name='input_date')

words_embedding = Embedding(input_dim= vocab_size + 1, # +1?
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            name='words_embedding', trainable=True
                            )(input_words)

time2vec = Time2Vec(1)(input_date)

flat_words = Flatten()(words_embedding)
flat_dates = Flatten()(time2vec)

x = Concatenate()([flat_words, flat_dates])

hidden = Dense(32, activation='relu', name='hidden')(x)

output_words = Dense(3, activation='softmax', name='output_words')(hidden)
output_date = Dense(1, activation='linear', name='output_date')(hidden)

model = Model(inputs=[input_words, input_date], outputs=[output_words, output_date])

# Compilar el modelo
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
input = [df[['user_pattern', 'origin', 'destination']].values, df[['arrival_time']].values]

output = [df[['user_pattern', 'origin', 'destination']].values, df[['arrival_time']].values]

print(input)

model.fit(input, output, epochs=10, batch_size=32, verbose=1)