# Transform Demand data, so it can be used by a Neural Network

In [27]:
import datetime
import pandas as pd
import numpy as np

from typing import Tuple

## Generate Demand Data and Passenger objects

In [28]:
from src.robin.demand.entities import Demand, Passenger

path_config_demand = '../configs/test_case/demand_data.yml'

demand = Demand.from_yaml(path_config_demand)

passengers = demand.generate_passengers()

## Get passenger relevant information and save it in a dataframe

In [43]:
def get_passenger_info(passenger: Passenger) -> Tuple[str, str, str, datetime.datetime, float]:
    """
    Get the information of a passenger and return it as a tuple

    Args:
        passenger: Passenger object

    Returns:
        Tuple[str, str, str, datetime.date, float]: Tuple with the information of the passenger
    """
    user_pattern = passenger.user_pattern.name
    origin, destination = passenger.market.departure_station, passenger.market.arrival_station
    arrival_day = datetime.datetime.combine(passenger.arrival_day.date, datetime.datetime.min.time())
    arrival_time = np.asarray(passenger.arrival_time).astype(np.float32)
    return user_pattern.lower(), origin, destination, arrival_day, arrival_time

# Map a list of Passenger objects to a list of tuples with the passenger information
passengers_info = list(map(get_passenger_info, passengers))

# Dataframe with the passenger information
df = pd.DataFrame(passengers_info, columns=['user_pattern', 'origin', 'destination', 'arrival_day', 'arrival_time'])

def elapsed_days(date: datetime.datetime) -> np.ndarray:
    """
    Returns the number of days elapsed since the first day of the year.

    Args:
        date (datetime.datetime): Datetime object.

    Returns:
        int: Number of days elapsed since the first day of the year.
    """
    reference_date = datetime.datetime(1970, 1, 1)
    return np.asarray((date - reference_date).days * 1.0).astype(np.float32)


df['arrival_day'] = df['arrival_day'].apply(elapsed_days)
print(df.head())

  user_pattern origin destination  arrival_day arrival_time
0     business  60000       04040      19509.0    7.5334277
1     business  60000       04040      19509.0      8.24437
2     business  60000       04040      19509.0      9.97972
3     business  60000       04040      19509.0      9.56266
4     business  60000       04040      19509.0     8.047462


## Map stations IDs to Stations names

In [44]:
stations_csv_path = f'../data/renfe/renfe_stations.csv'

def get_renfe_station_id(adif_id: str, stations_df: pd.DataFrame) -> str:
    """
    Returns the Station name given the Adif station id.

    Args:
        adif_id (str): Adif station id.
        stations_df (pd.DataFrame): Dataframe with the stations' information.

    Returns:
        str: Station name.
    """
    station_name = stations_df[stations_df['stop_id'] == adif_id]['stop_name'].values[0]
    station_name = station_name.replace("-", " ").split(" ")[0].lower()
    return station_name

stations_df = pd.read_csv(stations_csv_path, dtype={'stop_id': str, 'renfe_id': str})

df['origin'] = df['origin'].apply(get_renfe_station_id, args=(stations_df,))
df['destination'] = df['destination'].apply(get_renfe_station_id, args=(stations_df,))

print(df.head())

  user_pattern  origin destination  arrival_day arrival_time
0     business  madrid    zaragoza      19509.0    7.5334277
1     business  madrid    zaragoza      19509.0      8.24437
2     business  madrid    zaragoza      19509.0      9.97972
3     business  madrid    zaragoza      19509.0      9.56266
4     business  madrid    zaragoza      19509.0     8.047462


## Define Autoencoder Model

## NOT FUNCTIONAL YET!

## Import pre-trained GloVe embeddings

In [45]:
# Get bag of words
words_set = set(df[['user_pattern', 'origin', 'destination']].values.flatten())

print(words_set)

{'student', 'barcelona', 'madrid', 'zaragoza', 'business'}


In [46]:
# Import GloVe embeddings 50D
import os
import numpy as np

word_index = {word: index for index, word in enumerate(words_set)}

embeddings_index = {}
f = open(os.path.join('../data/pretrained/glove6B/glove.6B.50d.txt'))

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype=np.float32)
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

EMBEDDING_DIM = 50

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    print(word, embedding_vector)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.
student [-1.0729    0.94103   0.084904 -1.0766    0.42866   0.099877 -0.51081
 -0.24961  -0.30883   0.19553   0.1965   -0.73152   0.096916 -0.062686
  0.12078  -0.72384  -0.382     0.6934    0.32956   0.40244   0.53485
  0.91781  -0.44553   0.71804  -0.13635  -1.6906    0.15818  -1.2367
 -1.2278   -0.058566  2.7544    0.18672  -0.263    -1.2792    0.16992
  0.40748   0.12248   0.11211   0.78318   0.036392 -0.40808  -0.058474
 -0.27932   0.33035   0.52384  -1.0487    0.27565   0.0363    0.048604
  0.28239 ]
barcelona [ 0.68944   1.2217   -0.23655   0.36109  -0.62116  -1.0075   -0.52565
  0.65766  -1.2764    1.1286    1.1386   -0.36088  -1.3849   -0.58442
  0.9772   -0.35103   0.29237  -0.27426  -1.3109   -0.015967 -1.0695
  0.11901  -0.56335   0.49648  -0.44571  -0.47566   0.79045   0.42923
 -0.76743  -0.14029   1.7552    1.3342   -0.42864  -0.29125  -0.2056
  0.21055   0.099324  1.4187    0.34068  -0.4477    1.0795    0.10387
  0.15772  -0.51013  -0.50933   0

In [104]:
from keras.models import Model
from keras.layers import Input, Embedding, Dense

vocab_size = len(words_set)

input_words = Input(shape=(3,), name='input_words')

words_embedding = Embedding(input_dim= vocab_size + 1, # +1?
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            name='words_embedding', trainable=True
                            )(input_words)

hidden = Dense(16, activation='relu', name='hidden')(words_embedding)
latent = Dense(8, activation='relu', name='latent')(hidden)

decoded = Dense(16, activation='relu', name='decoded')(latent)
output_words = Dense(3, activation='softmax', name='output_words')(decoded)

model = Model(inputs=[input_words], outputs=[output_words])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_words (InputLayer)    [(None, 3)]               0         
                                                                 
 words_embedding (Embedding)  (None, 3, 50)            300       
                                                                 
 hidden (Dense)              (None, 3, 16)             816       
                                                                 
 latent (Dense)              (None, 3, 8)              136       
                                                                 
 decoded (Dense)             (None, 3, 16)             144       
                                                                 
 output_words (Dense)        (None, 3, 3)              51        
                                                                 
Total params: 1,447
Trainable params: 1,447
Non-trainable 

In [105]:
input = df[['user_pattern', 'origin', 'destination']].values.tolist()

In [106]:
model.fit(input, input, epochs=10, batch_size=1, verbose=1)

Epoch 1/10


ValueError: in user code:

    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/engine/training.py", line 890, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/engine/training.py", line 948, in compute_loss
        return self.compiled_loss(
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/engine/compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/losses.py", line 139, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/losses.py", line 243, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/losses.py", line 1930, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "/Users/david/opt/anaconda3/envs/robin/lib/python3.8/site-packages/keras/backend.py", line 5283, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)

    ValueError: `logits` and `labels` must have the same shape, received ((1, 3, 3) vs (1, 3)).
