# Transform demand data so it can be used by a neural network


In [12]:
import datetime
import pandas as pd
import nltk
import numpy as np
import random

from nltk.corpus import wordnet
from sklearn.preprocessing import MinMaxScaler
from scipy import spatial
from typing import Tuple

# Length of every vector handled in this notebook: the GloVe word vectors are
# 50-dimensional and the date vector is zero-padded to the same length
EMBEDDING_DIM = 50

In [4]:
# Fetch the WordNet corpus queried through `nltk.corpus.wordnet`;
# nltk reports the package as up-to-date when it is already present
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/uclm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from src.robin.supply.entities import Supply

path_config_supply = '../configs/test_case/supply_data.yml'
supply = Supply.from_yaml(path_config_supply)

# Collect every corridor once, in the order the services mention them
corridors = []
for service in supply.services:
    if service.line.corridor in corridors:
        continue
    corridors.append(service.line.corridor)

# Collect every path once, in the order the corridors list them
paths = []
for corridor in corridors:
    for path in corridor.paths:
        if path in paths:
            continue
        paths.append(path)

# Turn paths of Station objects into paths of station names: keep only the
# first word of each name (hyphens count as spaces), in lower case
paths = [
    [station.name.replace("-", " ").split(" ")[0].lower() for station in path]
    for path in paths
]
print(paths)

[['madrid', 'guadalajara', 'calatayud', 'zaragoza', 'lleida', 'tarragona', 'barcelona', 'girona', 'figueres']]


In [8]:
def get_word_syn(word: str) -> str:
    """
    Pick a random lemma from the first WordNet synset of a word

    Args:
        word (str): Word to look up in WordNet

    Returns:
        str: Lower-cased name of a randomly chosen lemma of the word's first synset,
            or the word itself, unchanged, when WordNet has no synset or lemma for it
    """
    synsets = wordnet.synsets(word)
    if not synsets:
        return word

    # Only the first synset is considered
    lemmas = synsets[0].lemmas()
    if not lemmas:
        return word

    return random.choice(lemmas).name().lower()

def get_random_pair(paths: list[list]) -> Tuple[str, str]:
    """
    Draw an origin and a later destination from a randomly chosen path

    Args:
        paths (list[list]): Paths, each one an ordered list of stations

    Returns:
        Tuple[str, str]: Origin station and a destination station that comes
            after it in the chosen path
    """
    stations = random.choice(paths)
    last_index = len(stations) - 1

    # The origin is never the last station, so a later station always exists
    origin_index = random.randint(0, last_index - 1)
    # The destination is any station strictly after the origin
    destination_index = random.randint(origin_index + 1, last_index)

    return stations[origin_index], stations[destination_index]

In [19]:
# Location of the Renfe stations table (plain literal: there is nothing to interpolate)
stations_csv_path = '../data/renfe/renfe_stations.csv'

def get_renfe_station_id(adif_id: str, stations_df: pd.DataFrame) -> str:
    """
    Look up a station by its Adif id and return its short, lower-cased name.

    Args:
        adif_id (str): Adif station id, compared against the 'stop_id' column.
        stations_df (pd.DataFrame): Stations table with 'stop_id' and 'stop_name' columns.

    Returns:
        str: First word of the first matching 'stop_name' (hyphens count as
            spaces), in lower case.
    """
    matching_names = stations_df.loc[stations_df['stop_id'] == adif_id, 'stop_name']
    full_name = matching_names.values[0]
    first_word = full_name.replace("-", " ").split(" ")[0]
    return first_word.lower()

# Load the stations table, reading both id columns as strings
stations_df = pd.read_csv(
    stations_csv_path,
    dtype={'stop_id': str, 'renfe_id': str},
)

In [67]:
n_passengers = 1000

# Draw one (origin, destination) pair per passenger
pairs = [get_random_pair(paths) for _ in range(n_passengers)]

df = pd.DataFrame(pairs, columns=['origin', 'destination'])

user_patterns = ("business", "student", "tourist")

df['user_pattern'] = [random.choice(user_patterns) for _ in range(n_passengers)]


def random_timedelta() -> datetime.timedelta:
    """
    Draw a random offset of up to ten years, at minute resolution

    Returns:
        datetime.timedelta: Offset of 0-3650 days, 0-23 hours and 0-59 minutes
    """
    # random.randint includes both endpoints, so the upper bounds are 23 and 59:
    # 24 hours or 60 minutes would roll over into the next day/hour and make
    # hour 0 and minute 0 more likely than the rest
    return datetime.timedelta(
        days=random.randint(0, 365 * 10),
        hours=random.randint(0, 23),
        minutes=random.randint(0, 59),
    )


# Arrival dates are random offsets from 2021-01-01 00:00
df['arrival_date'] = [datetime.datetime(2021, 1, 1) + random_timedelta() for _ in range(n_passengers)]
print(df.head())

# Encode each arrival date as [year, month, day, hour, minute], zero-padded to EMBEDDING_DIM values
date_padding = [0.0] * (EMBEDDING_DIM - 5)
df_embedding = pd.DataFrame()
df_embedding['arrival_date'] = df['arrival_date'].apply(
    lambda date: np.asarray(
        [date.year, date.month, date.day, date.hour, date.minute] + date_padding,
        dtype=np.float32,
    )
)

# Min-max scale the date vectors column by column; the fitted scaler is kept
# so that scaled values can be converted back later
scaler = MinMaxScaler()
date_matrix = np.array(df_embedding['arrival_date'].tolist())
scaled_data = scaler.fit_transform(date_matrix)
df_embedding['scaled_date'] = scaled_data.tolist()
print(df_embedding.head())

   origin destination user_pattern        arrival_date
0  madrid      girona     business 2022-07-27 15:50:00
1  lleida   barcelona      tourist 2030-12-18 00:41:00
2  madrid   calatayud     business 2027-11-09 08:27:00
3  madrid      lleida      student 2029-12-29 07:58:00
4  lleida   barcelona     business 2030-06-05 16:27:00
                                        arrival_date  \
0  [2022.0, 7.0, 27.0, 15.0, 50.0, 0.0, 0.0, 0.0,...   
1  [2030.0, 12.0, 18.0, 0.0, 41.0, 0.0, 0.0, 0.0,...   
2  [2027.0, 11.0, 9.0, 8.0, 27.0, 0.0, 0.0, 0.0, ...   
3  [2029.0, 12.0, 29.0, 7.0, 58.0, 0.0, 0.0, 0.0,...   
4  [2030.0, 6.0, 5.0, 16.0, 27.0, 0.0, 0.0, 0.0, ...   

                                         scaled_date  
0  [0.111114501953125, 0.5454545021057129, 0.8666...  
1  [1.0, 1.0, 0.5666666626930237, 0.0, 0.69491523...  
2  [0.6666717529296875, 0.9090908765792847, 0.266...  
3  [0.888885498046875, 1.0, 0.9333333373069763, 0...  
4  [1.0, 0.4545454680919647, 0.13333334028720856,...  


In [68]:
# Vocabulary: every distinct value found in the station and user pattern columns
word_columns = ['origin', 'destination', 'user_pattern']
words_set = set(df[word_columns].to_numpy().ravel())

print(words_set)

{'student', 'zaragoza', 'guadalajara', 'calatayud', 'business', 'madrid', 'barcelona', 'figueres', 'tarragona', 'girona', 'lleida', 'tourist'}


In [69]:
# Import GloVe embeddings 50D
import os
import numpy as np

# Give every vocabulary word a row index for the embedding matrix
word_index = {word: index for index, word in enumerate(words_set)}

# Parse the pre-trained GloVe file: each line is a token followed by the
# components of its vector
embeddings_index = {}
# The context manager closes the file even if parsing a line fails, and the
# explicit encoding avoids depending on the platform's default text encoding
with open(os.path.join('../data/pretrained/glove6B/glove.6B.50d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype=np.float32)
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

EMBEDDING_DIM = 50

# One row per vocabulary index plus one extra row; the rows of words that have
# no GloVe vector stay all-zeros
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
# Only the words that do have a GloVe vector end up in this dictionary
embedding_dict = {}
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    print(token, vector)
    if vector is None:
        continue
    embedding_matrix[row] = vector
    embedding_dict[token] = vector

Found 400000 word vectors.
student [-1.0729    0.94103   0.084904 -1.0766    0.42866   0.099877 -0.51081
 -0.24961  -0.30883   0.19553   0.1965   -0.73152   0.096916 -0.062686
  0.12078  -0.72384  -0.382     0.6934    0.32956   0.40244   0.53485
  0.91781  -0.44553   0.71804  -0.13635  -1.6906    0.15818  -1.2367
 -1.2278   -0.058566  2.7544    0.18672  -0.263    -1.2792    0.16992
  0.40748   0.12248   0.11211   0.78318   0.036392 -0.40808  -0.058474
 -0.27932   0.33035   0.52384  -1.0487    0.27565   0.0363    0.048604
  0.28239 ]
zaragoza [ 1.0642    0.089939 -0.28715   0.82471   0.31063  -1.4798    0.12028
  0.7449   -1.2919    0.39737   0.4715   -0.53483  -0.48049  -1.2998
  0.32826  -1.3085   -0.67916  -0.20625  -0.7232    0.41638  -1.2895
 -0.69963  -0.23631   0.70175  -0.61498   0.20193   1.329     0.25294
 -0.093715 -0.16535   0.99605   1.2007   -0.26729  -0.42035  -0.15881
  0.63906  -0.73977   1.3119    0.61136  -0.59197   1.3032    0.16094
 -0.027686 -0.9412   -0.68288   0.

In [76]:
from sklearn.model_selection import train_test_split
from copy import deepcopy

df_embedding['origin'] = df['origin']
df_embedding['destination'] = df['destination']
df_embedding['user_pattern'] = df['user_pattern']
df_embedding = df_embedding[['origin', 'destination', 'user_pattern', 'scaled_date']]

# Each row is [user_pattern, origin, destination, scaled_date]; it is kept as
# readable words so it can be printed next to the model's reconstruction
input_data = df_embedding[['user_pattern', 'origin', 'destination', 'scaled_date']].values.tolist()

# Words without a GloVe vector are absent from embedding_dict: encode them as
# all-zeros, like their rows in embedding_matrix, instead of handing None to
# np.asarray (which cannot build a float array from it)
zero_vector = np.zeros(EMBEDDING_DIM, dtype=np.float32)

# Replace the three words of every row by their vectors and stack them with
# the date vector; input_data itself is left untouched, so no copy is needed
input_vectors = np.array([
    np.asarray(
        [embedding_dict.get(word, zero_vector) for word in row[:3]] + [row[3]],
        dtype=np.float32,
    )
    for row in input_data
])

print(input_vectors.shape)
print(input_vectors[0])
#X_train, X_test, _, _ = train_test_split(df_embedding, df_embedding, test_size=0.3, random_state=42)

(1000, 4, 50)
[[ 0.023693    0.13316     0.023131    0.49833     0.026874   -0.43252
  -1.1364     -0.82001     0.22388    -0.032119   -0.069651    0.39857
  -0.58275     0.095008   -0.023643    0.23237    -0.42441     0.65709
   0.57802    -0.51602     1.8253      0.12951    -0.61773     0.39281
  -0.35754    -1.6778     -0.45201    -0.47075     0.19487     0.35828
   3.6034      0.32865     0.47288    -0.33787    -0.46234    -0.51628
  -1.3755      0.70789     0.4648     -0.16186    -0.0961     -0.28523
   0.30047     0.50902     0.081356   -0.015639   -0.51021     0.34585
   0.24201     0.82237   ]
 [ 1.3315      0.72181    -0.060088    0.43948     0.18419    -1.5083
  -0.48125     0.46037    -1.4088      1.2701      0.68031    -0.59232
  -1.6325     -0.30376     0.87685    -0.75531    -0.37583    -0.5363
  -1.0669      0.45537    -0.66694     0.43001    -0.69525     0.67518
  -0.93783    -0.67933     1.1104      0.37576    -0.36894    -0.083185
   2.0346      0.96286    -0.56629   

## Embedding Model

In [77]:
from keras.models import Sequential
from keras.layers import Input, Embedding, Flatten

# Number of distinct words; the embedding layer gets one extra row, matching
# the shape of embedding_matrix
vocab_size = len(words_set)

# Model that maps 3 word indices to their vectors; the embedding layer starts
# from embedding_matrix and remains trainable
embedding_model = Sequential([
    Input(shape=(3,)),
    Embedding(
        input_dim=vocab_size + 1,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        name='words_embedding',
        trainable=True,
    ),
])

embedding_model.compile(optimizer='adam', loss='mse')
embedding_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 words_embedding (Embedding)  (None, 3, 50)            650       
                                                                 
Total params: 650
Trainable params: 650
Non-trainable params: 0
_________________________________________________________________


## Autoencoder model

In [78]:
import tensorflow as tf

from keras.layers import Input, GRU, Dense, RepeatVector, TimeDistributed
from keras.models import Model

# Every sample is a sequence of 4 vectors of EMBEDDING_DIM values, and the
# autoencoder outputs the same shape
input_shape = (4, EMBEDDING_DIM)
output_shape = (4, EMBEDDING_DIM)

# Size of the bottleneck vector produced by the encoder
latent_dim = 4

inputs = Input(shape=input_shape)

# Encoder: two stacked GRUs; the first returns the whole sequence, the second
# only a single 32-value output
encoder = GRU(64, return_sequences=True)(inputs)
encoder = GRU(32)(encoder)

# Project the encoder output down to the latent vector
latent = Dense(latent_dim)(encoder)

# Decoder: defined on its own Input so it also exists as a separate model; the
# latent vector is repeated once per step of the sequence, passed through two
# GRUs and mapped back to output_shape[1] values at every step
decoder_inputs = Input(shape=(latent_dim,))
decoder = RepeatVector(input_shape[0])(decoder_inputs)
decoder = GRU(32, return_sequences=True)(decoder)
decoder = GRU(64, return_sequences=True)(decoder)
decoder_outputs = TimeDistributed(Dense(output_shape[1]))(decoder)

encoder_model = Model(inputs, latent)
decoder_model = Model(decoder_inputs, decoder_outputs)

# Full autoencoder: the decoder model applied to the encoder's latent output
model = Model(inputs, decoder_model(latent))

model.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.MeanSquaredError()])
model.summary()

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 4, 50)]           0         
                                                                 
 gru_16 (GRU)                (None, 4, 64)             22272     
                                                                 
 gru_17 (GRU)                (None, 32)                9408      
                                                                 
 dense_8 (Dense)             (None, 4)                 132       
                                                                 
 model_13 (Functional)       (None, 4, 50)             25714     
                                                                 
Total params: 57,526
Trainable params: 57,526
Non-trainable params: 0
_________________________________________________________________


In [79]:
# Train the autoencoder to reproduce its own input
history = model.fit(
    x=input_vectors,
    y=input_vectors,
    epochs=10,
    batch_size=32,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Make predictions

In [13]:
def find_closest_embeddings(embedding) -> str:
    """
    Find the known word whose embedding is nearest to a vector

    Args:
        embedding: Vector to decode, compared against every vector in `embedding_dict`

    Returns:
        str: Key of `embedding_dict` with the smallest Euclidean distance to `embedding`;
            on ties, the key that comes first in the dictionary

    Raises:
        IndexError: If `embedding_dict` is empty
    """
    if not embedding_dict:
        # IndexError is what indexing the empty result of sorted() would raise,
        # so callers handling that case keep working
        raise IndexError('embedding_dict is empty: there is no word to compare against')

    # Only the nearest word is needed, so one pass with min() replaces a full sort
    return min(
        embedding_dict,
        key=lambda word: spatial.distance.euclidean(embedding_dict[word], embedding),
    )

In [37]:
# Pick one passenger at random and show its readable form
random_index = np.random.randint(0, len(input_vectors))
random_vector = input_vectors[random_index]

print("Input - Random passenger data: ")
print(input_data[random_index])

# The model expects a batch, so wrap the single sample and unwrap the result
prediction = model.predict(np.array([random_vector]))[0]

print("Output - Passenger reconstruction: ")
# The first three vectors are word embeddings: decode each to its nearest known word
decoded_prediction = [find_closest_embeddings(word_vector) for word_vector in prediction[:3]]
print(decoded_prediction)
# The scaler was fitted on date vectors padded to EMBEDDING_DIM values, so the
# whole reconstructed vector has to be inverted; only afterwards are the five
# date fields (year, month, day, hour, minute) kept
decode_date = scaler.inverse_transform(prediction[-1].reshape(1, -1))[:, :5]
# Round to the nearest integer: int() alone would truncate 2022.97 to 2022
print([int(round(v)) for v in decode_date[0]])



Input - Random passenger data: 
['business', 'madrid', 'zaragoza', array([0.        , 0.        , 0.        , 0.30434783, 0.50847458,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])]
Output - Passenger reconstruction: 
['business', 'madrid', 'zaragoza']
[2023, 6, 1, 7, 29]
