# Natural language processing. Token classification

For this work, we need to creates an AI capable of detecting in a French sentence a depature city and an arrival city.

"Comment aller de [ville A] a [ville B]"
"Depuis [ville B], comment aller a [ville A]"

Manipulate texts is called a Natural Language Processing problem.





# Data preparations

## Clean data

In [9]:
import pandas as pd

df = pd.read_csv('./default_dataset.csv', delimiter=';', encoding='utf-8')

df

Unnamed: 0,text,to,from,moment
0,Depuis La chaize-le-vicomte à La roche-sur-foron,La roche-sur-foron,La chaize-le-vicomte,
1,Itiniréraire jusqu'a Giromagny depuis Quimper,Giromagny,Quimper,
2,Comment aller de Boigneville à Longjumeau mardi,Longjumeau,Boigneville,mardi
3,Je suis actuellement à Villers-sur-mer et j’ai...,Ferrières-en-bray,Villers-sur-mer,
4,Donne moi l'itinéraire pour aller à Fontenay-l...,Fontenay-le-fleury,Lizy-sur-ourcq,
...,...,...,...,...
1995,train Frontenex Domène,Domène,Frontenex,
1996,Quel est l'itinéraire entre Castres et Étainhus,Étainhus,Castres,
1997,Je pars de Laignes pour aller à Osséja,Osséja,Laignes,
1998,Je pars dimanche matin de Lapeyrouse pour alle...,Colombes,Lapeyrouse,dimanche matin


In [10]:
cities_df = pd.read_csv("./data/cities.csv", delimiter=",", encoding="utf-8")
cities = list(cities_df['label'])
cities[:10]

['ville du pont',
 'villers grelot',
 'villars les blamont',
 'les villedieu',
 'villers buzon',
 'villers la combe',
 'villers sous chalamont',
 'voujeaucourt',
 'bouconville vauclair',
 'bouresches']

In [11]:
cities.index('domene')

7758

In [12]:
df = df.reindex(columns=['text', 'from', 'to', 'moment'])
df.head()

Unnamed: 0,text,from,to,moment
0,Depuis La chaize-le-vicomte à La roche-sur-foron,La chaize-le-vicomte,La roche-sur-foron,
1,Itiniréraire jusqu'a Giromagny depuis Quimper,Quimper,Giromagny,
2,Comment aller de Boigneville à Longjumeau mardi,Boigneville,Longjumeau,mardi
3,Je suis actuellement à Villers-sur-mer et j’ai...,Villers-sur-mer,Ferrières-en-bray,
4,Donne moi l'itinéraire pour aller à Fontenay-l...,Lizy-sur-ourcq,Fontenay-le-fleury,


In [13]:
import re

# Standardize capitalization
def capitalize_cities(name):
    return ' '.join(word.capitalize() for word in name.split())

# if word.lower() not in ['de', 'la', 'le', 'sur', 'du', 'des', 'et'] else word

# Manage (le ) Havre...
def correct_city_name(city):
    # Get all contents inside brackets
    match = re.search(r"\((.*?)\)\s*$", city)
    if match:
        content = match.group(1)  # Content separeted from bracket
        city = city.replace(match.group(0), '').strip()  # Delete brackets
        # Manage apostrophe
        if content.endswith("'"):
            city = f"{content.capitalize()}{city}"  # no space
        else:
            city = f"{content.capitalize()} {city}"  # with space
    return city

df['from_corrected'] = df['from'].apply(correct_city_name).apply(capitalize_cities)
df['to_corrected'] = df['to'].apply(correct_city_name).apply(capitalize_cities)

#correct_city_name('Pavillons-sous-bois (les)')

In [14]:
def update_text(row):
    text = row['text']
    from_city = row['from']
    to_city = row['to']
    from_city_capitalized = row['from_corrected']
    to_city_capitalized = row['to_corrected']

    # Replaces city names to their capitalized version
    text = text.replace(from_city, from_city_capitalized).replace(to_city, to_city_capitalized)
    return text

# Apply the function to each rows
df['text'] = df.apply(update_text, axis=1)

# capitalize each text
df['text'].str.capitalize()

0        Depuis la chaize-le-vicomte à la roche-sur-foron
1           Itiniréraire jusqu'a giromagny depuis quimper
2         Comment aller de boigneville à longjumeau mardi
3       Je suis actuellement à villers-sur-mer et j’ai...
4       Donne moi l'itinéraire pour aller à fontenay-l...
                              ...                        
1995                               Train frontenex domène
1996      Quel est l'itinéraire entre castres et étainhus
1997               Je pars de laignes pour aller à osséja
1998    Je pars dimanche matin de lapeyrouse pour alle...
1999    J'aimerais me rendre sur chevrières en partant...
Name: text, Length: 2000, dtype: object

In [15]:
df.head()

Unnamed: 0,text,from,to,moment,from_corrected,to_corrected
0,Depuis La Chaize-le-vicomte à La Roche-sur-foron,La chaize-le-vicomte,La roche-sur-foron,,La Chaize-le-vicomte,La Roche-sur-foron
1,Itiniréraire jusqu'a Giromagny depuis Quimper,Quimper,Giromagny,,Quimper,Giromagny
2,Comment aller de Boigneville à Longjumeau mardi,Boigneville,Longjumeau,mardi,Boigneville,Longjumeau
3,Je suis actuellement à Villers-sur-mer et j’ai...,Villers-sur-mer,Ferrières-en-bray,,Villers-sur-mer,Ferrières-en-bray
4,Donne moi l'itinéraire pour aller à Fontenay-l...,Lizy-sur-ourcq,Fontenay-le-fleury,,Lizy-sur-ourcq,Fontenay-le-fleury


In [16]:
# Select lines which contains brackets
df[df['text'].str.contains(r'\(.*\)', na=False)]

Unnamed: 0,text,from,to,moment,from_corrected,to_corrected


In [17]:
df[df['text'].str.contains("Aigle", na=False)]

Unnamed: 0,text,from,to,moment,from_corrected,to_corrected


In [18]:
# remove unnecessarry columns
df = df.drop(['from', 'to', 'moment'], axis=1)

In [19]:
df = df.rename(columns={'from_corrected': 'from', 'to_corrected': 'to'})
df

Unnamed: 0,text,from,to
0,Depuis La Chaize-le-vicomte à La Roche-sur-foron,La Chaize-le-vicomte,La Roche-sur-foron
1,Itiniréraire jusqu'a Giromagny depuis Quimper,Quimper,Giromagny
2,Comment aller de Boigneville à Longjumeau mardi,Boigneville,Longjumeau
3,Je suis actuellement à Villers-sur-mer et j’ai...,Villers-sur-mer,Ferrières-en-bray
4,Donne moi l'itinéraire pour aller à Fontenay-l...,Lizy-sur-ourcq,Fontenay-le-fleury
...,...,...,...
1995,train Frontenex Domène,Frontenex,Domène
1996,Quel est l'itinéraire entre Castres et Étainhus,Castres,Étainhus
1997,Je pars de Laignes pour aller à Osséja,Laignes,Osséja
1998,Je pars dimanche matin de Lapeyrouse pour alle...,Lapeyrouse,Colombes


In [20]:
df.to_excel('cleaned_dataframe.xlsx', index=False)

## Labelized Data

This problem is a NER problem. Our model's goal is to classify each word to be a city **FROM** or **TO**.

Our cities contains multiple word like "La Chaize-le-vicomte" we need o process them.
We can process them using IOB tagging system.

In our case, it will be:
-  **B**: beginning of the chunk
- **I**: inside of the chunk
- **FROM**: city to departure
- **TO**: city to arrive
- **O**: Other

In [21]:
# create labels sequence using IOB system (for multi word city names)
def create_IOB_label_sequence(sentence, city_from, city_to):
    words = sentence.split()  
    labels = []  

    city_from_words = city_from.split()
    city_to_words = city_to.split()

    index = 0  
    while index < len(words):
        if ' '.join(words[index:index+len(city_from_words)]) == city_from:
            labels.append('B-FROM')
            labels.extend(['I-FROM'] * (len(city_from_words) - 1))
            index += len(city_from_words)
        elif ' '.join(words[index:index+len(city_to_words)]) == city_to:
            labels.append('B-TO')
            labels.extend(['I-TO'] * (len(city_to_words) - 1))
            index += len(city_to_words)
        else:
            labels.append('O')
            index += 1

    return labels

df['IOB_labels'] = df.apply(lambda row: create_IOB_label_sequence(row['text'], row['from'], row['to']), axis=1)
df

Unnamed: 0,text,from,to,IOB_labels
0,Depuis La Chaize-le-vicomte à La Roche-sur-foron,La Chaize-le-vicomte,La Roche-sur-foron,"[O, B-FROM, I-FROM, O, B-TO, I-TO]"
1,Itiniréraire jusqu'a Giromagny depuis Quimper,Quimper,Giromagny,"[O, O, B-TO, O, B-FROM]"
2,Comment aller de Boigneville à Longjumeau mardi,Boigneville,Longjumeau,"[O, O, O, B-FROM, O, B-TO, O]"
3,Je suis actuellement à Villers-sur-mer et j’ai...,Villers-sur-mer,Ferrières-en-bray,"[O, O, O, O, B-FROM, O, O, O, O, B-TO]"
4,Donne moi l'itinéraire pour aller à Fontenay-l...,Lizy-sur-ourcq,Fontenay-le-fleury,"[O, O, O, O, O, O, B-TO, O, O, O, B-FROM]"
...,...,...,...,...
1995,train Frontenex Domène,Frontenex,Domène,"[O, B-FROM, B-TO]"
1996,Quel est l'itinéraire entre Castres et Étainhus,Castres,Étainhus,"[O, O, O, O, B-FROM, O, B-TO]"
1997,Je pars de Laignes pour aller à Osséja,Laignes,Osséja,"[O, O, O, B-FROM, O, O, O, B-TO]"
1998,Je pars dimanche matin de Lapeyrouse pour alle...,Lapeyrouse,Colombes,"[O, O, O, O, O, B-FROM, O, O, O, B-TO]"


## Split data

In [22]:
X = df['text'].values
y = df['IOB_labels'].values

X[:10], y[:10]

(array(['Depuis La Chaize-le-vicomte à La Roche-sur-foron',
        "Itiniréraire jusqu'a Giromagny depuis Quimper",
        'Comment aller de Boigneville à Longjumeau mardi',
        'Je suis actuellement à Villers-sur-mer et j’aimerais partir à Ferrières-en-bray',
        "Donne moi l'itinéraire pour aller à Fontenay-le-fleury en partant de Lizy-sur-ourcq",
        'Comment aller de Viviers-du-lac à Aytré',
        'Je pars de Caen pour aller à Villeneuve-la-comtesse',
        "le départ et l'arrivée de mon voyage sont Sathonay-camp et Quillan",
        'Quel trajet pour Éveux Grandvilliers',
        'train Wizernes Sillé-le-guillaume'], dtype=object),
 array([list(['O', 'B-FROM', 'I-FROM', 'O', 'B-TO', 'I-TO']),
        list(['O', 'O', 'B-TO', 'O', 'B-FROM']),
        list(['O', 'O', 'O', 'B-FROM', 'O', 'B-TO', 'O']),
        list(['O', 'O', 'O', 'O', 'B-FROM', 'O', 'O', 'O', 'O', 'B-TO']),
        list(['O', 'O', 'O', 'O', 'O', 'O', 'B-TO', 'O', 'O', 'O', 'B-FROM']),
        list([

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.1,
                                                    random_state=42)

X_train[:5], y_train[:5]

(array(['Je prends le train Saint-mard Puy-guillaume',
        "j'aimerais partir de Reignier pour me rendre à Digoin",
        'trajet Vertou Leyment',
        'je viens de Pontorson et je veux aller à Loudun',
        'Je suis à Hatrize et je souhaite me rendre à Vonnas'],
       dtype=object),
 array([list(['O', 'O', 'O', 'O', 'B-FROM', 'B-TO']),
        list(['O', 'O', 'O', 'B-FROM', 'O', 'O', 'O', 'O', 'B-TO']),
        list(['O', 'B-FROM', 'B-TO']),
        list(['O', 'O', 'O', 'B-FROM', 'O', 'O', 'O', 'O', 'O', 'B-TO']),
        list(['O', 'O', 'O', 'B-FROM', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TO'])],
       dtype=object))

# Text vectorization (Tokenisation)



## Vectorizing X (Sentences)

In [24]:
from keras.layers import TextVectorization
import tensorflow as tf
import re
import string

max_vocab_length = 3000 # max number of words to have in our vocabulary 
max_length = 30 # max length our sequences will be

# Custom standardization function
def custom_standardization(input_text):
    # Remove punctuations, but preserve apostrophes
    return tf.strings.regex_replace(input_text, "[^a-zA-Z0-9À-ÖØ-öø-ÿ' ]", "")

text_vectorizer = TextVectorization(max_tokens=max_vocab_length, # max number of words to have in our vocabulary
                                    standardize=custom_standardization,
                                    split="whitespace",
                                    output_mode="int",
                                    output_sequence_length=max_length
                                    )

# Fit the text vectorizer to the training text
text_vectorizer.adapt(X_train)

In [25]:
# Find the longest sentence by word count
longest_sentence = df['text'].explode().apply(lambda x: len(x.split())).idxmax()
longest_sentence_text = df['text'].explode()[longest_sentence]

len(longest_sentence_text.split())

20

In [26]:
# Choose a random sentence from the training dataset and tokenize it
import random

random_sentence = random.choice(X_train)
print(f"Original text:\n {random_sentence} \
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
 je vais à Us depuis Égletons       

Vectorized version:


<tf.Tensor: shape=(1, 30), dtype=int64, numpy=
array([[ 10,  55,   2, 257,  11, 233,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]], dtype=int64)>

In [27]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary() # get all of the unique words in our training data
top_5_words = words_in_vocab[:5+2] # get most common words (with the 2 specials characters, O: padding token, UNK: unknown)
bottom_5_words = words_in_vocab[-5:] # get the least common words

print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"5 most common words: {top_5_words}")
print(f"5 least common words: {bottom_5_words}")

Number of words in vocab: 2206
5 most common words: ['', '[UNK]', 'à', 'de', 'Je', 'aller', 'pour']
5 least common words: ['Aillysurnoye', 'Aiguesmortes', 'Ahun', 'Agde', 'Acheres']


## Preparing Y (IOB Labels)

IOB labels should be converted also to numerical format.

In [28]:
import itertools

# flatten labels
y_flatten = list(itertools.chain.from_iterable(y))
y_flatten_tensor = tf.convert_to_tensor(y_flatten)
y_flatten_tensor

<tf.Tensor: shape=(15164,), dtype=string, numpy=
array([b'O', b'B-FROM', b'I-FROM', ..., b'O', b'O', b'B-FROM'],
      dtype=object)>

In [29]:
from keras.layers import StringLookup

label_lookup = StringLookup(output_mode='int',
                            max_tokens=5,
                            num_oov_indices=0)

# Fit the layer on training labels
label_lookup.adapt(y_flatten_tensor)

# Transform labels to integers
label_lookup.get_vocabulary()

['O', 'B-TO', 'B-FROM', 'I-TO', 'I-FROM']

In [30]:
y_train_transformed = [label_lookup(labels).numpy() for labels in y_train]
y_test_transformed = [label_lookup(labels).numpy() for labels in y_test]

y_train_transformed[:5], y_test_transformed[:5]

([array([0, 0, 0, 0, 2, 1], dtype=int64),
  array([0, 0, 0, 2, 0, 0, 0, 0, 1], dtype=int64),
  array([0, 2, 1], dtype=int64),
  array([0, 0, 0, 2, 0, 0, 0, 0, 0, 1], dtype=int64),
  array([0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1], dtype=int64)],
 [array([0, 0, 0, 2, 0, 1], dtype=int64),
  array([0, 0, 0, 0, 2, 0, 1], dtype=int64),
  array([0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1], dtype=int64),
  array([0, 0, 0, 0, 1, 0, 0], dtype=int64),
  array([0, 0, 0, 1, 0, 2], dtype=int64)])

In [31]:
# pad my y to be equals at X padded size
from keras.preprocessing.sequence import pad_sequences

# Define a padding value that's different from any actual IOB tags
PADDING_LABEL = label_lookup.vocabulary_size()  # 'num_tags' is the number of IOB tags you have

y_train_padded = pad_sequences(y_train_transformed, maxlen=max_length, padding='post', value=PADDING_LABEL)
y_test_padded = pad_sequences(y_test_transformed, maxlen=max_length, padding='post', value=PADDING_LABEL)

y_train_padded[:5], y_test_padded[:5]

(array([[0, 0, 0, 0, 2, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5],
        [0, 0, 0, 2, 0, 0, 0, 0, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5],
        [0, 2, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5],
        [0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5],
        [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5]]),
 array([[0, 0, 0, 2, 0, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5],
        [0, 0, 0, 0, 2, 0, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5],
        [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5],
        [0, 0, 0, 0, 1, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5],
        [0, 0, 0,

# Make a modelling checkpoint callback

In [32]:


import os
import logging

tf.get_logger().setLevel(logging.WARNING) # remove INFO log (model saved at...)

# Create a function to implement a ModelCheckpoint callback with a specific filename
def create_model_checkpoint(model_name, save_path="model_experiments"):
    return tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(save_path, model_name),
        monitor="val_loss",
        verbose=0, # only output a limited amount of text
        save_best_only=True,
    )

# Model creation

In [None]:
from tensorflow import keras
from keras import layers

max_vocab_length = text_vectorizer.vocabulary_size() # Max number of words in the vocabulary
max_length = max_length # Max length of each sequence
num_tags = label_lookup.vocabulary_size() + 1 # Number of unique IOB tags (including 'O' and PADDING_LABEL)

model = keras.Sequential([
    layers.Input((1,), dtype=tf.string),
    text_vectorizer, # turn the input text into numbers
    layers.Embedding(input_dim=max_vocab_length, output_dim=128),
    layers.GlobalMaxPooling1D(),
    layers.Dense(num_tags, activation="relu"),
], name="model_1")

model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

model.fit(X_train,
          y_train_padded,
          epochs=5,
          validation_data=(X_test, y_test_padded))

Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\ipykernel\kernelapp.py", line 736, in start

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\tornado\platform\asyncio.py", line 195, in start

  File "c:\Users\loannmr\.conda\envs\AI\Lib\asyncio\base_events.py", line 607, in run_forever

  File "c:\Users\loannmr\.conda\envs\AI\Lib\asyncio\base_events.py", line 1922, in _run_once

  File "c:\Users\loannmr\.conda\envs\AI\Lib\asyncio\events.py", line 80, in _run

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\ipykernel\kernelbase.py", line 516, in dispatch_queue

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\ipykernel\kernelbase.py", line 505, in process_one

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\ipykernel\kernelbase.py", line 412, in dispatch_shell

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\ipykernel\kernelbase.py", line 740, in execute_request

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\ipykernel\ipkernel.py", line 422, in do_execute

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\ipykernel\zmqshell.py", line 546, in run_cell

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\IPython\core\interactiveshell.py", line 3024, in run_cell

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\IPython\core\interactiveshell.py", line 3079, in _run_cell

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\IPython\core\interactiveshell.py", line 3284, in run_cell_async

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\IPython\core\interactiveshell.py", line 3466, in run_ast_nodes

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code

  File "C:\Users\loannmr\AppData\Local\Temp\ipykernel_16588\2512839750.py", line 20, in <module>

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\engine\training.py", line 1783, in fit

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\engine\training.py", line 1127, in train_step

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\engine\training.py", line 1185, in compute_loss

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\losses.py", line 143, in __call__

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\losses.py", line 270, in call

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\losses.py", line 2454, in sparse_categorical_crossentropy

  File "c:\Users\loannmr\.conda\envs\AI\Lib\site-packages\keras\src\backend.py", line 5777, in sparse_categorical_crossentropy

logits and labels must have the same first dimension, got logits shape [32,6] and labels shape [960]
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_1549076]

# Model 2 LSTM

In [None]:
from tensorflow import keras
from keras import layers

max_vocab_length = text_vectorizer.vocabulary_size() + 1 # Max number of words in the vocabulary (+ OOV token)
max_length = max_length # Max length of each sequence
num_tags = label_lookup.vocabulary_size() + 1 # Number of unique IOB tags (including 'O' and PADDING_LABEL)

model = keras.Sequential([
    layers.Input((1,), dtype=tf.string), # raw input string
    text_vectorizer, # turn the input text into numbers
    layers.Embedding(input_dim=max_vocab_length + 1, output_dim=128, mask_zero=True), # tells the model to ignore padded 0 (enhance correct sentences but more computation)
    layers.LSTM(64, return_sequences=True),
    layers.TimeDistributed(layers.Dense(num_tags, activation="softmax")), # make a prediction for each word in the sequence.
], name="model_1")

model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

model.fit(X_train,
          y_train_padded,
          epochs=30,
          validation_data=(X_test, y_test_padded),
          verbose=1,
          callbacks=[create_model_checkpoint(model_name=model.name)])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x1b5e06e0490>

In [None]:
# loss , accuracy
model.evaluate(X_test, y_test_padded)



[0.021895453333854675, 0.993565022945404]

In [33]:
from tensorflow import keras
from keras import layers

model_1 = keras.models.load_model('model_experiments/model_1', custom_objects={'custom_standardization': custom_standardization})
model_1.evaluate(X_test, y_test_padded)



[0.021895453333854675, 0.993565022945404]

In [34]:
sample_text = ["Itinéraire vers Rouen dès que j'arrive à Paris"]
#sample_text = ["Je veux aller de Lyon à Paris"]

text_vectorizer(sample_text)

<tf.Tensor: shape=(1, 30), dtype=int64, numpy=
array([[105,  65, 307,   1,   1,   1,   2,   1,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]], dtype=int64)>

In [35]:


predictions = model_1.predict(sample_text)



In [None]:
label_lookup.get_vocabulary()

['O', 'B-TO', 'B-FROM', 'I-TO', 'I-FROM']

In [None]:
tf.argmax(predictions[0], axis=-1)

<tf.Tensor: shape=(30,), dtype=int64, numpy=
array([0, 0, 2, 1, 1, 3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)>

In [36]:
def make_prediction(sentences):
    decoded_predictions = []

    # make predictions
    predictions = model_1.predict(sentences)

    # mask padding and decode prediction
    for i, prediction in enumerate(predictions):
        actual_length = len(sentences[i].split())  # Length of the actual sentence
        
        predicted_tags = tf.argmax(prediction, axis=-1).numpy()[:actual_length]  # Consider only actual length
        # Map predicted tags to their labels, ignoring padding
        decoded_predictions.append([label_lookup.get_vocabulary()[tag] for tag in predicted_tags])

    return decoded_predictions

In [40]:
sentences = [
    "Itinéraire vers Rouen dès que j'arrive à Paris",
    "Je suis à Paris et je souhaite me rendre à Lyon",
    "Comment partir à Lyon depuis Toulouse",
    "Je mange des baguette"
]

predictions = make_prediction(sentences)


for sentence, prediction in zip(sentences, predictions):
    print(sentence)
    print(prediction)

# #make_prediction([])
# #make_prediction([])
# make_prediction([])

Itinéraire vers Rouen dès que j'arrive à Paris
['O', 'O', 'B-FROM', 'B-TO', 'B-TO', 'I-TO', 'O', 'B-FROM']
Je suis à Paris et je souhaite me rendre à Lyon
['O', 'O', 'O', 'B-FROM', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TO']
Comment partir à Lyon depuis Toulouse
['O', 'O', 'O', 'B-TO', 'O', 'B-FROM']
Je mange des baguette
['O', 'B-FROM', 'O', 'B-TO']


On remarque que des phrases aléatoires, notre model est perdu. Il faut ameliorer notre jeu de donnee, en rajoutant des phrases avec 0 villes ou 1 villes.