# Voice recognition

In [1]:
from vosk import Model, KaldiRecognizer

# load models
model = Model("./models/vosk-model-fr-0.22")
recognizer = KaldiRecognizer(model, 16000)

In [4]:
import pyaudio

def start_voice_recognition():
    """
    Speach to text.
    Stop the voice recognition by saying "stop" or until a specific time
    """
    mic = pyaudio.PyAudio()
    stream = mic.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8192)
    stream.start_stream()

    print("Listening...")
    while True:
        data = stream.read(4096)
        if recognizer.AcceptWaveform(data):
            text = recognizer.Result()
            text = text[14:-3]  # Get the sentence
            print(text)
            break  # Stop after the first sentence is recognized

    # Close the stream and PyAudio instance
    stream.stop_stream()
    stream.close()
    mic.terminate()

    return text

In [19]:
start_voice_recognition()

Listening...
alors comment l'état paraît aller au bout de quarante cinq à soixante


"alors comment l'état paraît aller au bout de quarante cinq à soixante"

# Spell checker

In [21]:
import language_tool_python

is_bad_rule = lambda rule: rule.message == 'Possible spelling mistake found.' and len(rule.replacements) and rule.replacements[0][0].isupper()
with language_tool_python.LanguageToolPublicAPI('fr') as tool:
    sentence = "comment aller de paris a lyon"
    matches = tool.check(sentence)
    corrected_sentence = tool.correct(sentence)
    corrected_sentence = tool.correct(corrected_sentence)

corrected_sentence

'Comment aller de paris à Lyon ?'

In [5]:
import language_tool_python

def spell_check(sentence):
    """
    Correct spelling mistakes in the sentence as long as they occur
    """
    # Initialize LanguageToolPublicAPI
    with language_tool_python.LanguageToolPublicAPI('fr') as tool:
        # Initialize the corrected version
        corrected_old = sentence
        corrected_new = tool.correct(corrected_old)

        # Perform spell checking until no more corrections are made
        while corrected_new != corrected_old and corrected_new:
            corrected_old = corrected_new
            corrected_new = tool.correct(corrected_old)

    
    return corrected_new

In [24]:
spell_check("cmment aller de lyon a marseille")

'Comment aller de Lyon à Marseille'

# Load model 

In [6]:
import pickle
#from keras.models import load_model
import keras
import tensorflow as tf

def custom_standardization(input_text):
    # Remove punctuations, but preserve apostrophes
    return tf.strings.regex_replace(input_text, "[^a-zA-Z0-9À-ÖØ-öø-ÿ' ]", "")

#Load the custom function (not seems to work)
# with open("./models/custom_standardization.pkl", "rb") as file:
#     custom_standardization = pickle.load(file) 

# Load the saved model
model_bi_gru = keras.models.load_model('./models/model_bidirectional_gru.keras', custom_objects={'custom_standardization': custom_standardization})

OSError: Unable to open file (file signature not found)

In [None]:
vocabulary = {
    "O": 0,
    "B-TO": 1,
    "B-FROM": 2,
    "I-TO": 3,
    "I-FROM": 4,
    "PAD": 5
}

# Define the reverse vocabulary mapping from integers to labels
reverse_vocabulary = {index: label for label, index in vocabulary.items()}

def make_prediction(sentence):
    pred = model_bi_gru.predict([sentence], verbose=0)[0]

    # calculate the real length of the sentence to remove the padding
    actual_length = len(sentence.split())

    predicted_tags = tf.argmax(pred, axis=-1).numpy()[:actual_length]

    labels = [reverse_vocabulary[index] for index in predicted_tags]

    return labels

def extract_predicted_cities(sentence: str, labels: list):
    """
    Extracts predicted cities from a sentence based on the provided labels.

    Args:
    - sentence (str): The input sentence.
    - labels (list): List of labels corresponding to each word in the sentence.

    Returns:
    - dict: A dictionary containing predicted cities categorized by label.
    """
    predicted_cities = {
        "B-TO": [],
        "B-FROM": [],
        "I-TO": [],
        "I-FROM": [],
    }

    # Split the sentence into words
    words = sentence.split()

    # Iterate through each label and its corresponding word
    for label, word in zip(labels, words):
        if label in predicted_cities:
            predicted_cities[label].append(word)

    return predicted_cities

In [78]:
Sentence = "Je veux aller de Lyon Ricola à Toulouse Marco"
array_labels = make_prediction(Sentence)
extract_predicted_cities(Sentence, array_labels)

{'B-TO': ['Toulouse'],
 'B-FROM': ['Lyon'],
 'I-TO': ['Marco'],
 'I-FROM': ['Ricola']}

# TGV stations finder

In [7]:
# GRAPH UTILITIES
import pandas as pd
import networkx as nx

def create_graph(df):
    G = nx.from_pandas_edgelist(df,
                            source="start_station", 
                            target="end_station",
                            edge_attr='duration', # weights
                            create_using=nx.DiGraph(oriented=True, data=True))
    return G

def preload_fastest_paths(G):
    """
    Example: access the shortest path and its weight from node A to node B
    path_from_A_to_B = weighted_paths['A']['B']['path']
    weight_from_A_to_B = weighted_paths['A']['B']['duration']
    """
    # Johnson's algorithm to find all pairs shortest paths
    shortest_paths_johnson = nx.johnson(G, weight="duration")
    # Create a dictionary to store the paths with their weights
    weighted_paths = {}
    for source, targets in shortest_paths_johnson.items():
        weighted_paths[source] = {}
        for target, path in targets.items():
            weight = sum(G[path[i]][path[i + 1]]['duration'] for i in range(len(path) - 1))
            weighted_paths[source][target] = {'path': path, 'duration': weight}
    return weighted_paths

def find_fastest_paths(start_stations, end_stations):
    best_path = None
    shortest_duration = float('inf')
    all_paths_info = []

    for start in start_stations:
        for end in end_stations:
            if start in shortest_paths and end in shortest_paths[start]:
                duration = shortest_paths[start][end]['duration']
                
                path_info = {
                    "start": start,
                    "end": end,
                    "duration": duration,
                    "path": shortest_paths[start][end]['path']
                }
                all_paths_info.append(path_info)
                
                if duration < shortest_duration:
                    shortest_duration = duration
                    best_path = (start, end, duration)
    return best_path, all_paths_info

# Create graph and preload paths
timetables_df = pd.read_csv('./data/timetables.csv', delimiter=',', encoding='utf8')
G = create_graph(timetables_df)
shortest_paths = preload_fastest_paths(G)

In [8]:
best_path, all_paths = find_fastest_paths(['lyon part dieu'], ['paris montparnasse hall 1 - 2'])
best_path, all_paths

(('lyon part dieu', 'paris montparnasse hall 1 - 2', 327.0),
 [{'start': 'lyon part dieu',
   'end': 'paris montparnasse hall 1 - 2',
   'duration': 327.0,
   'path': ['lyon part dieu', 'rennes', 'paris montparnasse hall 1 - 2']}])

In [9]:
# Get unique stations available
start_stations_unique = timetables_df['start_station'].unique()
end_stations_unique = timetables_df['end_station'].unique()
all_stations = list(start_stations_unique) + list(end_stations_unique)
unique_stations = pd.unique(all_stations)
print(unique_stations)

['paris montparnasse hall 1 - 2' 'bordeaux saint-jean' 'strasbourg'
 'paris est' 'paris gare du nord' 'aix-en-provence tgv'
 'marseille saint-charles' 'montpellier saint-roch' 'metz'
 'nancy place de la république' 'nice-ville' 'toulon' 'reims' 'colmar'
 'remiremont' 'saint-dié-des-vosges' 'sedan' 'charleville-mézières'
 'bar-le-duc' 'luxembourg' 'thionville' 'paris gare de lyon hall 1 - 2'
 'lille europe' 'lyon part dieu' 'montpellier sud de france' 'rennes'
 'lyon perrache' 'lille flandres' 'nantes' 'le havre' 'perpignan' 'hyères'
 'miramas' 'avignon centre' 'grenoble' 'évian-les-bains'
 'saint-étienne châteaucreux' 'mulhouse' 'besançon viotte'
 'toulouse matabiau' 'annecy' 'valenciennes' 'tourcoing' 'dunkerque'
 'boulogne sur mer' 'rang-du-fliers - verton - berck' 'brest' 'quimper'
 'auray' 'la rochelle' 'niort' 'aéroport charles de gaulle 2 tgv'
 'saint-malo' 'tours' 'poitiers' 'arcachon' 'agen' 'hendaye' 'tarbes'
 'saint-brieuc' 'lannion' 'vannes' 'lorient' 'saint-nazaire' 'le cro

In [26]:
from fuzzywuzzy import fuzz

def find_similar_stations(city, stations=unique_stations, threshold=90):
    """
    Find stations similar to the given city name.

    Parameters:
        city (str): The city name to search for.
        stations (list): List of station names.
        threshold (int): Similarity threshold for fuzzy string matching.

    Returns:
        list: List of stations similar to the city name.
    """
    # Convert city name to lowercase
    city_lower = city.lower()

    # Find stations similar to the city
    similar_stations = [station for station in stations if fuzz.partial_ratio(city_lower, station.lower()) >= threshold]

    return similar_stations[:4] # return maximum of 4 stations (paris has 4 stations)

In [25]:
find_similar_stations('paris', threshold=90)

['paris montparnasse hall 1 - 2',
 'paris est',
 'paris gare du nord',
 'paris gare de lyon hall 1 - 2']

In [16]:
def generate_city_combinations(b_cities, i_cities):
    """
    Generate all possible combinations of cities
    """
    
    city_combinations = [[b_city] + i_cities for b_city in b_cities]
    return city_combinations

def find_station_for_direction(predicted_cities: dict[str, list], direction='TO', threshold=90):
    """
    Find a similar station for a given direction based on predicted cities.

    Args:
    - predicted_cities (dict): Dictionary containing predicted cities categorized by label.
    - direction (str): Direction ('TO' or 'FROM').
    - threshold (int): Similarity threshold for finding similar stations.

    Returns:
    - str: The found similar station, or None if no similar station is found.
    """
    
    # Concatenate B-TO and I-TO cities if they exist
    if direction == 'TO':
        city_combinations =  generate_city_combinations(predicted_cities.get('B-TO', []), predicted_cities.get('I-TO', []))
    else:
        city_combinations =  generate_city_combinations(predicted_cities.get('B-FROM', []), predicted_cities.get('I-FROM', []))
    
    print(city_combinations)
    for city_words in city_combinations:
        # Try different lengths of city names starting from the full city name
        for i in range(len(city_words), 0, -1):
            # Concatenate the first i words of the city name
            partial_city = ' '.join(city_words[:i])
            similar_station = find_similar_stations(partial_city, threshold=threshold)
            if similar_station:
                return similar_station
             
    # If no similar station is found, return None
    return None

In [18]:
predicted_cities = {'B-TO': ['ok', 'lyon'], 'B-FROM': ['a'], 'I-TO': ["a", "b"], 'I-FROM': []}

stations_from = find_station_for_direction(predicted_cities, direction='FROM', threshold=90)
stations_to = find_station_for_direction(predicted_cities, direction='TO', threshold=90)

print(stations_from)
print(stations_to)



[['a']]
[['ok', 'a', 'b'], ['lyon', 'a', 'b']]
['paris montparnasse hall 1 - 2', 'bordeaux saint-jean', 'strasbourg', 'paris est', 'paris gare du nord']
['paris gare de lyon hall 1 - 2', 'lyon part dieu', 'lyon perrache']


# Final

- language-tool-python (spell checker) (https://pypi.org/project/language-tool-python/)
- Vosk (https://alphacephei.com/vosk/) (https://alphacephei.com/vosk/models/vosk-model-fr-0.22.zip)
- PyAudio (https://people.csail.mit.edu/hubert/pyaudio/)
- Numpy
- Pandas
- Tensorflow
- fuzzywuzzy (https://github.com/seatgeek/fuzzywuzzy)
- python-Levenshtein (https://github.com/rapidfuzz/python-Levenshtein)


In [19]:
Sentence = "cmment aller de lyon a marseille"

corrected_sentence = spell_check(Sentence)
print(corrected_sentence)

labels = make_prediction(corrected_sentence)
print(labels)

predicted_cities = extract_predicted_cities(corrected_sentence, labels)
print(predicted_cities)

stations_from = find_station_for_direction(predicted_cities, direction='FROM', threshold=90)
stations_to = find_station_for_direction(predicted_cities, direction='TO', threshold=90)
print(stations_from, stations_to)

for station_from in stations_from:
    for station_to in stations_to:
        best_path, all_paths = find_fastest_paths(station_from, station_to)
        print(best_path)

Comment aller de Lyon à Marseille


NameError: name 'make_prediction' is not defined

In [10]:
Sentence = start_voice_recognition()

corrected_sentence = spell_check(Sentence)
print(corrected_sentence)

labels = make_prediction(corrected_sentence)
print(labels)

predicted_cities = extract_predicted_cities(corrected_sentence, labels)
print(predicted_cities)

stations_from = find_station_for_direction(predicted_cities, direction='FROM', threshold=90)
stations_to = find_station_for_direction(predicted_cities, direction='TO', threshold=90)
print(stations_from, stations_to)

for station_from in stations_from:
    for station_to in stations_to:
        best_path, all_paths = find_fastest_paths(station_from, station_to)
        print(best_path)

Listening...
depuis toulouse comment aller à lyon
Depuis Toulouse comment aller à Lyon


['O', 'B-FROM', 'O', 'O', 'O', 'B-TO']