# T9 - Artificial intelligence (T-DAT-901)
## Travel Order Resolver
Project members:
* Matisse AUBRY
* Georges BITAKWIRE
* Loan JOUFFROY
* Raphaël LÉVY 
* Marco VIALLEFONT

## Project setup

In [2]:
!pip install langdetect
!python -m spacy download fr_core_news_sm
!apt remove libav-tools
!pip install geograpy3==0.1.2
!pip install pydub
!pip install SpeechRecognition
import ipywidgets as widgets
from IPython import display as disp
from IPython.display import display, Audio, clear_output
from google.colab import output
import base64
from pydub import AudioSegment
import io
import tempfile
import librosa
import numpy as np
from langdetect import detect
import fr_core_news_sm
from nltk.corpus import stopwords
import nltk
import geograpy
import speech_recognition as sr
from scipy.io.wavfile import write
import soundfile
import speech_recognition as sr
import os
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')


nlp = fr_core_news_sm.load()
print("Project setup")

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 5.1 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=38730156fb5bd832da4f26908446cb9a8aa838ad80be42ba925abea15a6c9163
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Collecting fr_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7 MB)
[K     |████████████████████████████████| 14.7 MB 5.1 MB/s 
Building wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for fr-

In [5]:
def record_audio(seconds=3,
                 sample_rate=44100,
                 normalize_db=0.1):
    """Capture microphone audio through the browser of a Colab session.

    A small JavaScript recorder is injected into the notebook output, run for
    the requested duration, and its result (a base64 data URL) is decoded and
    handed to `audio_bytes_to_np`.
    Based on: https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be

    Args:
      seconds: Recording duration, in seconds.
      sample_rate: Sample rate forwarded to `audio_bytes_to_np`.
      normalize_db: Normalization headroom forwarded to `audio_bytes_to_np`;
        None skips the normalization step.

    Returns:
      Whatever `audio_bytes_to_np` returns for the recorded bytes.
    """
    # Browser-side recorder: defines a global `record(time)` that resolves
    # with the recording encoded as a data URL.
    js_recorder = """
      const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
      const b2text = blob => new Promise(resolve => {
        const reader = new FileReader()
        reader.onloadend = e => resolve(e.srcElement.result)
        reader.readAsDataURL(blob)
      })
      var record = time => new Promise(async resolve => {
        stream = await navigator.mediaDevices.getUserMedia({ audio: true })
        recorder = new MediaRecorder(stream)
        chunks = []
        recorder.ondataavailable = e => chunks.push(e.data)
        recorder.start()
        await sleep(time)
        recorder.onstop = async ()=>{
          blob = new Blob(chunks)
          text = await b2text(blob)
          resolve(text)
        }
        recorder.stop()
      })
      """
    print('Starting recording for {} seconds...'.format(seconds))
    display(disp.Javascript(js_recorder))

    # The JavaScript side expects the duration in milliseconds.
    duration_ms = seconds * 1000.0
    data_url = output.eval_js('record(%d)' % duration_ms)
    print('Finished recording!')

    # Keep only the base64 payload that follows the comma of the data URL.
    encoded_payload = data_url.split(',')[1]
    raw_bytes = base64.b64decode(encoded_payload)
    return audio_bytes_to_np(raw_bytes,
                             sample_rate=sample_rate,
                             normalize_db=normalize_db)
    
def audio_bytes_to_np(wav_data,
                      sample_rate=44100,
                      normalize_db=0.1):
    """Convert audio file data (in bytes) into a numpy array.

    The bytes are decoded with pydub, cleaned up (DC offset removal and
    optional normalization), exported to a temporary WAV file and loaded back
    with librosa.

    Args:
      wav_data: A byte stream of audio data.
      sample_rate: Resample recorded audio to this sample rate.
      normalize_db: Headroom passed to pydub's `normalize`. Set to None to
        skip the normalization step.

    Returns:
      An array of the recorded audio at sample_rate.
    """
    # Parse and normalize the audio.
    audio = AudioSegment.from_file(io.BytesIO(wav_data))
    # pydub effects do not modify the segment in place: they return a new
    # AudioSegment, so the result has to be kept for the step to take effect.
    audio = audio.remove_dc_offset()
    if normalize_db is not None:
        audio = audio.normalize(headroom=normalize_db)
    # Save to tempfile and load with librosa.
    with tempfile.NamedTemporaryFile(suffix='.wav') as temp_wav_file:
        fname = temp_wav_file.name
        audio.export(fname, format='wav')
        audio_np, unused_sr = librosa.load(fname, sr=sample_rate)
    return audio_np

In [6]:
#@title Record
#@markdown * Set recording time:

# Sample rate passed to record_audio and used when writing the WAV file.
SAMPLE_RATE = 44100
# Default recording duration in seconds (editable through the Colab form).
record_seconds =   4 #@param {type:"number", min:2, max:10, step:1}
# Name of the WAV file the recording is saved to.
NAME_FILE = 'record.wav'

def record_new_audio_and_save_it(b=None, view_audio_return=True, record_time=record_seconds):
  """Record from the microphone and save the result to NAME_FILE as 16-bit PCM.

  The recorded samples are also stored in the module-level `audio` variable.

  Args:
    b: Unused; lets the function be registered as a widget click callback.
    view_audio_return: When True, display an audio player for the recording.
    record_time: Recording duration in seconds.
  """
  global audio
  print("Start recording...")
  clear_output()
  audio = record_audio(record_time, sample_rate=SAMPLE_RATE)

  if view_audio_return is True:
    player = Audio(audio, rate=SAMPLE_RATE)
    display(player)

  # Remove the file if it already exists.
  if os.path.exists(NAME_FILE):
    os.remove(NAME_FILE)

  # Write the samples once with scipy, then read the file back and rewrite it
  # with the PCM_16 subtype.
  write(NAME_FILE, SAMPLE_RATE, audio)
  samples, rate = soundfile.read(NAME_FILE)
  soundfile.write(NAME_FILE, samples, rate, subtype='PCM_16')

# # To test the record_new_audio_and_save_it function:
# button = widgets.Button(description="Start recording...")
# button.on_click(record_new_audio_and_save_it)
# display(button)

In [7]:
def speech_to_text(source_audio_file='/content/' + NAME_FILE):
  """Transcribe a French audio file with the Google recognizer.

  Args:
    source_audio_file: Path of the audio file to transcribe.

  Returns:
    The recognized text, which is also printed.
  """
  recognizer = sr.Recognizer()
  with sr.AudioFile(source_audio_file) as audio_source:
      # Load the whole file into memory, then convert speech to text.
      recording = recognizer.record(audio_source)
      transcript = recognizer.recognize_google(recording, language="fr-FR")
      print('Speech to text out : ' + transcript)
      return transcript

In [8]:
LANGUAGES_ACCEPTED = ['fr'] # Extend this list to accept more languages later.

def check_if_text_accepted_language(text):
    """Return True if the language detected for `text` is an accepted one.

    The language code returned by langdetect's `detect` is looked up in
    LANGUAGES_ACCEPTED.
    """
    return detect(text) in LANGUAGES_ACCEPTED

In [102]:
def return_arrival(sentence):
  """Guess the arrival place mentioned in a French travel sentence.

  The sentence is run through the spaCy pipeline `nlp`. One of the detected
  entities (or proper nouns) is then selected by a chain of hand-written
  rules keyed on the first adposition(s) of the sentence.

  Args:
    sentence: The text to analyse.

  Returns:
    The selected entity text or spaCy token, or None when `sentence` is
    empty (a prompt is printed in that case).

  NOTE(review): indexes such as city[1], prop[1], adp[1] and verb[2] are not
  guarded; presumably an IndexError is raised when fewer items were detected
  than a rule expects - confirm and handle.
  """
  if(sentence == ""):
    # Nothing to analyse: print a prompt; the function then returns None.
    print("veuillez saisir votre phrase")
  else:
    doc = nlp(sentence)
    # Despite its name, `verb` holds the lemma of every token.
    verb = [token.lemma_ for token in doc]
    # (token, part-of-speech tag) pairs for the whole sentence.
    phrase_sent = [(X, X.pos_) for X in doc]
    # NOTE(review): `places` is never read afterwards in this function.
    places = geograpy.get_place_context(text=sentence)

  #     nlp_wk = spacy.load('xx_ent_wiki_sm')
      
    # Adposition tokens, proper-noun tokens and named-entity texts.
    adp =  [word for word,pos in phrase_sent if pos == 'ADP']
    prop = [word for word,pos in phrase_sent if pos == 'PROPN']
    city = [(X.text) for X in doc.ents]

    # print(adp)
    if(len(adp)>1):
      # With several adpositions, the list is turned into its string form,
      # the words below are stripped out, and the remainder is parsed again
      # so that only the tokens still tagged ADP are kept.
      adp = str(adp)
      # print(city)
      adp = adp.replace("pour,", "")
      adp = adp.replace("Bonjour", "")
      adp = adp.replace("en,", "")
      adp = adp.replace("en", "")
      adp = adp.replace("dans,", "")
      adp = adp.replace("depuis", "")
      adp = adp.replace(" ", "")
      
      # print(adp)
      adp = nlp(adp)
      
      adp_word =  [(X, X.pos_) for X in adp]
      # print(adp_word)
      real_adp = [word for word,pos in adp_word if pos == 'ADP'] 
      adp = real_adp
      # print(len(str(adp)))
    # Keyword table; the rules below refer to its entries by index:
    # 0 "de", 1 "depuis", 2 "à", 3 "par", 4 "vers".
    dictionary = {"departure":["de","depuis","à", "par","vers"]} 
    arrival = None
    # The dictionary has a single key and the loop body ends with a return,
    # so the rule chain runs exactly once. The number prefixed to each
    # printed label identifies the rule that fired.
    for cle, valeur in dictionary.items():
      # Rule: no adposition at all, or the first one is "de".
      if not adp or valeur[0] == str(adp[0]):
        if len(city) == 1:
          # A single entity: parse the string form of the list again and
          # keep only the tokens tagged as proper nouns.
          city = str(city)
          city = nlp(city)
          city_word = [(X, X.pos_) for X in city]
          real_city = [word for word,pos in city_word if pos == 'PROPN'] 
          city = real_city
          
        if(str(city[1]) == ""):
          arrival = prop[1]
          print("16arrival : ",prop[1])
        else:
          arrival = city[1]
          print("15arrival : ",city[1])
      # Rule: the first adposition is "vers" or "depuis".
      elif valeur[4] == str(adp[0]) or valeur[1] == str(adp[0]) :
        if(str(city[1]) == ""):
          arrival = prop[1]
          print("14arrival : ",prop[1])
        else:
          arrival = city[1]
          print("13arrival : ",city[1])
      # Rule: the first adposition is "à" and the third lemma is "être".
      elif valeur[2] == str(adp[0]) and str(verb[2]) == "être":
        if(str(city[1]) == ""):
          arrival = prop[1]
          print("12arrival : ",prop[1])
        else:
          arrival = city[1]
          print("11arrival : ",city[1])
      # Rule: the first adposition is "à" (any other case).
      elif valeur[2] == str(adp[0]):
        if(str(city[0]) == ""):
          arrival = prop[0]
          print("10arrival : ",prop[0])
        else:
          # The choice depends on how many entities were found and on
          # whether the first lemma is "donner".
          if len(city) == 3:
            arrival = city[0] 
          elif len(city) == 2 and str(verb[0]) == "donner":
            arrival = city[0]
            print("9arrival : ", city[0])
          else:
            arrival = city[1]
      # NOTE(review): same condition as the previous branch, so this branch
      # can never be selected.
      elif valeur[2] == str(adp[0]):
        if(str(city[0]) == ""):
          arrival = prop[0]
          print("8arrival : ",prop[0])
        else:
          arrival = city[0]
          print("7arrival : ",city[0])
      # NOTE(review): `str(adp[1])` is only tested for truthiness, not
      # compared with a keyword, so any non-empty second adposition selects
      # this branch and the branches below are then skipped. Presumably
      # `valeur[1] == str(adp[1])` was intended - confirm.
      elif valeur[1] == str(adp[0]) or str(adp[1]):
        if(str(city[0]) == ""):
          arrival = prop[0]
          print("6arrival : ",prop[0])
        else:
          arrival = city[0]
          print("5arrival : ",city[0])
      # Rule: the second adposition is "par".
      elif valeur[3] == str(adp[1]):
        if(str(city[0]) == ""):
          arrival = prop[0]
          print("4arrival : ",prop[0])
        else:
          arrival = city[0]
          print("3arrival : ",city[0])
      # Fallback: take the second entity (or second proper noun).
      else:
        if(str(city[1]) == ""):
          arrival = prop[1]
          print("2arrival : ",prop[1])
        else:
          arrival = city[1]
          print("1arrival : ",city[1])
      return arrival
return_arrival("Train pour aller à Angers en provenance de Paris")

'Paris'

In [99]:
def return_departure(sentence):
  """Guess the departure place mentioned in a French travel sentence.

  The sentence is run through the spaCy pipeline `nlp`. One of the detected
  entities (or proper nouns) is then selected by a chain of hand-written
  rules keyed on the first adposition(s) of the sentence. The detected
  entities are printed along the way.

  Args:
    sentence: The text to analyse.

  Returns:
    The selected entity text or spaCy token, or None when `sentence` is
    empty (a prompt is printed in that case) or when no rule assigned a
    value.

  NOTE(review): indexes such as city[1], prop[1], adp[1], verb[2] and verb[7]
  are not guarded; presumably an IndexError is raised when fewer items were
  detected than a rule expects - confirm and handle.
  """
  if(sentence == ""):
    # Nothing to analyse: print a prompt; the function then returns None.
    print("veuillez saisir votre phrase")
  else:
    doc = nlp(sentence)
    # Despite its name, `verb` holds the lemma of every token.
    verb = [token.lemma_ for token in doc]
    # (token, part-of-speech tag) pairs for the whole sentence.
    phrase_sent = [(X, X.pos_) for X in doc]
    # NOTE(review): `places` is never read afterwards in this function.
    places = geograpy.get_place_context(text=sentence)
  #     nlp_wk = spacy.load('xx_ent_wiki_sm')
    # Adposition tokens, proper-noun tokens and named-entity texts.
    adp =  [word for word,pos in phrase_sent if pos == 'ADP']
    prop = [word for word,pos in phrase_sent if pos == 'PROPN']
    city = [(X.text) for X in doc.ents]
    print(city)
    # print(adp)
    if(len(adp)>1):
      # With several adpositions, the list is turned into its string form,
      # the words below are stripped out, and the remainder is parsed again
      # so that only the tokens still tagged ADP are kept.
      adp = str(adp)
      # print(city)
      adp = adp.replace("pour,", "")
      adp = adp.replace("Bonjour", "")
      adp = adp.replace("en,", "")
      adp = adp.replace("en", "")
      adp = adp.replace("dans,", "")
      adp = adp.replace(" ", "")
      
      # print(adp)
      adp = nlp(adp)
      
      adp_word =  [(X, X.pos_) for X in adp]
      # print(adp_word)
      real_adp = [word for word,pos in adp_word if pos == 'ADP'] 
      adp = real_adp
      # print(len(str(adp)))
    # Keyword table; the rules below refer to its entries by index:
    # 0 "de", 1 "depuis", 2 "à", 3 "par", 4 "vers".
    dictionary = {"departure":["de","depuis","à", "par","vers"]}
    # print(prop)
    departure = None
    # The dictionary has a single key and the loop body ends with a return,
    # so the rule chain runs exactly once.
    for cle, valeur in dictionary.items():
      # Rule: no adposition at all, or the first one is "de".
      if not adp or valeur[0] == str(adp[0]):
        if len(city) == 1:
          # A single entity: parse the string form of the list again and
          # keep only the tokens tagged as proper nouns.
          # NOTE(review): `departure` is not assigned on this path, so the
          # function returns None here.
          city = str(city)
          city = nlp(city)          
          city_word = [(X, X.pos_) for X in city]
          real_city = [word for word,pos in city_word if pos == 'PROPN'] 
          print(real_city)
          city = real_city
        else: 
          if(str(city[0]) == ""):
            departure = prop[0]
            #print("departure 15: ",prop[0])
          else:
            departure = city[0]
          # print("departure 14: ",city[0])
      # Rule: the first adposition is "vers".
      elif valeur[4] == str(adp[0]):
        if(str(city[0]) == ""):
          departure = prop[0]
          # print("departure 13: ",prop[0])
        else:
          departure = city[0]
          # print("departure 12: ",city[0])
      # NOTE(review): operator precedence groups this condition as
      # (first adposition is "à" and str(verb[2])) or (first lemma is "être").
      # `str(verb[2])` is only tested for truthiness, so a first adposition
      # "à" followed by a non-empty third lemma always selects this branch.
      # Presumably a comparison with "être" was intended on both lemmas -
      # confirm.
      elif valeur[2] == str(adp[0]) and str(verb[2]) or str(verb[0]) == "être":
        if(str(city[0]) == ""):
          departure = prop[0]
          print("departure 11: ",prop[0])
        else:
          departure = city[1]
          print("departure 10: ",city[1])
      # Rule: the first adposition is "à" (cases not taken above).
      elif valeur[2] == str(adp[0]):
        if(str(city[1]) == ""):
          departure = prop[1]
          print("departure 9: ",prop[1])
        else:
          # The choice depends on how many entities were found and on the
          # eighth lemma.
          # NOTE(review): the lemma is compared with "donne" here, whereas
          # return_arrival compares with "donner" - confirm which is meant.
          if len(city) >= 3 and verb[7] == "donne":
            departure = city[1] 
            print("departure 8 : ",)
          elif len(city) == 3:
            departure = city[2]
          else: 
            departure = city[1]
          
      # NOTE(review): same condition as the previous branch, so this branch
      # can never be selected.
      elif valeur[2] == str(adp[0]):
        if(str(city[1]) == ""):
          departure = prop[1]
          #print("departure 7 : ",prop[1])
        else:
          departure = city[1]
          print("departure 6 : ",city[1])
      
      # NOTE(review): `str(adp[1])` is only tested for truthiness, not
      # compared with a keyword, so any non-empty second adposition selects
      # this branch and the branches below are then skipped. Presumably
      # `valeur[1] == str(adp[1])` was intended - confirm.
      elif valeur[1] == str(adp[0]) or str(adp[1]):
        if(str(city[1]) == ""):
          departure = prop[1]
          #print("departure 5 : ",prop[1])
        else:
          departure = city[1]
          #print("departure 4 : ",city[1])
      # Rule: the second adposition is "par".
      elif valeur[3] == str(adp[1]):
        if(str(city[1]) == ""):
          departure = prop[1]
          #print("departure 3 : ",prop[1])
        else:
          departure = city[1]
          #print("departure 3 : ",city[1])
      # Fallback: take the first entity (or first proper noun).
      else:
        if(str(city[0]) == ""):
          departure = prop[0]
          #print("departure 2 : ",prop[0])
        else:
          departure = city[0]
          #print("departure 1 : ",city[0])
      return departure
return_departure("Train pour aller à Angers en provenance de Paris")

['Angers', 'Paris']
departure 10:  Paris


'Paris'

In [53]:
def return_POS(sentence):
  """Resolve the departure and arrival places of a travel sentence.

  Args:
    sentence: The text to analyse.

  Returns:
    A (departure, arrival) tuple as produced by return_departure and
    return_arrival. For an empty sentence a prompt is printed and
    (None, None) is returned.
  """
  if sentence == "":
    print("veuillez saisir votre trajet")
    # Return an explicit "nothing found" pair: the previous code fell through
    # to `return departure, arrival` with both names unbound and crashed.
    return None, None

  departure = return_departure(sentence)
  arrival = return_arrival(sentence)
  print(departure, arrival)
  return departure, arrival
return_POS("Je veux aller au Burger King")

['Burger King']
[Burger, King]
None King


(None, King)

In [9]:
def main_function(record=True, text=None, view_audio_return=True, record_time=4, isTest=False):
    """Resolve the departure and arrival of a spoken or typed travel request.

    The function can be used without recording audio; in that case the
    `text` parameter must be set.

    Args:
      record: If True, record from the microphone and transcribe the result.
      text: Sentence to process when `record` is not True.
      view_audio_return: If True, an audio player is displayed as well so the
        recording can be listened to.
      record_time: Duration (in seconds) of the audio recording.
      isTest: If True, a successful resolution returns a result tuple.

    Returns:
      False when `text` is missing or the sentence is not in an accepted
      language; (True, departure, arrival), with both places as strings, when
      `isTest` is True and both places were found; None otherwise.
    """
    if record is True:
        record_new_audio_and_save_it(view_audio_return=view_audio_return, record_time=record_time)
        sentence = speech_to_text()
    elif text is not None:
        sentence = text
    else:
        print('/!\\ Please, set the `text` param')
        return False

    # Check if the text is in french
    if not check_if_text_accepted_language(sentence):
        print('/!\\ Please, speak french')
        return False

    departure, arrival = return_POS(sentence)
    if departure and arrival:
        # The resolvers may return spaCy tokens instead of strings; convert
        # them so they can be concatenated and compared with plain text.
        departure, arrival = str(departure), str(arrival)
        print('Departure found :: ' + departure)
        print('Arrival found :: ' + arrival)
        if isTest is True:
            return True, departure, arrival
    else:
        print('/!\\ I did not understand very well')
    return None
  

In [25]:
main_function(
    record=True,
    view_audio_return=True,
    record_time=3
  )
print("-------------")

Starting recording for 3 seconds...


<IPython.core.display.Javascript object>

Finished recording!


Speech to text out : je souhaite aller à Nice


NameError: ignored

# Testing part:

Avant de lancer les tests:


*   Télécharger le dataset au format CSV à cette [adresse](https://docs.google.com/spreadsheets/d/15NJFAdmLEO3WwihoUif9T_U09vlIrvgoROKcAetUlz4/edit#gid=0)
*   Importer le fichier dans Colab
*   Le renommer en 'dataset.csv'




In [None]:
import csv

DATASET_NAME_FILE='dataset.csv'

def import_dataset():
  """Load the test dataset from DATASET_NAME_FILE.

  The first row is treated as a header and dropped. Blank rows are skipped.

  Returns:
    A list of [sentence, departure, arrival] lists, one per data row; an
    empty list when the file has no data rows.

  Raises:
    IndexError: If a non-blank row has fewer than three columns.
  """
  # newline='' is what the csv module expects for files it reads; the
  # explicit encoding keeps accented characters intact on every platform.
  with open(DATASET_NAME_FILE, 'r', encoding='utf-8', newline='') as csvfile:
    # NOTE(review): quotechar='|' means fields quoted with '"' are not
    # unquoted - confirm this matches how the dataset is exported.
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    # Skip the header row (no error if the file is empty).
    next(reader, None)
    return [[row[0], row[1], row[2]] for row in reader if row]


def do_test(dataset=None):
  """Run main_function on every dataset entry and print success statistics.

  Args:
    dataset: List of [sentence, departure, arrival] entries. When None, the
      dataset is loaded with import_dataset() at call time.
  """
  # Load lazily: a default of `import_dataset()` in the signature would read
  # the file once, when the function is defined, not when it is called.
  if dataset is None:
    dataset = import_dataset()

  total_data = len(dataset)
  total_success = 0
  total_fail = 0

  print(total_data)
  for testing, data in enumerate(dataset, start=1):
    print('testing', testing, '...')

    t = main_function(
      record=False,
      text=data[0],
      view_audio_return=False,
      isTest=True
    )

    # main_function returns a tuple only on success; False or None means the
    # sentence was rejected or not understood. The found places are converted
    # to str so that spaCy tokens compare equal to the expected text.
    success = (
      isinstance(t, tuple)
      and t[0] is True
      and str(t[1]) == data[1]
      and str(t[2]) == data[2]
    )
    if success:
      total_success = total_success + 1
    else:
      total_fail = total_fail + 1
      print('/!\\ FAIL HERE')

    print('Objectif => Sentence:', data[0], '; Departure;', data[1], '; Arrival:', data[2])
    print("-------------")

  # Avoid dividing by zero when the dataset is empty.
  success_rate = (total_success / total_data) * 100 if total_data else 0.0
  failure_rate = (total_fail / total_data) * 100 if total_data else 0.0

  print('Total data :: ', total_data)
  print('Total success ::', total_success)
  print('Total failure ::', total_fail)
  print('Efficiency', success_rate, '%')
  print('Echec', failure_rate, '%')

do_test()

18
testing 1 ...
['Paris', 'Marseille']
Marseille Paris
Departure found :: Marseille
Arrival found :: Paris
Objectif => Sentence: je voudrais partir à Paris pour me rendre à Marseille ; Departure; Marseille ; Arrival: Paris
-------------
testing 2 ...
['Paris', 'Marseille']
Marseille Paris
Departure found :: Marseille
Arrival found :: Paris
Objectif => Sentence: comment pourai-je me rendre à Paris depuis Marseille ; Departure; Marseille ; Arrival: Paris
-------------
testing 3 ...
['Marseille', 'Paris']
Paris Marseille
Departure found :: Paris
Arrival found :: Marseille
Objectif => Sentence: Je veux aller à Marseille depuis Paris ; Departure; Paris ; Arrival: Marseille
-------------
testing 4 ...
['Marseille', 'Paris']
Marseille Paris
Departure found :: Marseille
Arrival found :: Paris
Objectif => Sentence: je souhaite partir de Marseille à Paris ; Departure; Marseille ; Arrival: Paris
-------------
testing 5 ...
['Marseille', 'Paris']
Marseille Paris
Departure found :: Marseille
Arriv