In [None]:
# Import all the needed libraries
import json
import time
from io import BytesIO

import pandas as pd
import pyaudio
from gtts import gTTS
from pygame import mixer
from vosk import Model, KaldiRecognizer

In [None]:
def setup_speech_recognition(
    model_path="./model/vosk-model-small-it-0.22",
    sample_rate=16000,
    frames_per_buffer=8192,
):
    """Create the Vosk recognizer and a paused microphone input stream.

    Args:
        model_path: Path of the Vosk model passed to ``Model``.
        sample_rate: Sampling rate in Hz. The same value is given to both the
            recognizer and the microphone stream so the two cannot disagree.
        frames_per_buffer: Buffer size handed to PyAudio when opening the stream.

    Returns:
        A ``(recognizer, stream)`` tuple. The stream is opened for mono,
        16-bit input and is returned stopped; call ``stream.start_stream()``
        before reading from it.
    """
    # Load the model and create a recognizer
    model = Model(model_path)
    recognizer = KaldiRecognizer(model, sample_rate)
    # Set up the microphone to record audio
    mic = pyaudio.PyAudio()
    try:
        stream = mic.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=frames_per_buffer,
        )
        stream.stop_stream()  # Start with the stream stopped
    except Exception:
        # The PyAudio instance is not returned to the caller, so release it
        # here when the stream cannot be prepared.
        mic.terminate()
        raise
    return recognizer, stream

In [None]:
def setup_speech_synthesis():
    """Initialise the pygame mixer and return it for audio playback."""
    audio_player = mixer
    audio_player.init()
    return audio_player

In [None]:
def recognize_speech(recognizer, stream):
    """Read one chunk of audio and return the recognized text, if any.

    Args:
        recognizer: Object exposing ``AcceptWaveform(data)`` and ``Result()``
            (the Vosk ``KaldiRecognizer`` created during setup).
        stream: Started audio input stream exposing ``read(n)``.

    Returns:
        The recognized text when the recognizer reports a completed result
        for this chunk, otherwise ``None`` (no audio read, or the utterance
        is still in progress). The text may be an empty string.
    """
    # read the audio data from the stream
    data = stream.read(4096)
    if len(data) == 0:
        return None
    if not recognizer.AcceptWaveform(data):
        return None
    # Result() is a JSON document; parse it instead of slicing by fixed
    # character offsets, which breaks when the formatting changes or when
    # extra fields (e.g. word-level results) are present.
    result = json.loads(recognizer.Result())
    return result.get("text", "")

In [None]:
def synthesize_speech(text):
    """Synthesize ``text`` as Italian speech and return it as in-memory MP3.

    The returned ``BytesIO`` is left positioned at the end of the written
    data; rewind it before reading.
    """
    audio_buffer = BytesIO()
    # gTTS performs the synthesis when asked to write its output
    gTTS(text, lang='it').write_to_fp(audio_buffer)
    return audio_buffer

In [None]:
def play_speech(mixer, mp3_fp):
    """Play the audio held in ``mp3_fp`` and block until playback ends.

    Args:
        mixer: Initialised mixer whose ``music`` player is used for playback.
        mp3_fp: Seekable file-like object containing the audio to play.
    """
    # rewind so the player reads the audio from its first byte
    mp3_fp.seek(0)
    player = mixer.music
    player.load(mp3_fp)
    player.play()
    # poll until the player reports it has finished
    while player.get_busy():
        time.sleep(0.1)

In [None]:
def get_patient_data(
    registry_path='./patient_registry_test.csv',
    therapy_plan_path='./therapy_plan_test.csv',
):
    """Load the first patient in the registry and the therapy plan.

    Args:
        registry_path: CSV file with at least the columns ``name``,
            ``gender`` and ``age``. Only its first row is used.
        therapy_plan_path: CSV file with an ``hour`` column and a
            ``medicine_1`` column; every other non-empty column in a row is
            treated as a further medicine for that hour.

    Returns:
        A ``(patient, therapy_plan)`` tuple. ``patient`` is a dict with the
        keys ``name``, ``gender`` and ``age`` (``age`` converted to ``int``).
        ``therapy_plan`` maps each ``hour`` value to the list of medicines
        for that hour; rows whose ``medicine_1`` is empty are skipped.
    """
    df_registry = pd.read_csv(registry_path)
    # Take the first row by position so this works whatever the index labels are.
    first_patient = df_registry.iloc[0]
    patient = {
        'name': first_patient['name'],
        'gender': first_patient['gender'],
        'age': int(first_patient['age']),
    }

    df_therapy_plan = pd.read_csv(therapy_plan_path)
    therapy_plan = {}
    for _, row in df_therapy_plan.iterrows():
        # An empty medicine_1 means nothing has to be taken at that hour.
        if pd.isna(row['medicine_1']):
            continue
        # Every remaining non-empty column of the row is a medicine.
        therapy_plan[row['hour']] = row.drop(['hour']).dropna().tolist()
    return patient, therapy_plan

In [None]:
def greet_patient(patient):
    """Return the opening greeting, addressed to the patient by name.

    Args:
        patient: Mapping that provides the patient's ``"name"``.
    """
    name = patient["name"]
    return f'Ciao {name}, benvenuto alla terapia vocale. Come stai oggi?'

In [None]:
def speech_interaction(stream, text, mixer, recognizer):
    """Speak ``text`` to the patient, then listen for the reply.

    Args:
        stream: Microphone input stream; started only while listening.
        text: Sentence to synthesize and play.
        mixer: Initialised mixer used for playback.
        recognizer: Speech recognizer fed with the microphone audio.

    Returns:
        The first non-``None`` result produced by ``recognize_speech``.
    """
    mp3_fp = synthesize_speech(text)
    play_speech(mixer, mp3_fp)
    # The microphone is opened only after playback has finished.
    stream.start_stream()
    try:
        patient_speech = None
        while patient_speech is None:
            patient_speech = recognize_speech(recognizer, stream)
    finally:
        # Always pause the microphone, even when recognition fails or the
        # kernel is interrupted, so the stream is not left running.
        stream.stop_stream()
    time.sleep(1)
    return patient_speech

In [None]:
# Run the conversation: greet the patient, react to the answer, and keep
# talking until the patient says "buonanotte".
patient, therapy_plan = get_patient_data()
recognizer, stream = setup_speech_recognition()
mixer = setup_speech_synthesis()
patient_speech = None
try:
    while patient_speech != "buonanotte":
        if patient_speech is None:
            # First turn: nothing has been heard yet.
            text = greet_patient(patient)
        elif patient_speech.startswith("ben"):
            # Flatten the per-hour medicine lists into one spoken list.
            medicines = [
                str(medicine)
                for medicines_at_hour in therapy_plan.values()
                for medicine in medicines_at_hour
            ]
            text = (
                f'Sono contento di sentirtelo dire {patient["name"]}. '
                f'Oggi dovrai prendere i seguenti farmaci: {", ".join(medicines)}'
            )
        elif patient_speech.startswith("mal"):
            text = "Mi dispiace, spero che ti senta meglio presto"
        else:
            text = "Non ho capito cosa hai detto"
        # Every branch must speak and then listen again; otherwise
        # patient_speech never changes and the loop spins forever.
        patient_speech = speech_interaction(stream, text, mixer, recognizer)
finally:
    # Release the microphone stream when the conversation ends or is interrupted.
    stream.close()
    