In [None]:
import speech_recognition as sr
from google.cloud import speech
import io
import os

#######################
# Path to the Google Cloud service-account JSON key file. transcript_audio()
# hands it to speech.Client.from_service_account_json() when use_cloud is true.
# The path is relative, so it is resolved against the working directory.
GOOGLE_CLOUD_SPEECH_CREDENTIALS_PATH = '../files/TFM project-287dc6d9869a.json'
#######################

def transcript_audio(filepath, language, use_cloud):
    """Transcribe an audio file to text.

    When ``use_cloud`` is true the Google Cloud Speech client is tried first.
    If that produced nothing (or was skipped) the ``speech_recognition``
    package's ``recognize_google`` call is used as a fallback.

    Args:
        filepath: Path of the audio file to transcribe.
        language: Language code passed to the cloud recogniser. The fallback
            recogniser is called without it.
        use_cloud: Whether to try the Google Cloud Speech client first.

    Returns:
        The transcript, or the sentinel string ``'##NONE##'`` when nothing
        could be recognised.
    """
    NOT_RECOGNISED = '##NONE##'
    result = NOT_RECOGNISED
    audio_path = os.path.join(os.path.dirname(''), filepath)

    if use_cloud:
        try:
            client = speech.Client.from_service_account_json(
                GOOGLE_CLOUD_SPEECH_CREDENTIALS_PATH)

            # Read the whole file and wrap it in a cloud "sample".
            with io.open(audio_path, 'rb') as handle:
                payload = handle.read()
                cloud_sample = client.sample(
                    payload,
                    source_uri=None,
                    encoding='LINEAR16',
                    sample_rate_hertz=16000)

            candidates = cloud_sample.recognize(language)
            if len(candidates) > 0:
                result = candidates[0].transcript
        except Exception as err:
            # Best effort: any cloud failure is printed and the fallback
            # recogniser below gets its turn.
            print(err)

    if result != NOT_RECOGNISED:
        return result

    try:
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_path) as source:
            recorded = recognizer.record(source)
        # Uses the library's default API key; pass key="..." to
        # recognize_google to use another one.
        recognised = recognizer.recognize_google(recorded, show_all=False)
        if len(recognised) > 0:
            result = recognised
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))

    return result


# Audio Play

import pyaudio
import wave
import time
import sys
import pygame as pg

def play_audio(audio_file, volume=0.8):
    """Play an audio file through pygame's music mixer and wait until it ends.

    The file is loaded with ``pg.mixer.music`` and the call blocks, polling
    ``get_busy()`` 30 times per second, until playback has finished.

    The mixer is initialised with pygame's defaults. The frequency, bit size,
    channel and buffer locals that used to be declared here were never passed
    to ``pg.mixer.init()`` and have been removed.

    Args:
        audio_file: Path of the audio file to play.
        volume: Music volume, from 0.0 to 1.0.

    Returns:
        None. If pygame cannot load the file, a message is printed and the
        function returns without playing anything.
    """
    pg.mixer.init()
    pg.mixer.music.set_volume(volume)
    clock = pg.time.Clock()
    try:
        pg.mixer.music.load(audio_file)
    except pg.error:
        print("File {} not found! ({})".format(audio_file, pg.get_error()))
        return
    print("Audio file {} loaded!".format(audio_file))
    pg.mixer.music.play()
    # Block until the mixer reports that playback has finished.
    while pg.mixer.music.get_busy():
        clock.tick(30)
        
def play_any_audio(filename):
    """Start playing ``filename`` with pygame's music mixer.

    Unlike ``play_audio`` this does not wait for playback to finish and does
    not catch load errors.
    """
    mixer = pg.mixer
    mixer.init()
    music = mixer.music
    music.load(filename)
    music.play()

def play_wav_audio(filename):
    """Play a WAV file through PyAudio and block until playback ends.

    Frames are fed to the output stream from a PyAudio callback while this
    function polls ``stream.is_active()`` every 0.1 s.

    The earlier version checked ``len(sys.argv)`` and called ``sys.exit(-1)``,
    a leftover from a command-line example that could terminate the calling
    program. The file to play is the ``filename`` argument, so that check has
    been removed. Resources are now released even if playback fails.

    Args:
        filename: Path of the ``.wav`` file to play.
    """
    with wave.open(filename, 'rb') as wf:
        p = pyaudio.PyAudio()
        try:
            def callback(in_data, frame_count, time_info, status):
                # PyAudio asks for frame_count frames on each call.
                data = wf.readframes(frame_count)
                return (data, pyaudio.paContinue)

            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                            channels=wf.getnchannels(),
                            rate=wf.getframerate(),
                            output=True,
                            stream_callback=callback)
            try:
                stream.start_stream()
                while stream.is_active():
                    time.sleep(0.1)
                stream.stop_stream()
            finally:
                stream.close()
        finally:
            p.terminate()
    
def record_audio(filename, seconds):
    """Record from the default input device and save the result as a WAV file.

    Audio is captured as 16-bit mono at 16000 Hz in 1024-frame chunks for
    roughly ``seconds`` seconds, then written to ``filename``.

    Args:
        filename: Destination path of the WAV file.
        seconds: Recording duration in seconds.
    """
    chunk_frames = 1024
    sample_format = pyaudio.paInt16
    channel_count = 1
    sample_rate = 16000

    audio = pyaudio.PyAudio()
    stream = audio.open(format=sample_format,
                        channels=channel_count,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk_frames)

    print("* recording")
    chunk_total = int(sample_rate / chunk_frames * seconds)
    frames = [stream.read(chunk_frames) for _ in range(0, chunk_total)]
    print("* done recording")

    stream.stop_stream()
    stream.close()
    audio.terminate()

    out = wave.open(filename, 'wb')
    out.setnchannels(channel_count)
    out.setsampwidth(audio.get_sample_size(sample_format))
    out.setframerate(sample_rate)
    out.writeframes(b''.join(frames))
    out.close()

import gettext

class Text_Helper:
    """Small holder for localisation settings with access to the ``text_`` catalogue."""

    def __init__(self, language_path, audio_path, current_language):
        """Store the localisation settings.

        Args:
            language_path: Directory passed to gettext as ``localedir``.
            audio_path: Base directory for audio files (only stored here).
            current_language: Language code used to pick the translation.
        """
        self.language_path = language_path
        self.audio_path = audio_path
        self.current_language = current_language

    def get_(self):
        """Install the ``text_`` catalogue and return its ``gettext`` function.

        The method was previously declared without ``self`` although its body
        reads ``self.language_path`` and ``self.current_language``, so every
        call failed.

        Returns:
            The ``gettext`` callable of the translation for the current
            language.

        Raises:
            OSError: If gettext finds no ``text_`` catalogue for the language.
        """
        l = gettext.translation('text_', localedir=self.language_path, languages=[self.current_language])
        # install() also publishes the function as the builtin ``_``.
        l.install()
        return l.gettext


    
from gtts import gTTS
import os
from unidecode import unidecode
import gettext
from audio_utils import *
import cv2
from random import *
from speech_utils import *
import string

class Lang_Helper:
    """Localisation, spoken prompts and voice/keyboard command capture.

    The helper keeps track of the active language, loads that language's
    ``audio_config.txt`` and gettext ``text_`` catalogue, plays prompt audio,
    and turns recorded speech or key presses into command names.

    The string ``'##NONE##'`` is used throughout as the "nothing recognised"
    sentinel.
    """

    def __init__(self, available_languages, language_path, audio_path, image_path, current_language):
        """Store the paths and load the resources of ``current_language``.

        Args:
            available_languages: Language codes that
                ``switch_to_next_language`` cycles through.
            language_path: Directory prefix holding one sub-directory per
                language. It is concatenated directly, so it must end with a
                path separator.
            audio_path: Directory prefix for shared audio files.
            image_path: Directory prefix for images.
            current_language: Language code to start with.
        """
        self.available_languages = available_languages
        self.language_path = language_path
        self.audio_path = audio_path
        # image_path used to be dropped: audio_path was assigned twice instead.
        self.image_path = image_path
        self.current_language = current_language
        self._load_language(current_language)

    def _load_language(self, lang):
        """Read ``<lang>/audio_config.txt`` and install the ``text_`` catalogue of ``lang``."""
        with open(self.language_path + lang + '/audio_config.txt') as f:
            config_audios = [line.rstrip() for line in f]
        l = gettext.translation('text_', localedir=self.language_path, languages=[lang])
        l.install()
        # Assign only after both resources loaded, so a failure leaves the
        # previously loaded language intact.
        self.config_audios = config_audios
        self._ = l.gettext

    def change_language(self, new_lang):
        """Switch to ``new_lang``, reloading its audio config and translations."""
        self._load_language(new_lang)
        self.current_language = new_lang

    def switch_to_next_language(self):
        """Advance to the next entry of ``available_languages``, wrapping to the first."""
        idx = self.available_languages.index(self.current_language)
        new_idx = idx + 1 if idx + 1 < len(self.available_languages) else 0
        self.change_language(self.available_languages[new_idx])

    def get_language_commands(self, available_commands):
        """Translate command identifiers into the current language.

        Args:
            available_commands: Iterable of command identifiers (msgids).

        Returns:
            A list with the translation of each identifier, in the same order.
        """
        l = gettext.translation('text_', localedir=self.language_path, languages=[self.current_language])
        l.install()
        translate = l.gettext
        return [translate(command) for command in available_commands]

    def capture_speech(self):
        """Record a short utterance and return its ASCII transcript.

        Plays a beep, records two seconds into a randomly named temporary WAV
        file under ``<audio_path>temp/``, plays the ``one_moment`` prompt and
        transcribes the recording.

        Returns:
            The transliterated, stripped transcript. ``transcript_audio``
            supplies ``'##NONE##'`` when nothing was recognised.
        """
        rand = "".join(choice(string.ascii_letters) for _ in range(8))
        temp_wav = self.audio_path + 'temp/' + rand + '.wav'

        # Play beep
        self.play(self.audio_path + 'beep.mp3')

        # Record audio
        record_audio(temp_wav, 2)
        try:
            self.talk('one_moment')

            # Transcript audio
            transcript = transcript_audio(temp_wav, self.current_language, True)
        finally:
            # Remove the temporary recording even when transcription raised.
            os.remove(temp_wav)
        transcript = unidecode(transcript)
        print('***'+transcript+'***')
        return transcript.strip()

    def capture_selected_command(self):
        """Capture speech and map it onto one of the main commands.

        Returns:
            The matching command identifier, or ``'##NONE##'``.
        """
        available_commands = ['who', 'what', 'save', 'language', 'cancel',
                              'repeat', 'quit', 'options', 'keys']
        return self.capture_custom_command(available_commands)

    def capture_custom_command(self, available_commands):
        """Capture speech and map it onto one of ``available_commands``.

        The transcript is compared, lower-cased, with the translations of the
        given identifiers in the current language.

        Args:
            available_commands: List of command identifiers to accept.

        Returns:
            The identifier whose translation was spoken, or ``'##NONE##'``
            when nothing was recognised or the utterance matched no command.
        """
        lang_commands = self.get_language_commands(available_commands)
        transcript = self.capture_speech()
        if transcript == '' or transcript == '##NONE##':
            return '##NONE##'
        spoken = transcript.lower()
        if spoken in lang_commands:
            return available_commands[lang_commands.index(spoken)]
        return '##NONE##'

    def capture_name(self):
        """Capture a spoken name.

        Returns:
            ``'##NONE##'`` when nothing was recognised, ``'cancel'`` when the
            translated cancel command was spoken, otherwise the transcript.
        """
        available_commands = ['cancel']
        lang_commands = self.get_language_commands(available_commands)

        transcript = self.capture_speech()
        transcript = unidecode(transcript)

        # Nothing captured
        if transcript == '' or transcript == '##NONE##':
            return '##NONE##'
        # Cancellation requested
        if transcript.lower() in lang_commands:
            return available_commands[lang_commands.index(transcript.lower())]
        # Anything else is taken as the name
        return transcript

    def get_command(self, c):
        """Resolve a key press, possibly via a voice dialogue, into a command.

        ``' '`` starts the short voice dialogue and ``'0'`` the guided one.
        The letters ``L``, ``A``, ``S``, ``C`` and ``Q`` map directly to
        commands.

        Args:
            c: The pressed key as a one-character string (callers pass it
                upper-cased).

        Returns:
            One of ``'language'``, ``'who'``, ``'save'``, ``'cancel'``,
            ``'quit'``, or ``None`` when the input maps to no action.
        """
        command = '##NONE##'
        if c == ' ':
            try:
                while command == '##NONE##':
                    self.talk('choose_short')
                    # Pump the OpenCV event loop. The key itself is discarded.
                    cv2.waitKey(2)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                    command = self.capture_selected_command()
                    if command != '##NONE##':
                        break
                    self.talk('not_understand')
                    self.talk('repeat_options')
                    new_command = self.capture_custom_command(['yes', 'no'])
                    if new_command == 'yes':
                        # List the commands, then loop to listen again.
                        self.talk('commands')
                    else:
                        self.talk('ok')
                        command = 'cancel'
                        break
            except Exception as e:
                print('*c*****')
                print(e)
                print('*c*****')
        elif c == '0':
            try:
                self.talk('choose')
                cv2.waitKey(2)
                if cv2.waitKey(1) & 0xFF != ord('q'):
                    new_command = self.capture_custom_command(['commands', 'keys', 'cancel'])
                    if new_command == 'cancel':
                        return 'cancel'
                    if new_command == 'keys':
                        self.talk('keys')
                    elif new_command == 'commands':
                        self.talk('commands')
                    else:
                        self.talk('not_understand')
                        self.talk('commands')
                    # Every other answer continues with the short dialogue.
                    return self.get_command(' ')
            except Exception as e:
                print('*c*****')
                print(e)
                print('*c*****')

        if c == 'L' or command == 'language':
            return 'language'
        if c == 'A' or command == 'who':
            return 'who'
        if c == 'S' or command == 'save':
            return 'save'
        if c == 'C' or command == 'cancel':
            self.talk('canceled')
            return 'cancel'
        if command == 'repeat':
            self.talk('commands')
            return self.get_command(' ')
        if c == 'Q' or command == 'quit':
            return 'quit'
        return None

    @staticmethod
    def get_language_audios(path, audios, preds):
        """Build audio paths by filling ``[KEY]`` placeholders.

        Declared as a static method because it takes no ``self``. It was
        previously a plain function in the class body and could not be called
        on an instance.

        Args:
            path: Prefix prepended to every entry of ``audios``.
            audios: Iterable of file name templates.
            preds: Mapping of placeholder name to replacement string.

        Returns:
            The list of resolved paths, in the order of ``audios``.
        """
        lang_audios = []
        for audio in audios:
            audio_path = path + audio
            for pred in preds:
                audio_path = audio_path.replace('['+pred+']', preds[pred])
            lang_audios.append(audio_path)
        return lang_audios

    def get_formatted_language_audios(self, predictions):
        """Return the audio files that announce each prediction.

        For every prediction, each ``KEY:template`` line of the loaded audio
        config contributes one path. A ``GENDER`` line is replaced by the
        person's recorded name (``<audio_path>known/<FULL_NAME>.mp3``) when
        ``FULL_NAME`` is set. Any other line uses the template with its
        ``[KEY]`` placeholders filled from the prediction.

        Args:
            predictions: List of prediction dicts with string values.

        Returns:
            The list of audio paths. If an error occurs it is printed and the
            paths collected so far are returned.
        """
        lang_audios = []
        try:
            print(predictions)
            for prediction in predictions:
                for audio in self.config_audios:
                    key = audio.split(':')[0]
                    if key == 'GENDER' and prediction['FULL_NAME'] != '':
                        audio_path = self.audio_path + 'known/' + prediction['FULL_NAME'] + '.mp3'
                    else:
                        audio_path = self.language_path + self.current_language + '/' + audio.split(':')[1]
                        for field in prediction:
                            audio_path = audio_path.replace('['+field+']', prediction[field])
                    lang_audios.append(audio_path)
        except Exception as e:
            print('*a******')
            print(e)
            print('*a******')
        return lang_audios

    def get_formatted_language_text(self, prediction):
        """Build the on-screen label for one prediction.

        ``<language>/text_config.txt`` is read as ``<suffix>:<template>``.
        The suffix, with its placeholders filled from ``prediction``, selects
        the gettext domain ``text_<suffix>``. In the template, ``[GENDER]``
        becomes the known name if there is one, else the translated gender,
        and ``[EMOTION]`` becomes the translated emotion.

        Args:
            prediction: Dict with the keys ``NAME``, ``GENDER`` and
                ``EMOTION`` (string values).

        Returns:
            The formatted text. If an error occurs it is printed and the text
            built so far (possibly empty) is returned.
        """
        lang_text = ''
        try:
            text_config = ''
            with open(self.language_path + self.current_language + '/text_config.txt') as f:
                for line in f:
                    text_config += line.rstrip()
            g = text_config.split(':')[0]
            lang_text = text_config.split(':')[1]

            for key in prediction:
                g = g.replace('['+key+']', prediction[key])

            l = gettext.translation('text_' + g, localedir=self.language_path, languages=[self.current_language])
            l.install()
            translate = l.gettext

            t = ''
            if prediction['NAME'] != '':
                t = prediction['NAME']
            elif prediction['GENDER'] != '':
                t = translate(str(prediction['GENDER']))
            lang_text = lang_text.replace('[GENDER]', t)

            t = ''
            if prediction['EMOTION'] != '':
                t = translate(prediction['EMOTION'])
            lang_text = lang_text.replace('[EMOTION]', t)
        except Exception as e:
            print('*t******')
            print(e)
            print('*t******')
        return lang_text

    def talk(self, audio_name):
        """Play the prompt ``<language>/speech/<audio_name>.mp3`` of the current language."""
        self.play(self.language_path + self.current_language + '/speech/' + audio_name + '.mp3')

    def play(self, audio_path):
        """Play an audio file and block until it has finished."""
        play_audio(audio_path)


import matplotlib.pyplot as plt
from scipy.misc import imread
from scipy.misc import imresize
from random import shuffle
import numpy as np
import cv2
from keras.models import load_model
import numpy as np
from statistics import mode
import glob
import os
import face_recognition
import string 
from random import *
from gtts import gTTS

def get_labels(dataset_name):
    """Return the class-index to label mapping of a dataset.

    For ``'fer2013'`` index 2 is ``'fear'``. It was previously mapped to
    ``'sad'``, duplicating index 4, so fear predictions were reported as
    sadness.

    NOTE(review): the labels are also used as translation keys and in audio
    file names, so the language resources presumably need a ``fear`` entry.
    Verify this against the ``lang`` directory.

    Args:
        dataset_name: ``'fer2013'`` (emotions) or ``'imdb'`` (gender).

    Returns:
        A dict mapping the integer class index to its label string.

    Raises:
        ValueError: If ``dataset_name`` is not one of the known datasets.
    """
    if dataset_name == 'fer2013':
        return {0: 'angry', 1: 'disgust', 2: 'fear', 3: 'happy',
                4: 'sad', 5: 'surprise', 6: 'neutral'}
    if dataset_name == 'imdb':
        return {0: 'woman', 1: 'man'}
    raise ValueError('Invalid dataset name')

def preprocess_input(images):
    """Return ``images`` divided by 255.0."""
    return images / 255.0

def _imread(image_name):
    """Load ``image_name`` with the ``imread`` imported from ``scipy.misc``."""
    loaded = imread(image_name)
    return loaded

def _imresize(image_array, size):
    """Resize ``image_array`` to ``size`` with the ``imresize`` imported from ``scipy.misc``."""
    resized = imresize(image_array, size)
    return resized

def split_data(ground_truth_data, training_ratio=.8, do_shuffle=False):
    """Split the keys of a ground-truth mapping into training and validation lists.

    The keys are sorted first, so the split is deterministic unless
    ``do_shuffle`` is set.

    Args:
        ground_truth_data: Mapping whose keys identify the samples.
        training_ratio: Fraction of the keys assigned to the training list.
            The count is rounded to the nearest integer.
        do_shuffle: When true, shuffle the sorted keys before splitting.

    Returns:
        A ``(train_keys, validation_keys)`` tuple of lists.
    """
    keys = sorted(ground_truth_data.keys())
    if do_shuffle:
        shuffle(keys)
    num_train = int(round(training_ratio * len(keys)))
    return keys[:num_train], keys[num_train:]

def display_image(image_array):
    """Show ``image_array`` with matplotlib after squeezing it and casting to uint8."""
    pixels = np.squeeze(image_array)
    plt.imshow(pixels.astype('uint8'))
    plt.show()

def to_categorical(integer_classes, num_classes=2):
    """One-hot encode a sequence of integer class indices.

    Args:
        integer_classes: Sequence of class indices.
        num_classes: Width of each one-hot row.

    Returns:
        A float array with one row per input index and ``num_classes``
        columns, holding 1 in the column named by the index and 0 elsewhere.
    """
    labels = np.asarray(integer_classes, dtype='int')
    row_count = labels.shape[0]
    one_hot = np.zeros((row_count, num_classes))
    row_indices = np.arange(row_count)
    one_hot[row_indices, labels] = 1
    return one_hot


# parameters
# Model file locations. Model_Helper.__init__ reads gender_model_path straight
# from module scope. Its own detection_model_path and emotion_model_path
# parameters shadow the two names below inside the constructor.
detection_model_path = '../models/face/haarcascade_frontalface_default.xml'
emotion_model_path = '../models/emotion/simple_CNN.530-0.65.hdf5'
gender_model_path = '../models/gender/simple_CNN.81-0.96.hdf5'
# Class-index -> label lookups used to decode the classifiers' argmax output.
emotion_labels = get_labels('fer2013')
gender_labels = get_labels('imdb')
# Once the label windows reach this length, the main loop drops the oldest
# entry before taking the mode.
frame_window = 10
# Margins (pixels) added around a detected face for the grayscale emotion crop.
x_offset_emotion = 20
y_offset_emotion = 40
# Margins (pixels) added around a detected face for the colour gender crop.
x_offset = 30
y_offset = 60

class Model_Helper:
    """Holds the detection/classification models and the known-face registry.

    ``known_faces`` is a list of ``[name, file_stem, encoding]`` entries. Each
    entry corresponds to ``<image_path>known/<file_stem>.jpg`` and
    ``<audio_path>known/<file_stem>.mp3``.
    """

    def __init__(self, detection_model_path, emotion_model_path, current_language, audio_path, image_path):
        """Load the models and encode every image found in ``<image_path>known/``.

        NOTE(review): the gender classifier is loaded from the module-level
        ``gender_model_path`` and ``current_language`` is never read here.
        Confirm whether the third argument was meant to be the gender model
        path.

        Args:
            detection_model_path: Haar cascade file for face detection.
            emotion_model_path: Model file of the emotion classifier.
            current_language: Unused.
            audio_path: Directory prefix for audio files.
            image_path: Directory prefix for image files.
        """
        self.audio_path = audio_path
        self.image_path = image_path

        print('Loading gender detector...')
        self.gender_classifier = load_model(gender_model_path)

        print('Loading face detector...')
        self.face_detection = cv2.CascadeClassifier(detection_model_path)

        print('Loading emotion detector...')
        self.emotion_classifier = load_model(emotion_model_path)

        print('Loading known faces...')
        self.known_faces = []
        pattern = self.image_path + 'known/*.*'
        for filepath in glob.iglob(pattern, recursive=True):
            try:
                filename = os.path.splitext(os.path.basename(filepath))[0]
                # File stems look like "<name>-<random suffix>".
                name = os.path.splitext(filename)[0].split('-')[0]
                picture = face_recognition.load_image_file(filepath)
                encoding = face_recognition.face_encodings(picture)[0]
                self.known_faces.append([name, filename, encoding])
            except Exception as e:
                # The image could not be encoded: delete it together with its
                # audio file. A failure of the clean-up is only printed.
                try:
                    os.remove(self.image_path + 'known/' + filename+'.jpg')
                    os.remove(self.audio_path + 'known/' + filename+'.mp3')
                except Exception as e:
                    print(e)

        print('{} faces loaded'.format(len(self.known_faces)))

    def update_known_faces(self, name, audio_file_name, face_encoding, current_encoding):
        """Replace registry entries that match ``current_encoding`` with a new entry.

        Entries that match have their image and audio files deleted and are
        dropped. All other entries are kept, and the new entry is appended.

        Args:
            name: Display name of the person.
            audio_file_name: File stem shared by the new image and audio files.
            face_encoding: Encoding stored with the new entry.
            current_encoding: Encoding compared against the existing entries.
        """
        kept = []

        # Drop earlier entries of the same face
        for known in self.known_faces:
            stem = known[1]
            same_face = face_recognition.compare_faces([known[2]], current_encoding)[0]
            if not same_face:
                print(stem + ' no match')
                kept.append(known)
                continue
            print(stem + ' is match')
            image_file = self.image_path + 'known/' + stem + '.jpg'
            audio_file = self.audio_path + 'known/' + stem + '.mp3'
            os.remove(image_file)
            print(image_file + ' deleted')
            os.remove(audio_file)
            print(audio_file + ' deleted')

        # Register the new face
        kept.append([name, audio_file_name, face_encoding])
        print(name + ' added')
        self.known_faces = kept

    def save_face(self, name, language, face, current_encoding):
        """Persist a new known face and register it.

        Writes a spoken version of ``name`` (via gTTS) and the face image
        under a random ``<name>-<suffix>`` stem, encodes the saved image and
        updates the registry.

        Args:
            name: Name of the person.
            language: Language code passed to gTTS.
            face: Face image written with ``cv2.imwrite``.
            current_encoding: Encoding of the face as seen live, used to
                replace earlier entries of the same person.

        Returns:
            The file stem of the saved face, or ``''`` if anything failed (the
            error is printed).
        """
        try:
            suffix = "".join(choice(string.ascii_letters) for x in range(randint(8, 8)))
            full_name = name + '-' + suffix
            path_audio = self.audio_path + 'known/' + full_name + '.mp3'
            path_image = self.image_path + 'known/' + full_name + '.jpg'

            # Spoken version of the name
            gTTS(text=name, lang=language, slow=False).save(path_audio)

            cv2.imwrite(path_image, face)

            # Encode the image that was just written
            saved_picture = face_recognition.load_image_file(path_image)
            face_encoding = face_recognition.face_encodings(saved_picture)[0]

            self.update_known_faces(name, full_name, face_encoding, current_encoding)
            return full_name
        except Exception as e:
            print('**s****')
            print(e)
            print('**s****')
            return ''

        


import cv2
from operator import itemgetter

# Variables
# Face encodings are recomputed once encoding_count reaches ENCODING_FREQ
# frames, or as soon as the number of detected faces changes.
ENCODING_FREQ = 10
encoding_count = 0
last_faces_count = 0
face_encodings = []
predictions = []
font = cv2.FONT_HERSHEY_SIMPLEX

# Recent emotion/gender labels; the main loop takes their mode.
emotion_label_window = []
gender_label_window = []
# Colour crops of the faces seen in the current frame (used by "save").
last_faces = []
# Template copied for every detected face and filled in by the main loop.
label_dict = {'EMOTION': '', 'GENDER': '', 'NAME': '', 'FULL_NAME': ''}

# Language and localization
AVAILABLE_LANGUAGES = ['es','en']
LANGUAGE = 'es'
LANGUAGE_PATH = '../lang/'
AUDIO_PATH = '../audio/'
IMAGE_PATH = '../images/'

lang_helper = Lang_Helper(AVAILABLE_LANGUAGES, LANGUAGE_PATH, AUDIO_PATH, IMAGE_PATH, LANGUAGE)


# Models
# NOTE(review): the third positional argument lands in Model_Helper's
# `current_language` parameter, which the constructor does not read; the
# gender model is loaded from the module-level gender_model_path instead.
model_helper = Model_Helper('../models/face/haarcascade_frontalface_default.xml', 
                            '../models/emotion/simple_CNN.530-0.65.hdf5', 
                            '../models/gender/simple_CNN.81-0.96.hdf5',
                            AUDIO_PATH, IMAGE_PATH)



# Input image 
# Preview window and capture from device index 0.
cv2.namedWindow('main')
video_capture = cv2.VideoCapture(0)


# Main loop: grab a frame, detect and annotate faces, show the frame, then
# handle one key/voice command. Runs until the 'quit' command is given or a
# frame can no longer be read.
while True:
    predictions = []
    encoding_count += 1
    last_faces_count = len(last_faces)
    last_faces = []
    grabbed, frame = video_capture.read()
    if not grabbed:
        # No frame available; stop instead of failing inside cvtColor, so the
        # camera and windows are released below.
        break

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    faces = model_helper.face_detection.detectMultiScale(gray, 1.3, 5)

    # Re-encode periodically, or whenever the number of faces changed.
    do_encode = encoding_count >= ENCODING_FREQ or last_faces_count != len(faces)

    if do_encode:
        face_encodings = []

    face_index = 0

    # Process faces left to right so that indices stay stable between frames.
    for (x, y, w, h) in sorted(faces, key=itemgetter(0)):

        pred_dict = label_dict.copy()

        face_index += 1
        face = frame[(y - y_offset):(y + h + y_offset),
                     (x - x_offset):(x + w + x_offset)]
        if do_encode:
            print('re-encoding')
            # face_recognition expects boxes as (top, right, bottom, left).
            face_encodings.append(face_recognition.face_encodings(frame, [tuple([int(y), int(x+w), int(y+h), int(x)])])[0])
            encoding_count = 0

        try:
            # The original condition used bitwise `&`, which binds tighter
            # than the comparisons and left the index bound unchecked.
            if face_encodings and face_index - 1 < len(face_encodings):
                for known_face in model_helper.known_faces:
                    match = face_recognition.compare_faces([known_face[2]], face_encodings[face_index-1])
                    if match[0]:
                        pred_dict['NAME'] = known_face[0]
                        pred_dict['FULL_NAME'] = known_face[1]
                        break

        except Exception as e:
            print('*******')
            print(e)
            print('*******')
            continue
        last_faces.append(cv2.cvtColor(face.copy(), cv2.COLOR_RGB2BGR))

        gray_face = gray[(y - y_offset_emotion):(y + h + y_offset_emotion),
                         (x - x_offset_emotion):(x + w + x_offset_emotion)]
        try:
            face = cv2.resize(face, (48, 48))
            gray_face = cv2.resize(gray_face, (48, 48))
        except Exception:
            # Resizing failed for this crop; skip this face.
            continue
        face = np.expand_dims(face, 0)
        face = preprocess_input(face)
        gender_label_arg = np.argmax(model_helper.gender_classifier.predict(face))
        gender = gender_labels[gender_label_arg]
        gender_label_window.append(gender)

        gray_face = preprocess_input(gray_face)
        gray_face = np.expand_dims(gray_face, 0)
        gray_face = np.expand_dims(gray_face, -1)
        emotion_label_arg = np.argmax(model_helper.emotion_classifier.predict(gray_face))
        emotion = emotion_labels[emotion_label_arg]
        emotion_label_window.append(emotion)

        # Keep the label windows bounded.
        if len(gender_label_window) >= frame_window:
            emotion_label_window.pop(0)
            gender_label_window.pop(0)
        try:
            emotion_mode = mode(emotion_label_window)
            gender_mode = mode(gender_label_window)
        except Exception:
            # mode() could not pick a single label; skip this face.
            continue
        if gender_mode == gender_labels[0]:
            gender_color = (255, 0, 0)
        else:
            gender_color = (0, 255, 0)

        pred_dict['EMOTION'] = emotion_mode
        pred_dict['GENDER'] = gender_mode

        display_text = lang_helper.get_formatted_language_text(pred_dict)

        cv2.rectangle(frame, (x, y), (x + w, y + h), gender_color, 2)
        cv2.putText(frame, display_text, (x, y - 30), font,
                    .7, gender_color, 1, cv2.LINE_AA)

        predictions.append(pred_dict)

    try:
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        cv2.imshow('main', frame)
    except Exception:
        continue
    # waitKey returns -1 when no key was pressed, which maps to chr(255) ('ÿ').
    c = chr(cv2.waitKey(2) & 255)
    if (c != 'ÿ'):
        print(c + " pressed")
    command = lang_helper.get_command(c.upper())
    if (command == 'language'):
        print('*** Language change *** ')
        lang_helper.switch_to_next_language()
        lang_helper.talk('lang_change')
    elif (command == 'who'):
        print('*** Output predictions selected *** ')
        if (len(predictions) > 0):
            lang_audios = lang_helper.get_formatted_language_audios(predictions)
            for lang_audio in lang_audios:
                lang_helper.play(lang_audio)
        else:
            lang_helper.talk('no_image')
    elif (command == 'save'):
        print('*** Save person selected *** ')
        try:
            if (len(last_faces) == 1):
                name = '##NONE##'
                # Keep asking until a name is understood or the user cancels.
                while name == '##NONE##':
                    lang_helper.talk('who')
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                    else:
                        name = lang_helper.capture_name()
                        if (name == '##NONE##'):
                            lang_helper.talk('not_understand')
                        elif (name == 'cancel'):
                            lang_helper.talk('canceled')
                            break
                        else:
                            print('saving face...')
                            full_name = model_helper.save_face(name, lang_helper.current_language, last_faces[0], face_encodings[face_index-1])
                            print('///////')
                            print(full_name)
                            print(lang_helper.audio_path + 'known/' + full_name + '.mp3')
                            if (full_name != ''):
                                lang_helper.play(lang_helper.audio_path + 'known/' + full_name + '.mp3')
                                lang_helper.talk('saved')
                            break
            elif (len(last_faces) > 1):
                lang_helper.talk('more_than_one_face')
            else:
                lang_helper.talk('no_image')
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt and
            # made the loop impossible to stop with Ctrl-C.
            continue
    elif (command == 'quit'):
        break


video_capture.release()
cv2.destroyAllWindows()



Using TensorFlow backend.


Loading gender detector...
Loading face detector...
Loading emotion detector...
Loading known faces...
3 faces loaded
