In [None]:
from transformers import AutoTokenizer, TFRobertaForMaskedLM
import tensorflow as tf
import pyaudio
import wave
from pydub import AudioSegment
import speech_recognition as sr
from tensorflow.keras.models import load_model
import cv2 as cv
import numpy as np
import os

In [None]:
# Load the RoBERTa tokenizer and the TensorFlow masked-language-model head.
# Both are created from the same checkpoint name so that the token ids the
# tokenizer produces match the vocabulary the model was trained with.
_CHECKPOINT = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = TFRobertaForMaskedLM.from_pretrained(_CHECKPOINT)

In [None]:
# Smoke test: let the masked-LM fill in a single <mask> token.
inputs = tokenizer("Can i have something to <mask>", return_tensors="tf")
logits = model(**inputs).logits

# Positions in the first (and only) sequence whose id is the <mask> token id.
mask_token_index = tf.where(
    tf.equal(inputs["input_ids"][0], tokenizer.mask_token_id)
)

# Vocabulary logits at the masked position(s), then the best-scoring token id.
selected_logits = tf.gather_nd(logits[0], mask_token_index)
predicted_token_id = tf.argmax(selected_logits, axis=-1)

# Last expression of the cell: the decoded prediction is shown as cell output.
tokenizer.decode(predicted_token_id)

In [None]:
def record_audio(output_path="recording.wav", rate=44100, channels=1, chunk=1024):
    """Record 16-bit PCM audio from the default input device into a WAV file.

    Recording runs until the user interrupts the kernel (KeyboardInterrupt);
    everything captured up to that point is then written to ``output_path``.

    Args:
        output_path: Destination WAV file. Defaults to ``"recording.wav"``.
        rate: Sampling rate in Hz.
        channels: Number of input channels.
        chunk: Frames read from the stream per call (also the stream's
            ``frames_per_buffer``).

    Returns:
        None. The recording is written to ``output_path``.
    """
    sample_format = pyaudio.paInt16
    frames = []

    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(format=sample_format,
                            channels=channels,
                            rate=rate,
                            input=True,
                            frames_per_buffer=chunk)
        try:
            print("Recording Started")
            while True:
                frames.append(stream.read(chunk))
        except KeyboardInterrupt:
            # Interrupting the kernel is the intended way to stop recording.
            print("Recording Stopped")
        finally:
            # Release the device even if reading failed for another reason.
            stream.stop_stream()
            stream.close()

        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(sample_format)
    finally:
        audio.terminate()

    # The context manager closes the file, which finalises the WAV header
    # before any later step tries to read the recording back.
    with wave.open(output_path, "wb") as sound_file:
        sound_file.setnchannels(channels)
        sound_file.setsampwidth(sample_width)
        sound_file.setframerate(rate)
        sound_file.writeframes(b"".join(frames))

In [None]:
def get_audio_text(audio_path='recording.wav'):
    """Transcribe a WAV file with the Google Web Speech recognizer.

    Args:
        audio_path: Path of the audio file to transcribe. Defaults to
            ``'recording.wav'``.

    Returns:
        The recognized text with a trailing ``"."`` appended.

    Raises:
        sr.UnknownValueError: If the speech could not be understood.
        sr.RequestError: If the recognition service could not be reached.
    """
    recognizer = sr.Recognizer()

    with sr.AudioFile(audio_path) as source:
        # record() consumes the whole file. listen() is meant for live input
        # and stops at the first pause, which can truncate the transcription.
        audio_data = recognizer.record(source)

    return recognizer.recognize_google(audio_data, show_all=False) + "."

In [None]:
def get_room_classification(image_path='kitchen_test.jpg',
                            model_path='models/VGG19-Classification.h5'):
    """Classify the room shown in an image and describe it as a sentence.

    Args:
        image_path: Image file to classify. Defaults to ``'kitchen_test.jpg'``.
        model_path: Saved Keras model to load. Defaults to
            ``'models/VGG19-Classification.h5'``.

    Returns:
        A sentence of the form ``"Currently in the <label>."``.

    Raises:
        ValueError: If OpenCV cannot read ``image_path`` (``imread`` returned
            ``None``).
    """
    # Must stay in the same order as the model's output units.
    # NOTE(review): 'Dinning' looks like a misspelling of 'Dining'; it is kept
    # unchanged so the returned sentence stays the same - confirm before fixing.
    labels = ['Bathroom', 'Bedroom', 'Dinning', 'Kitchen', 'Living Room']

    # imread signals failure by returning None instead of raising, so check
    # before the (slow) model load and fail with a readable message.
    img = cv.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image file: {image_path!r}")

    classifier = load_model(model_path)

    # NOTE(review): OpenCV returns channels in BGR order; confirm the model
    # was trained on the same channel order.
    resized = tf.image.resize(img, (224, 224))
    # Scale pixel values to [0, 1] and add a leading batch axis of size 1.
    input_image = np.expand_dims(resized / 255, 0)

    yhat = classifier.predict(input_image)
    predicted_label = labels[np.argmax(yhat)]

    return "Currently in the " + predicted_label + "."
    

In [None]:
def final():
    """Run the full pipeline and print the masked-LM's suggested word.

    Steps:
        1. Record audio from the microphone (until interrupted).
        2. Transcribe the recording to text.
        3. Classify the room from the test image.
        4. Build a prompt ending in the mask token and let the global
           ``tokenizer``/``model`` predict the masked word.

    Prints the assembled prompt and the predicted word. Returns None.
    """
    record_audio()
    audio_text = get_audio_text()
    room_class = get_room_classification()
    user_text = "Can I have something to"

    # Prompt layout: "<room sentence> <transcribed speech> <request stem> <mask>".
    # The mask string is taken from the tokenizer so it always matches the
    # mask_token_id looked up below.
    prompt = " ".join([room_class, audio_text, user_text, tokenizer.mask_token])

    print(prompt)

    inputs = tokenizer(prompt, return_tensors="tf")
    logits = model(**inputs).logits
    # retrieve index of <mask>
    mask_token_index = tf.where((inputs.input_ids == tokenizer.mask_token_id)[0])
    selected_logits = tf.gather_nd(logits[0], indices=mask_token_index)
    predicted_token_id = tf.math.argmax(selected_logits, axis=-1)

    # Decoded tokens may carry a leading space; strip it for clean output.
    predicted_word = tokenizer.decode(predicted_token_id).strip()
    print("Predicted word: " + predicted_word)