# Imports

In [None]:
import datetime
import math
import os
import random
import re
import time
import unicodedata
import wave
from subprocess import call

from torchaudio.models.decoder import download_pretrained_files
from torchaudio.models.decoder import ctc_decoder
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
from torch import Tensor
from TTS.api import TTS
import torch.nn as nn
import torchaudio
import torch

from IPython.display import clear_output
import screen_brightness_control as sbc
from bs4 import BeautifulSoup as bs4
from playsound import playsound
import numpy as np
import pyautogui
import keyboard
import requests
import pyaudio
import psutil

In [None]:
# Device used for tensors created in this notebook: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Chatbot

In [None]:
# Reserved vocabulary indices (they match the first entries of Lang.index2word):
# SOS seeds the decoder input, EOS is appended to every encoded sentence and
# OOV stands in for words that are missing from a vocabulary.
SOS_token = 0
EOS_token = 1
OOV_token = 2

class Lang(nn.Module):
    """Vocabulary that maps words to integer indices and counts word usage.

    Attributes:
        name: label given to this vocabulary.
        word2index: word -> index for every word added so far.
        word2count: word -> number of times the word was added.
        index2word: index -> word, pre-seeded with the special tokens.
        n_words: the index that the next new word will receive.
    """

    def __init__(self, name):
        # The class derives from nn.Module, so the base initialiser has to run;
        # without it any Module API (parameters(), repr(), .to(), ...) raises
        # AttributeError on the half-constructed instance.
        super().__init__()
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "OOV", 3: "PAD"}
        # Counts SOS, EOS and OOV.
        # NOTE(review): because this starts at 3, the first added word takes
        # index 3 and replaces the "PAD" entry above. Left unchanged so indices
        # stay compatible with vocabularies built by the existing code --
        # confirm whether PAD was meant to be reserved.
        self.n_words = 3

    def addSentence(self, sentence):
        """Add every space-separated word of `sentence` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register `word`, or bump its count when it is already known."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [None]:
# Strip accents from a Unicode string, approach taken from
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` NFD-decomposed with all combining marks (category "Mn") removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, strip accents and keep only letters plus ' / ! ?
def normalizeString(s):
    """Normalise raw text into the space-separated form used by the vocabularies.

    The substitutions run in this order: put a space before . ! ?, collapse
    every run of characters outside [a-zA-Z'/!?] into one space, then put a
    space before each apostrophe. The result is stripped of outer whitespace.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z'/!?]+", r" "),
        (r"'", " '"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [None]:
def indexesFromSentence(lang, sentence):
    """Return the vocabulary index of every word in the normalised `sentence`.

    Words that are not in `lang.word2index` are encoded as OOV_token.
    """
    tokens = normalizeString(sentence).split(' ')
    known = lang.word2index
    return [known.get(token, OOV_token) for token in tokens]

def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a (1, n_tokens) long tensor on `device`, ending with EOS_token."""
    token_ids = indexesFromSentence(lang, sentence)
    token_ids.append(EOS_token)
    row = torch.tensor(token_ids, dtype=torch.long, device=device)
    return row.view(1, -1)

def tensorsFromPair(pair):
    """Encode pair[0] with the global input_lang and pair[1] with the global output_lang.

    Returns:
        (input_tensor, target_tensor), each built by tensorFromSentence.
    """
    source_text, target_text = pair[0], pair[1]
    encoded_source = tensorFromSentence(input_lang, source_text)
    encoded_target = tensorFromSentence(output_lang, target_text)
    return (encoded_source, encoded_target)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A (maxlen, 1, emb_size) table is precomputed and registered as the
    `pos_embedding` buffer: even feature columns hold sines, odd columns hold
    cosines of position * frequency.

    Args:
        emb_size: size of the embedding (last) dimension; odd sizes are supported.
        dropout: dropout probability applied after the addition.
        maxlen: number of positions to precompute.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        # An odd emb_size has one odd column fewer than even columns, so only
        # the first emb_size // 2 frequencies are used for the cosines (for an
        # even emb_size this slice is all of `den`).
        pos_embedding[:, 1::2] = torch.cos(pos * den[: emb_size // 2])
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return dropout(token_embedding + positions).

        The table is sliced by `token_embedding.size(0)`, i.e. the first
        dimension of the input is treated as the position axis, and it is
        broadcast over the second dimension.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# Helper module: looks up an embedding for every token index and scales it by sqrt(emb_size)
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(emb_size)."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings of `tokens` (cast to long before the lookup)."""
        scale = math.sqrt(self.emb_size)
        looked_up = self.embedding(tokens.long())
        return looked_up * scale

In [None]:
class EncoderRNN(nn.Module):
    """Embedding -> dropout -> single-layer batch-first GRU encoder.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        hidden_size: embedding width and GRU hidden size.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Return the GRU's (output, hidden) pair for the token indices in `input`."""
        embedded = self.embedding(input)
        regularised = self.dropout(embedded)
        return self.gru(regularised)

In [None]:
# Number of decoding steps AttnDecoderRNN.forward runs for every input.
MAX_LENGTH = 20

class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Return (context, weights) for `query` attending over `keys`.

        The weights are a softmax over the last axis of the scores and the
        context is their batched matrix product with `keys`.
        """
        energy = torch.tanh(self.Wa(query) + self.Ua(keys))
        # Drop the trailing size-1 score axis, then add a middle axis for bmm.
        scores = self.Va(energy).squeeze(2).unsqueeze(1)
        weights = F.softmax(scores, dim=-1)
        return torch.bmm(weights, keys), weights

class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: embedding width and GRU hidden size.
        output_size: size of the output vocabulary.
        dropout_p: dropout probability applied to the input embeddings.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Run MAX_LENGTH decoding steps starting from SOS_token.

        Args:
            encoder_outputs: encoder outputs used as attention keys; the batch
                size is read from its first dimension.
            encoder_hidden: initial decoder hidden state.
            target_tensor: when given, column i is fed as the next input
                (teacher forcing), so it needs at least MAX_LENGTH columns;
                otherwise the decoder feeds back its own top prediction.

        Returns:
            (log-softmax outputs, final hidden state, attention weights), the
            first and last concatenated over the step axis (dim=1).
        """
        batch_size = encoder_outputs.size(0)
        # Create the SOS input on the same device as the encoder outputs rather
        # than on the notebook-wide default, so decoding works wherever the
        # loaded model actually lives.
        decoder_input = torch.full(
            (batch_size, 1), SOS_token, dtype=torch.long, device=encoder_outputs.device
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own prediction as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        """Decode one step: embed `input`, attend, run the GRU and project to the vocabulary.

        Returns:
            (unnormalised output scores, new hidden state, attention weights).
        """
        embedded =  self.dropout(self.embedding(input))

        # Swap the first two axes of the hidden state so it can act as the
        # batch-first attention query.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [None]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    """Greedy-decode a reply to `sentence` without tracking gradients.

    Returns:
        (decoded_words, decoder_attn): the words of the highest-scoring token
        at each step, ending with '<EOS>' when EOS_token is produced, and the
        attention weights returned by the decoder.
    """
    decoded_words = []
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        # Index of the best-scoring token at every step.
        best_ids = decoder_outputs.topk(1)[1].squeeze()

        for token in best_ids:
            token_id = token.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])
    return decoded_words, decoder_attn

In [None]:
# Load the trained chatbot encoder/decoder and their vocabularies.
# map_location=device lets checkpoints that were saved on a GPU load on a
# CPU-only machine and places the models on the device used for the inputs.
# NOTE: torch.load unpickles whole Python objects, so these files must be
# trusted; recent PyTorch releases default to weights_only=True, which rejects
# pickled modules -- pass weights_only=False there if loading fails.
encoder_nn = torch.load("encoder.pth", map_location=device)
decoder_nn = torch.load("decoder.pth", map_location=device)
input_lang = torch.load("input_lang.pth", map_location=device)
output_lang = torch.load("output_lang.pth", map_location=device)

# Voice entity classification

In [None]:
class ActDropNormCNN1D(nn.Module):
    """LayerNorm -> GELU -> dropout applied over the feature axis of a Conv1d output.

    The input is transposed on axes 1 and 2 so LayerNorm sees `n_feats` last.
    With `keep_shape=True` the result is transposed back; otherwise it is
    returned in the transposed layout.
    """

    def __init__(self, n_feats, dropout, keep_shape=False):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(n_feats)
        self.keep_shape = keep_shape

    def forward(self, x):
        # Order is norm -> activation -> dropout.
        normalised = self.norm(x.transpose(1, 2))
        activated = self.dropout(F.gelu(normalised))
        return activated.transpose(1, 2) if self.keep_shape else activated


class VoiceEntityClassification(nn.Module):
    """Two strided 1-D convolutions followed by dense layers and a single output unit.

    `forward` returns the raw output of `final_fc`; the `sigmoid` module is
    created but not applied inside this class.

    Args:
        hidden_size: size used only by `_init_hidden`.
        num_classes: accepted but not used by the layers below.
        n_feats: number of input channels (feature bins).
        num_layers: layer count used only by `_init_hidden`.
        dropout: dropout probability shared by all dropout layers.
    """

    def __init__(self, hidden_size, num_classes, n_feats, num_layers, dropout):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.cnn1 = nn.Sequential(
            nn.Conv1d(n_feats, n_feats, kernel_size=10, stride=2, padding=5),
            ActDropNormCNN1D(n_feats, dropout, keep_shape=True),
        )
        self.cnn2 = nn.Sequential(
            nn.Conv1d(n_feats, n_feats, kernel_size=10, stride=2, padding=5),
            ActDropNormCNN1D(n_feats, dropout),
        )
        # The first Linear expects exactly 267 time steps after both convolutions.
        # NOTE(review): presumably this matches the fixed-length spectrogram
        # built elsewhere in the notebook -- confirm before changing input length.
        self.dense = nn.Sequential(
            nn.Linear(n_feats * 267, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(128, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        self.layer_norm2 = nn.LayerNorm(128)
        self.dropout2 = nn.Dropout(dropout)
        self.final_fc = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def _init_hidden(self, batch_size):
        """Return a pair of zero tensors shaped (num_layers, batch_size, hidden_size) on `device`."""
        state_shape = (self.num_layers * 1, batch_size, self.hidden_size)
        return (torch.zeros(*state_shape).to(device),
                torch.zeros(*state_shape).to(device)
               )

    def forward(self, x, hidden):
        """Return one unnormalised score per batch element; `hidden` is accepted but unused."""
        features = self.cnn1(x.to(device))
        features = self.cnn2(features)
        # Collapse everything but the batch axis for the dense stack.
        flat = torch.flatten(features, 1)
        dense_out = self.dense(flat)
        head_in = self.dropout2(F.gelu(self.layer_norm2(dense_out)))
        return self.final_fc(head_in)

In [None]:
# Trained speaker classifier. map_location=device keeps a GPU-saved checkpoint
# loadable on a CPU-only machine and matches the device inputs are moved to.
# NOTE: torch.load unpickles Python objects -- only load trusted files.
my_voice = torch.load("my_voice.pth", map_location=device)

# Voice target classifier

In [None]:
class TargetClassifier(nn.Module):
    """Stacked GRUs over a sequence of embeddings, ending in a single output unit.

    Args:
        hidden_size: hidden size of every GRU layer and of the dense layer.
        vocab_size: accepted but not used by the layers below.
        emb_size: feature size of the input sequence (input of the first GRU).
        num_layers: number of stacked GRU layers.
        dropout_p: dropout probability before the output layer.
    """

    def __init__(self, hidden_size, vocab_size, emb_size, num_layers, dropout_p=0.1):
        super().__init__()
        # Positional encoding is intentionally not applied (it was disabled in
        # the original model definition).
        stacked = []
        for layer_idx in range(num_layers):
            # Only the first layer consumes the raw embeddings.
            in_features = hidden_size if layer_idx else emb_size
            stacked.append(nn.GRU(in_features, hidden_size, batch_first=True))
        self.gru_layers = nn.ModuleList(stacked)
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one unnormalised score per batch element, computed from the last time step."""
        sequence = x.to(device)

        for gru_layer in self.gru_layers:
            sequence, _ = gru_layer(sequence)

        last_step = sequence[:, -1, :]
        projected = self.dense(last_step)
        regularised = self.dropout(F.gelu(self.layernorm(projected)))
        return self.out(regularised)

In [None]:
# Trained "is the assistant being addressed" classifier. map_location=device
# keeps a GPU-saved checkpoint loadable on a CPU-only machine.
# NOTE: torch.load unpickles Python objects -- only load trusted files.
voice_target = torch.load("target_recognition.pth", map_location=device)

In [None]:
# Pretrained BERT tokenizer and model; encodeString uses them to turn a
# transcript into the hidden states that are fed to voice_target.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoder = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def encodeString(text, tokenizer, encoder):
    """Encode `text` into the encoder's last hidden states.

    The text is tokenised to exactly 20 tokens (padded or truncated) and run
    through `encoder` without tracking gradients.

    Args:
        text: the string to encode.
        tokenizer: tokenizer exposing `encode_plus` (e.g. a BertTokenizer).
        encoder: model called with the tokenizer output as keyword arguments.

    Returns:
        `last_hidden_state` of the encoder output.
    """
    # `pad_to_max_length` is deprecated; padding="max_length" is its
    # replacement, and truncation is requested explicitly so longer inputs
    # are cut to max_length by design rather than by a warning fallback.
    indexed = tokenizer.encode_plus(text,
                                    max_length=20,
                                    padding="max_length",
                                    truncation=True,
                                    return_attention_mask=True,
                                    return_tensors="pt")

    with torch.no_grad():
        encodings = encoder(**indexed)

    return encodings.last_hidden_state

# Speech Recognition

In [None]:
# Pretrained wav2vec 2.0 ASR pipeline; `bundle.sample_rate` is the rate the
# recorded audio is resampled to before recognition.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
acoustic_model = bundle.get_model()

# Lexicon, token list and 4-gram language model used by the beam search decoder.
files = download_pretrained_files("librispeech-4-gram")

In [None]:
# acoustic_model = torch.load("my_speech_recognition.pth")

In [None]:
# Decoder tuning values, passed below as lm_weight and word_score.
LM_WEIGHT = 3
WORD_SCORE = -0.26

# Lexicon-constrained CTC beam search over the acoustic model's emissions,
# scored with the downloaded language model. It keeps the 3 best hypotheses;
# the main loop only reads the first one.
beam_search_decoder = ctc_decoder(
    lexicon=files.lexicon,
    tokens=files.tokens,
    lm=files.lm,
    nbest=3,
    beam_size=100,
    lm_weight=LM_WEIGHT,
    word_score=WORD_SCORE,
)

# Voice Generation

In [None]:
# Print the models known to the TTS library (informational only).
print(TTS().list_models())

# Text-to-speech model used by respond(); it is kept on the CPU.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cpu")

# Necessary functions

In [None]:
# Words for 0-19; every one of them counts as a "ones" word in word2num.
ones_place = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
    "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
    "fourteen": 14, "fifteen": 15, "sixteen": 16,
    "seventeen": 17, "eighteen": 18, "nineteen": 19,
}

# Multiples of ten from 20 to 90.
tens_place = {
    "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50,
    "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90,
}

# Scale words.
hundreds_place = {
    "hundred": 10**2, "thousand": 10**3, "million": 10**6, "billion": 10**9,
    "trillion": 10**12, "quadrillion": 10**15, "quintillion": 10**18,
}

# Convert a number written in words to an integer.
# Works for numbers up to the quintillions.
# Also works for years such as "nineteen sixty three" or "twenty twenty four".
def word2num(word):
    """Return the integer spelled out in `word`.

    Words that are not number words are skipped, so a whole command such as
    "increase volume by twenty" can be passed in. Returns 0 when no number
    word is present.
    """
    num = 0
    has_ones = False
    has_tens = False
    # Running group values, needed when a scale word follows them.
    tens = 0
    hundreds = 0

    for digit in word.split():
        if digit in ones_place:
            value = ones_place[digit]
            # A real number never has two ones-words in a row, nor a teen
            # right after a tens-word; that only happens in years
            # ("nineteen forty one"), where the earlier part is the century.
            if has_ones or (has_tens and value >= 10):
                num *= 100

            num += value
            tens += value
            hundreds += value
            has_ones = True
            has_tens = False

        elif digit in tens_place:
            value = tens_place[digit]
            if has_ones or has_tens:
                # Year form: "eighteen twelve" differs from
                # "one thousand eight hundred twelve".
                num *= 100
                has_tens = False
            else:
                has_tens = True

            num += value
            tens += value
            hundreds += value
            has_ones = False

        elif digit in hundreds_place:
            scale = hundreds_place[digit]
            if digit != "hundred" and hundreds:
                # Scale the whole pending group (e.g. "one hundred twenty").
                num += hundreds * (scale - 1)
                hundreds = 0
                tens = 0
            else:
                # Scale only the pending tens/ones and remember the result in
                # case a larger scale word follows.
                num += tens * (scale - 1)
                hundreds = tens * scale
                tens = 0
            has_ones = False
            has_tens = False

    return num

In [None]:
# Browser-like User-Agent sent with the weather search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
}
def weather(city, wob):
    """Scrape one value from Google's weather widget for `city`.

    Args:
        city: city name to search for.
        wob: suffix of the widget element id; the element read is `#wob_<wob>`
            (the notebook uses "tm", "hm", "ws" and "pp").

    Returns:
        The stripped text of the first matching element.

    Raises:
        IndexError: when the page contains no such element (for example when
            the page layout changed or the request was blocked).
        requests.exceptions.RequestException: on network errors or timeout.
    """
    # Local stdlib import keeps this function self-contained.
    from urllib.parse import quote_plus

    # Escape the city so names with spaces or symbols form a valid query.
    query = quote_plus(city)
    res = requests.get(
        f'https://www.google.com/search?q={query}+weather&rlz=1C1VDKB_enUS1016US1016&oq={query}&aqs=chrome.0.69i59j69i57j69i59l2j0i271l2j69i61l2.905j0j7&sourceid=chrome&ie=UTF-8',
        headers=headers,
        # Without a timeout a stalled connection would block the assistant forever.
        timeout=10,
    )
    soup = bs4(res.text, 'html.parser')
    matches = soup.select(f"#wob_{wob}")
    if not matches:
        raise IndexError(f"no '#wob_{wob}' element found in the weather page for {city!r}")
    return matches[0].getText().strip()

def volume(vol):
    """Change the PulseAudio master volume through `amixer` according to `vol`.

    "increase"/"decrease" step the volume by the number spoken in `vol`,
    "mute" toggles mute, and anything else sets the volume to that number.
    """
    set_master = ["amixer", "-D", "pulse", "sset", "Master"]
    if "increase" in vol:
        call(set_master + [f"{word2num(vol)}%+"])
    elif "decrease" in vol:
        call(set_master + [f"{word2num(vol)}%-"])
    elif "mute" in vol:
        call(["amixer", "-q", "-D", "pulse", "sset", "Master", "toggle"])
    else:
        call(set_master + [f"{word2num(vol)}%"])

In [None]:
# Microphone capture settings used by the main loop and by save().
chunk = 1024  # frames read from the stream per call (also frames_per_buffer)
sample_format = pyaudio.paInt16  # sample format of the input stream
channels = 1  # number of input channels
fs = 16000  # recording sample rate passed to the stream
seconds = 5  # length of the sliding audio window kept in `frames`
filename = "temp.wav"  # scratch file the window is written to before loading
frames = []  # recorded chunks (bytes) of the current window

# PortAudio interface; opened streams and sample sizes come from it.
p = pyaudio.PyAudio()

In [None]:
def save(waveform):
    """Write the recorded chunks to the scratch WAV file `filename`.

    Args:
        waveform: iterable of raw audio byte chunks as read from the stream.
    """
    # The context manager closes the file even when a write fails.
    with wave.open(filename, "wb") as wf:
        # Use the same settings the stream was opened with so the file header
        # can never disagree with the recorded data.
        wf.setnchannels(channels)
        wf.setsampwidth(p.get_sample_size(sample_format))
        wf.setframerate(fs)
        # write the frames as bytes
        wf.writeframes(b"".join(waveform))

In [None]:
def google(command):
    """Open or close Google Chrome by sending keyboard input.

    "open" sends ctrl+n when a process named "chrome" is already running,
    otherwise it launches Chrome through the start menu. "close" sends Alt+f
    followed by x. Any other command does nothing.
    """
    if command == "open":
        chrome_running = any(proc.name() == "chrome" for proc in psutil.process_iter())
        if chrome_running:
            # Chrome is already open: create a new separate window
            pyautogui.hotkey('ctrl', 'n')
        else:
            # Chrome is not open: start it from the start menu
            pyautogui.press("winleft")
            pyautogui.typewrite("google chrome")
            pyautogui.press("enter")
    elif command == "close":
        pyautogui.hotkey("Alt", "f")
        pyautogui.press("x")

In [None]:
def anaconda(command):
    """Placeholder: Anaconda is not currently used on this device, so nothing happens."""
    return None

In [None]:
def timer(command):
    """Return the duration in seconds spoken in `command`.

    The number is read with word2num; it is multiplied by 3600 when the
    command mentions "hour", otherwise by 60 when it mentions "minute", and is
    taken as seconds when neither appears.
    """
    amount = word2num(command)
    if "hour" in command:
        return amount * 3600
    if "minute" in command:
        return amount * 60
    return amount

In [None]:
def respond(chat):
    """Generate the assistant's reply to `chat`, run its commands and speak it.

    The chatbot's output may contain `/u...` placeholders. Informational ones
    (date, time, weather values) are replaced by the live value; action ones
    (volume, tabs, google, anaconda, timer) are executed and removed from the
    spoken text. Action arguments are read from between the apostrophes that
    follow the placeholder.

    Args:
        chat: the user's transcribed sentence.

    Returns:
        (response, desired_seconds): the final reply text and the timer length
        in seconds, or None when no timer was requested. The `/usleep` path
        returns the same pair without speaking.
    """
    output = evaluate(encoder_nn, decoder_nn, chat, input_lang, output_lang)
    # The last decoded word is dropped before joining.
    # NOTE(review): presumably this removes the '<EOS>' marker -- confirm.
    response = ' '.join(output[0][:-1])
    desired_seconds = None
    print(response)

    # Get the date
    if "/udate" in response:
        response = response.replace("/udate", datetime.date.today().strftime("%B %d, %Y"))
    # Get the time. The word boundary keeps "/utimer" intact; a plain substring
    # test would rewrite it here and the timer branch below could never run.
    if re.search(r"/utime\b", response):
        now_text = datetime.datetime.now().strftime("%I:%M:%S")
        response = re.sub(r"/utime\b", lambda _match: now_text, response)
    # Get the temperature
    if "/utemp" in response:
        response = response.replace("/utemp", weather("boston", "tm"))
    # Get the humidity
    if "/uhumidity" in response:
        response = response.replace("/uhumidity", weather("boston", "hm"))
    # Get the wind speed
    if "/uwind" in response:
        response = response.replace("/uwind", weather("boston", "ws"))
    # Get the amount of precipitation
    if "/uprecipitation" in response:
        response = response.replace("/uprecipitation", weather("boston", "pp"))
    if "/uvolume" in response:
        after = response.split("/uvolume")[-1]
        vol = after.split("'")[1]
        response = response.replace("/uvolume"+"  '"+vol+"  '", "")
        volume(vol)
    if "/usleep" in response:
        response = response.replace("/usleep", "")
        # Sleeping skips speech, but callers unpack a pair, so return the same
        # shape as the normal path instead of a bare string.
        return response, desired_seconds
    if "/unewtab" in response:
        response = response.replace("/unewtab", "")
        pyautogui.hotkey('ctrl', 't')
    if "/uclosetab" in response:
        response = response.replace("/uclosetab", "")
        pyautogui.hotkey('ctrl', 'w')
    if "/uswitchtab" in response:
        after = response.split("/uswitchtab")[-1]
        new = after.split("'")[1]
        response = response.replace("/uswitchtab"+"  '"+new+"  '", "")
        pyautogui.hotkey('ctrl', str(word2num(new)))
    if "/ugoogle" in response:
        after = response.split("/ugoogle")[-1]
        command = after.split("'")[1]
        response = response.replace("/ugoogle"+"  '"+command+"  '", "")
        google(command)
    if "/uanaconda" in response:
        # Parse this branch's own placeholder (it previously looked for "/ugoogle").
        after = response.split("/uanaconda")[-1]
        command = after.split("'")[1]
        response = response.replace("/uanaconda"+"  '"+command+"  '", "")
        anaconda(command)
    if "/utimer" in response:
        after = response.split("/utimer")[-1]
        command = after.split("'")[1]
        response = response.replace("/utimer"+"  '"+command+"  '", "")
        desired_seconds = timer(command)

    # Synthesize the reply in the reference speaker's voice and play it.
    tts.tts_to_file(text=response,
                    speaker_wav="jarvis_speech_files/killing.wav",
                    language="en",
                    file_path="output.wav")
    playsound('output.wav')

    return response, desired_seconds

In [None]:
# Mel-spectrogram settings passed to pad_waveform for the speaker classifier.
n_mels = 128  # number of mel bins
win_length = 160  # analysis window length
hop_length = 80  # hop between windows
max_length = 85000  # number of samples the waveform is padded or cropped to

In [None]:
def load_waveform(filename):
    """Load an audio file and return its waveform at `bundle.sample_rate`.

    The waveform is resampled only when the file's rate differs from the rate
    of the speech recognition bundle.
    """
    waveform, sample_rate = torchaudio.load(filename)
    target_rate = bundle.sample_rate
    if sample_rate == target_rate:
        return waveform
    return torchaudio.functional.resample(waveform, sample_rate, target_rate)

In [None]:
def pad_waveform(waveform, max_length, bundle, n_mels, win_length, hop_length):
    """Fit the first channel to `max_length` samples and return its log-mel spectrogram.

    Shorter audio is zero-padded at the end; longer audio is centre-cropped.

    Args:
        waveform: tensor whose first row is the channel to use.
        max_length: exact number of samples to keep.
        bundle: object providing `sample_rate` for the mel transform.
        n_mels, win_length, hop_length: MelSpectrogram parameters.

    Returns:
        log(mel spectrogram + 1e-14) of the fitted audio, with a leading
        size-1 axis added before the transform.
    """
    samples = waveform[0]
    n_samples = len(samples)
    if n_samples < max_length:
        padding = torch.zeros(max_length - n_samples, dtype=samples.dtype, device=samples.device)
        fitted = torch.cat((samples, padding))
    else:
        # Slice by start + length: trimming cut // 2 from both ends would keep
        # one sample too many whenever the overshoot is odd.
        start = (n_samples - max_length) // 2
        fitted = samples[start:start + max_length]
    fitted = fitted.unsqueeze(-2)

    spect = torchaudio.transforms.MelSpectrogram(
                                sample_rate=bundle.sample_rate, n_mels=n_mels,
                                win_length=win_length,
                                hop_length=hop_length)(fitted)
    # Stay in torch (instead of np.log on a tensor); the small offset avoids log(0).
    spect = torch.log(spect + 1e-14)

    return spect

In [None]:
def reset_audio(filename):
    """Delete the scratch recording, pause for 3 seconds and return fresh state.

    Returns:
        ([], 0): an empty frame buffer and a zeroed counter for the caller to
        assign.
    """
    os.remove(filename)
    time.sleep(3)
    fresh_frames, fresh_counter = [], 0
    return fresh_frames, fresh_counter

# Implementation

In [None]:
from IPython.display import clear_output
import keyboard
print('Recording')

# Main assistant loop: record a sliding window of microphone audio, transcribe
# it periodically, and once the speaker pauses decide whether the utterance is
# from the owner and addressed to the assistant before answering.
stream = p.open(format=sample_format,
                channels=channels,
                rate=fs,
                frames_per_buffer=chunk,
                input=True)

if os.path.exists(filename):
    os.remove(filename)
frames = []

# Active timer: its length in seconds and the time.time() stamp it started at.
desired_seconds = None
prev_seconds = None

counter = 0
periodically_check = False  # True while an utterance is being followed
transcript_placeholder = ""  # longest transcript seen for the current utterance
speaking_counter = 0  # consecutive checks without transcript growth
check_after = 3  # checks without growth that count as "finished speaking"

try:
    while True:
        # Keep only about the last `seconds` seconds of audio.
        excess = int(len(frames) - (seconds * fs / chunk))
        if excess > 0:
            del frames[:excess]

        data = stream.read(chunk, exception_on_overflow=False)
        frames.append(data)

        if counter % 10 == 0:
            # The scratch file is only read in this branch, so it is written
            # here rather than after every chunk.
            save(frames)
            waveform = load_waveform(filename)  # Load wave
            spect = pad_waveform(waveform, max_length, bundle, n_mels,
                                 win_length, hop_length)  # Pad wave (for my voice classification)

            # Automatic Speech Recognition (inference only, so no autograd graph)
            with torch.no_grad():
                emission, _ = acoustic_model(waveform)
            beam_search_result = beam_search_decoder(emission)
            best_words = beam_search_result[0][0].words
            beam_search_transcript = " ".join(best_words).strip()
            handled_utterance = False

            # Check if still speaking
            if periodically_check:
                if len(transcript_placeholder) >= len(beam_search_transcript):  # transcript is no longer growing
                    if best_words:
                        if speaking_counter == check_after:  # the pause has been long enough
                            periodically_check = False
                            handled_utterance = True
                            print("End of speaking")
                            # Check if it's my voice. The classifier returns a
                            # raw score, so it is squashed and thresholded;
                            # using the tensor itself as a condition is true
                            # for any non-zero score.
                            with torch.no_grad():
                                voice_logit = my_voice(spect, my_voice._init_hidden(1))
                            if torch.sigmoid(voice_logit).item() > 0.5:
                                # Check if I am talking to Jarvis
                                speech = encodeString(beam_search_transcript, tokenizer, encoder)
                                with torch.no_grad():
                                    out = voice_target(speech)
                                target = torch.sigmoid(out)

                                if torch.round(target) == 1:
                                    print("Input: ", beam_search_transcript)
                                    result = respond(beam_search_transcript)
                                    # Accept both a (response, seconds) pair and a bare string.
                                    if isinstance(result, tuple):
                                        response, new_timer = result
                                    else:
                                        response, new_timer = result, None
                                    print("Response: ", response)

                                    # Only a newly requested timer replaces the
                                    # running one; other commands leave it alone.
                                    if new_timer is not None:
                                        desired_seconds = new_timer
                                        prev_seconds = time.time()

                                else:
                                    print("Not adressing Jarvis")
                            else:
                                print("Not my voice")
                            frames, speaking_counter = reset_audio(filename)
                            transcript_placeholder = ""

                        elif speaking_counter < check_after:
                            # The transcript did not grow: count this check
                            speaking_counter += 1
                else:  # The transcript is still increasing
                    speaking_counter = 0
                    transcript_placeholder = beam_search_transcript

            # First detect speaking. Skipped right after an utterance was
            # handled: this transcript belongs to the audio that was just
            # discarded and would otherwise become a stale baseline.
            if best_words and not periodically_check and not handled_utterance:
                print("Speaking detected")
                periodically_check = True
                transcript_placeholder = beam_search_transcript

            print("Input: ", beam_search_transcript)
            clear_output(wait=True)

        counter += 1

        # Fire the timer once when its time has elapsed. An exact == on rounded
        # seconds could be skipped by a slow iteration or fire many times.
        if desired_seconds is not None and time.time() - prev_seconds >= desired_seconds:
            # playsound("timer")
            print("Timer ended")
            desired_seconds = None
except KeyboardInterrupt:
    # Interrupting the kernel (or Ctrl+C) is the way to leave the endless loop.
    pass
finally:
    # Stop and close the stream
    stream.stop_stream()
    stream.close()
    # Terminate the PortAudio interface
    p.terminate()
    print("Program terminated")

In [None]:
# from IPython.display import clear_output
# import keyboard
# print('Recording')

# stream = p.open(format=sample_format,
#                 channels=channels,
#                 rate=fs,
#                 frames_per_buffer=chunk,
#                 input=True)

# if os.path.exists(filename):
#     os.remove(filename)
# frames = []

# counter = 0
# periodically_check = False
# transcript_placeholder = ""
# speaking_counter = 0
# check_after = 3
# while True:
#     if len(frames) > seconds * fs / chunk:
#         for i in range(int(len(frames) - (seconds * fs / chunk))):
#             frames.pop(0)
        
#     data = stream.read(chunk, exception_on_overflow=False)
#     frames.append(data)
    
#     save(frames)
#     if counter % 10 == 0:
#         waveform, sample_rate = torchaudio.load(filename)

#         # Check sample rate
#         if sample_rate != bundle.sample_rate:
#             waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
        
#         # Pad or slice waveform(for detecting my voice)
#         if len(waveform[0]) < max_length:
#             padded_wav = torch.concat((waveform[0], torch.zeros(max_length - len(waveform[0])))).unsqueeze(-2)
#         else:
#             cut_length = len(waveform[0]) - max_length
#             padded_wav = waveform[0][cut_length//2:len(waveform[0])-(cut_length//2)].unsqueeze(-2)
            
#         spect = torchaudio.transforms.MelSpectrogram(
#                                     sample_rate=bundle.sample_rate, n_mels=n_mels,
#                                     win_length=win_length, 
#                                     hop_length=hop_length)(padded_wav)
#         spect = np.log(spect + 1e-14)
#         emission, _ = acoustic_model(waveform)
#         beam_search_result = beam_search_decoder(emission)
#         beam_search_transcript = " ".join(beam_search_result[0][0].words).strip()
        
#         # Check if still speaking
#         if periodically_check:            
#             if len(transcript_placeholder) >= len(beam_search_transcript):
#                 if beam_search_result[0][0].words != []:
#                     if speaking_counter == check_after:
#                         periodically_check = False
#                         print("End of speaking")
#                         # Check if it's my voice
#                         if my_voice(spect, my_voice._init_hidden(1)):
#                             # Check if I am talking to Jarvis
#                             speech = encodeString(beam_search_transcript, tokenizer, encoder)
#                             out = voice_target(speech)
#                             target = F.sigmoid(out)

#                             if torch.round(target) == 1:
#                                 print("Input: ", beam_search_transcript)
#                                 print("Response: ", respond(beam_search_transcript))
#                                 os.remove(filename)
#                                 frames = []
#                                 time.sleep(3)
#                     elif speaking_counter < check_after:
#                         # If the length of the speaking has gone down, increment a counter
#                         speaking_counter += 1
#             else:
#                 speaking_counter = 0
#                 transcript_placeholder = beam_search_transcript
        
#         # First detect speaking
#         if beam_search_result[0][0].words != [] and not periodically_check:
#             periodically_check = True
#             transcript_placeholder = beam_search_transcript
# #             frames = []
# #             os.remove(filename)
        
#         print("Input: ", beam_search_transcript)
#         clear_output(wait=True)
        
#     counter += 1
    
# # Stop and close the stream 
# stream.stop_stream()
# stream.close()
# # Terminate the PortAudio interface
# p.terminate()
# print("Program terminated")