In [None]:
import datetime
import os
import re
import time
import wave

import speech_recognition as sr
from IPython.display import Audio, HTML, display
from TTS.api import TTS

# Initialize TTS model
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
available_speakers = tts.speakers

# Customize speaker mapping (adjust speaker names if known)
speaker_map = {
    "en": available_speakers[0],
    "ru": available_speakers[1]
}

# For test mode without microphone
TEST_MODE = False
test_inputs = iter([
    "english", "no", "no", "John", "25", "sms", "mobile"
])

# Helper to play audio in Jupyter
def display_and_play_audio(file_path="output.wav"):
    display(HTML(f"""
        <audio autoplay>
            <source src="{file_path}" type="audio/wav">
            Your browser does not support the audio element.
        </audio>
    """))

# Speak text using TTS
def speak(text, lang="en"):
    speaker = speaker_map.get(lang, available_speakers[0])
    print("Bot:", text)
    tts.tts_to_file(text=text, speaker=speaker, language=lang, file_path="output.wav")
    display_and_play_audio("output.wav")

# Listen and recognize speech
def listen(language="en-US", retries=2):
    if TEST_MODE:
        return next(test_inputs).lower()

    recognizer = sr.Recognizer()
    for _ in range(retries):
        with sr.Microphone() as source:
            print("üéôÔ∏è Listening...")
            audio = recognizer.listen(source)
        try:
            response = recognizer.recognize_google(audio, language=language)
            print("User:", response)
            return response.lower()
        except sr.UnknownValueError:
            speak("I didn't catch that. Please repeat.", lang=language[:2])
        except sr.RequestError:
            speak("Service error. Try again later.", lang=language[:2])
            return "error"
    return "unknown"

# Validate age from spoken input
def extract_age(text):
    for word in text.split():
        if word.isdigit():
            return int(word)
    return "unknown"

# Save to file (optional)
def save_log(result_dict, filename="call_log.txt"):
    with open(filename, "a", encoding="utf-8") as f:
        f.write(f"{result_dict['client_id']} | {result_dict['result']} | {result_dict['comment']}\n")

# Main bot logic
def make_call(client_id):
    speak("–ó–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ! Hello! This is a call from your bank.", lang="en")
    speak("Please say your preferred language: English or Russian.", lang="en")
    lang_response = listen("en-US")

    if "russian" in lang_response or "—Ä—É—Å" in lang_response:
        lang_code = "ru"
        recog_lang = "ru-RU"
        t = {
            "greet": "–ó–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ! –≠—Ç–æ –∑–≤–æ–Ω–æ–∫ –∏–∑ –≤–∞—à–µ–≥–æ –±–∞–Ω–∫–∞.",
            "debt": "–£ –≤–∞—Å –µ—Å—Ç—å –∑–∞–¥–æ–ª–∂–µ–Ω–Ω–æ—Å—Ç—å –ø–æ —Å—á–µ—Ç—É. –ö—Ä–∞–π–Ω–∏–π —Å—Ä–æ–∫ –æ–ø–ª–∞—Ç—ã ‚Äî –∑–∞–≤—Ç—Ä–∞. –í—ã —É–∂–µ –æ–ø–ª–∞—Ç–∏–ª–∏?",
            "reask": "–Ø –Ω–µ —Ä–∞—Å—Å–ª—ã—à–∞–ª. –ü–æ–≤—Ç–æ—Ä–∏—Ç–µ, –ø–æ–∂–∞–ª—É–π—Å—Ç–∞.",
            "help": "–í–∞–º –Ω—É–∂–Ω–∞ –ø–æ–º–æ—â—å? –Ø –º–æ–≥—É —Å–æ–µ–¥–∏–Ω–∏—Ç—å –≤–∞—Å —Å –æ–ø–µ—Ä–∞—Ç–æ—Ä–æ–º.",
            "callback": "–•–æ—Ä–æ—à–æ, —è –ø–µ—Ä–µ–∑–≤–æ–Ω—é –ø–æ–∑–∂–µ. –î–æ —Å–≤–∏–¥–∞–Ω–∏—è.",
            "tariff": "–ö –≤–∞—à–µ–º—É —Å–≤–µ–¥–µ–Ω–∏—é: —Ç–∞—Ä–∏—Ñ—ã –±—ã–ª–∏ –∏–∑–º–µ–Ω–µ–Ω—ã. –ü–ª–∞—Ç–∞ –∑–∞ –æ–±—Å–ª—É–∂–∏–≤–∞–Ω–∏–µ —É–≤–µ–ª–∏—á–µ–Ω–∞ –Ω–∞ 10 –ø—Ä–æ—Ü–µ–Ω—Ç–æ–≤.",
            "ask_name": "–ö–∞–∫ –≤–∞—Å –∑–æ–≤—É—Ç?",
            "ask_age": "–°–∫–æ–ª—å–∫–æ –≤–∞–º –ª–µ—Ç?",
            "ask_notify": "–ö–∞–∫ –≤—ã —Ö–æ—Ç–∏—Ç–µ –ø–æ–ª—É—á–∞—Ç—å —É–≤–µ–¥–æ–º–ª–µ–Ω–∏—è? –ù–∞–ø—Ä–∏–º–µ—Ä, –ø–æ –°–ú–° –∏–ª–∏ –∑–≤–æ–Ω–∫—É?",
            "call_time": "–í—Ä–µ–º—è –∑–≤–æ–Ω–∫–∞: ",
            "comm_type": "–ö–∞–∫–æ–π —Ç–∏–ø —Å–≤—è–∑–∏ –≤—ã –∏—Å–ø–æ–ª—å–∑—É–µ—Ç–µ: –º–æ–±–∏–ª—å–Ω—ã–π –∏–ª–∏ —Å—Ç–∞—Ü–∏–æ–Ω–∞—Ä–Ω—ã–π —Ç–µ–ª–µ—Ñ–æ–Ω?",
            "history": "–í–∞—à –ø–æ—Å–ª–µ–¥–Ω–∏–π –∫–æ–Ω—Ç–∞–∫—Ç —Å –±–∞–Ω–∫–æ–º –±—ã–ª 2 –Ω–µ–¥–µ–ª–∏ –Ω–∞–∑–∞–¥ –ø–æ –ø–æ–≤–æ–¥—É –∫—Ä–µ–¥–∏—Ç–Ω–æ–π –∫–∞—Ä—Ç—ã.",
            "thanks": "–°–ø–∞—Å–∏–±–æ –∑–∞ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é.",
            "goodbye": "–î–æ —Å–≤–∏–¥–∞–Ω–∏—è!"
        }
    else:
        lang_code = "en"
        recog_lang = "en-US"
        t = {
            "greet": "Hello! This is a call from your bank.",
            "debt": "You have an outstanding bill. The payment deadline is tomorrow. Have you already paid?",
            "reask": "I didn't catch that. Could you please repeat?",
            "help": "Do you need help? I can connect you to our support center.",
            "callback": "Okay, I will call you back later. Goodbye.",
            "tariff": "Just to inform you: the tariffs have changed. Service charges increased by 10 percent.",
            "ask_name": "May I know your name?",
            "ask_age": "How old are you?",
            "ask_notify": "How would you like to receive notifications? For example, via SMS or call?",
            "call_time": "Call time: ",
            "comm_type": "What type of phone connection do you use ‚Äî mobile or landline?",
            "history": "Your last interaction was 2 weeks ago regarding a credit card issue.",
            "thanks": "Thank you for the information.",
            "goodbye": "Goodbye!"
        }

    speak(t["greet"], lang=lang_code)
    speak(t["debt"], lang=lang_code)
    response = listen(recog_lang)

    if any(word in response for word in ["yes", "–¥–∞"]):
        result = "success"
        comment = "confirmed payment"
    elif any(word in response for word in ["no", "–Ω–µ—Ç", "not yet"]):
        result = "success"
        comment = "needs help or follow-up"
    elif any(word in response for word in ["call back", "–ø–µ—Ä–µ–∑–≤–æ–Ω–∏"]):
        speak(t["callback"], lang=lang_code)
        return {"client_id": client_id, "result": "fail", "comment": "asked for callback"}
    elif response in ["unknown", "error"]:
        speak(t["callback"], lang=lang_code)
        return {"client_id": client_id, "result": "fail", "comment": "unreachable"}
    else:
        result = "success"
        comment = "received info"

    speak(t["help"], lang=lang_code)
    help_response = listen(recog_lang)
    if any(word in help_response for word in ["yes", "i need help", "–¥–∞", "–Ω—É–∂–Ω–∞ –ø–æ–º–æ—â—å", "–ø–æ–º–æ–≥–∏"]):
        time.sleep(5)  # simulate transferring

    speak(t["tariff"], lang=lang_code)
    speak(t["ask_name"], lang=lang_code)
    name = listen(recog_lang)

    speak(t["ask_age"], lang=lang_code)
    age_text = listen(recog_lang)
    age = extract_age(age_text)

    speak(t["ask_notify"], lang=lang_code)
    notification = listen(recog_lang)

    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    speak(f"{t['call_time']}{now}", lang=lang_code)

    speak(t["comm_type"], lang=lang_code)
    comm = listen(recog_lang)

    speak(t["history"], lang=lang_code)
    speak(t["thanks"], lang=lang_code)
    speak(t["goodbye"], lang=lang_code)

    return {
        "client_id": client_id,
        "result": result,
        "comment": f"{comment}\nname: {name}\nage: {age}\nnotify: {notification}\ncomm: {comm}"
    }

# Display result
def log_result(result_dict):
    print("\nüìû Call Summary:")
    print("client_id | result     | comment")
    print("----------------------------------------------")
    print(f"{result_dict['client_id']}        | {result_dict['result']} | {result_dict['comment']}")

# Run the bot
call_result = make_call(10001)
log_result(call_result)
# Optional file logging
# save_log(call_result)


 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-

Bot: Please say your preferred language: English or Russian.
 > Text splitted to sentences.
['Please say your preferred language: English or Russian.']
 > Processing time: 0.42269229888916016
 > Real-time factor: 0.11144009989168471


🎙️ Listening...
Bot: I didn't catch that. Please repeat.
 > Text splitted to sentences.
["I didn't catch that.", 'Please repeat.']
 > Processing time: 0.31808900833129883
 > Real-time factor: 0.09031488027578048


🎙️ Listening...
