In [1]:
%pip install groq

Collecting groq
  Using cached groq-0.11.0-py3-none-any.whl.metadata (13 kB)
Collecting distro<2,>=1.7.0 (from groq)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Using cached groq-0.11.0-py3-none-any.whl (106 kB)
Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, groq
Successfully installed distro-1.9.0 groq-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Step 1: Library Imports
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer
import torch
import soundfile as sf
import subprocess
import os
import time
from groq import Groq
from pydub import AudioSegment
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from tempfile import NamedTemporaryFile
# from parler_tts import ParlerTTSForConditionalGeneration


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 2: Audio Conversion Function
def convert_audio_to_wav(input_path, output_path="output.wav", target_sample_rate=16000):
    try:
        command = [
            "ffmpeg", "-i", input_path,
            "-ar", str(target_sample_rate),
            "-ac", "1",
            output_path
        ]
        subprocess.run(command, check=True)
        print(f"Converted {input_path} to {output_path} at {target_sample_rate} Hz.")
    except subprocess.CalledProcessError as e:
        print("An error occurred during conversion:", e)


In [3]:
async def convert_audio_to_wav(input_path, output_path="output.wav"):
    try:
        temp_file = NamedTemporaryFile(delete=False, suffix=".tmp")
        file_location = temp_file.name
        print(temp_file.name)
        with open(file_location, "wb") as f:
            f.write(await input_path)
        
        # Convert the file to WAV format
        wav_file_location = file_location + ".wav"
        try:
            audio = AudioSegment.from_file(file_location)
            audio.export(output_path, format="wav")
        except Exception as e:
            print(f"Error converting file to WAV: {e}")
            return JSONResponse(content={"error": "Failed to convert file to WAV"}, status_code=500)
    except Exception as e:
        return JSONResponse(content={"error": "Failed to convert file to WAV"}, status_code=200)

In [7]:

output_wav_file = "outputFile.wav"
# Transcribe the audio
model_repo = "shReYas0363/whisper-tiny-fine-tuned"

In [8]:
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained(model_repo)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [9]:
# Step 3: Transcription Function
def transcribe_audio(audio_path, model_repo):


    #PREDEFINED UNTIL NOW
    
    audio_input, sampling_rate = sf.read(audio_path)
    
    if len(audio_input.shape) > 1:
        audio_input = audio_input.mean(axis=1)
    
    input_features = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt").input_features
    input_features = input_features.to(device)
    
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    return transcription


In [5]:
# Step 4: GROQ Chat Completion
def generate_chat_response(transcription):
    client = Groq(api_key='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')
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": f'''You are a medical assistant give a very friendly one line response for this query: {transcription} Remember to give a single reply in just one reply'''}],
        model="llama-3.2-1b-preview",
    )
    return chat_completion.choices[0].message.content


In [6]:
# # Step 5: Text-to-Speech (TTS) Functionality
# def synthesize_speech(prompt):
#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
#     model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
#     tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
    
#     description = "Laura Female Indian voice normal"
#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
#     prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    
#     generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
#     audio_arr = generation.cpu().numpy().squeeze()
    
#     sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate)


In [8]:
# Step 6: Execution Cell
# start_time = time.time()

# Specify your audio file path
input_audio_file = "sample.m4a"  
output_wav_file = "outputFile.wav"

# Convert audio to WAV
convert_audio_to_wav(input_audio_file, output_wav_file)


# yet to actually implement this for the .m4a to .wav

<coroutine object convert_audio_to_wav at 0x000002643ECBBAE0>

In [10]:

transcription = transcribe_audio(output_wav_file, model_repo)
print("Transcription:", transcription)

Transcription:  Hello, my check123


In [11]:


# Generate chat response
resps = generate_chat_response(transcription)
print("Chat Response:", resps)

# Synthesize speech
# synthesize_speech(resps)

# Measure execution time
# end_time = time.time()
# print(f"The total time taken is: {end_time - start_time:.2f} seconds")

# # Cleanup
# os.remove(output_wav_file)


Chat Response: You have a recent check-in and we're all set to review your health records as of today!"


In [12]:
import requests

# Define the URL and parameters
url = "http://[::1]:5002/api/tts"
params = {
    "text": resps,
    "speaker_id": "p374",
    "style_wav": "",
    "language_id": ""
}

# Make the GET request
response = requests.get(url, params=params)

if response.status_code == 200:
    # Write the binary response content to a WAV file
    with open('output_wav.wav', 'wb') as wav_file:
        wav_file.write(response.content)
    print("WAV file has been saved as 'output_wav.wav'.")
else:
    print(f"Error: {response.status_code} - {response.text}")


WAV file has been saved as 'output_wav.wav'.
