# Importing the libraries

In [2]:
import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Model

In [3]:
# Load the pretrained checkpoint once, at cell execution, so that every
# transcribe() call reuses the same processor and model objects.
model_name = "facebook/wav2vec2-base-960h"
# processor: turns the raw waveform into model input tensors and decodes
# predicted token ids back into text (both uses are in transcribe()).
processor = Wav2Vec2Processor.from_pretrained(model_name)
# model: produces the per-frame logits that transcribe() arg-maxes into ids.
# NOTE(review): loading prints a warning that 'wav2vec2.masked_spec_embed' is
# newly initialized; presumably this only matters for training — confirm.
model = Wav2Vec2ForCTC.from_pretrained(model_name)

def transcribe(audio_file):
    """Transcribe an audio file to text with the module-level Wav2Vec2 model.

    Args:
        audio_file: Path to the audio file to transcribe, as delivered by the
            ``gr.Audio(type="filepath")`` input. May be ``None`` or empty when
            the input holds no audio (e.g. after the user clears it).

    Returns:
        str: The decoded transcription passed through ``str.capitalize()``
        (first character upper-cased, the rest lower-cased); an empty string
        when no audio file was supplied; or a message starting with
        ``"Error: "`` if loading or inference raised an exception.
    """
    # Nothing to transcribe: return an empty result instead of letting
    # librosa fail on a missing path and surfacing a raw error to the user.
    if not audio_file:
        return ""

    # Sampling rate used both for resampling on load and for the processor call.
    target_sr = 16000

    try:
        # Load the recorded audio file, resampled to target_sr.
        speech, _ = librosa.load(audio_file, sr=target_sr)

        # Tokenization & Model Inference
        input_values = processor(
            speech, return_tensors="pt", sampling_rate=target_sr
        ).input_values
        # Inference only, so gradient tracking is disabled.
        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy decoding: pick the highest-scoring token id at each position.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        return transcription.capitalize()
    except Exception as e:
        # Deliberate best-effort: show the failure in the output textbox
        # rather than raising inside the Gradio callback.
        return f"Error: {e}"

# Gradio Interface
# Keep a handle on the app so the server started by launch() can be shut down
# from the notebook with demo.close(); an anonymous interface cannot be
# stopped, and re-running this cell would start another server.
demo = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(type="filepath")],  # transcribe() receives a file path
    outputs=[gr.Textbox()],
    live=True,  # re-run transcribe() automatically when the input changes
)
demo.launch()


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


