In [6]:
!pip install transformers torchaudio librosa



In [7]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import numpy as np
import librosa
import matplotlib.pyplot as plt
from scipy import stats

In [8]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def compute_gop_score(audio_file):
    # Step 3: Load the audio file and preprocess it
    audio_input, rate = librosa.load(audio_file, sr=16000)  # Resampling to 16kHz

    # Step 4: Process the audio with Wav2Vec2
    inputs = processor(audio_input, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(input_values=inputs.input_values).logits

    # Step 5: Decode the logits to get the transcription
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    print(f"Transcription: {transcription}")
    
    # Step 6: Calculate the posterior, likelihood, and likelihood ratio
    # Here, we will simulate these calculations for the sake of simplicity
    # In a real scenario, you would use frame-level probabilities for this

    # Example of calculating the "posterior" as average log probabilities
    log_probs = torch.log_softmax(logits, dim=-1).squeeze().cpu().numpy()
    
    # Compute Posterior (average log posterior for each token)
    posterior = np.mean(log_probs)
    
    # Simulate likelihood and likelihood ratio (using some made-up logic)
    # Likelihood is the average probability of the correct token at each frame
    likelihood = np.mean(np.exp(log_probs))

    # Likelihood ratio: A simple ratio of the max likelihood over the total likelihood
    max_likelihood = np.max(np.exp(log_probs))
    likelihood_ratio = max_likelihood / np.sum(np.exp(log_probs))

    # Return the computed GOP scores
    return posterior, likelihood, likelihood_ratio


In [10]:

audio_file = "/kaggle/input/telegu/Telugu_accent_hin_2_1.wav"


gop_posterior, gop_likelihood, gop_likelihood_ratio = compute_gop_score(audio_file)

print(f"GOP Posterior: {gop_posterior}")
print(f"GOP Likelihood: {gop_likelihood}")
print(f"GOP Likelihood Ratio: {gop_likelihood_ratio}")

Transcription: TORA TORA HILI ATTA MIRICOP
GOP Posterior: -17.03519058227539
GOP Likelihood: 0.03125
GOP Likelihood Ratio: 0.005847947672009468
