In [7]:
import os
import pandas as pd
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pre-trained checkpoint shared by the processor and the CTC model
MODEL_NAME = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Read the CSV that lists the audio files and their reference transcriptions
csv_file_path = "cv-other-train.csv"  # Change this to your CSV file path
data = pd.read_csv(csv_file_path)
def process_audio(audio_file):
    """Transcribe a single audio file with the module-level `processor` and `model`.

    Args:
        audio_file: Path to an audio file that librosa can load.

    Returns:
        The decoded transcription string obtained by taking the arg-max token
        at every output frame and decoding the resulting id sequence.
    """
    target_rate = 16000
    # librosa resamples to `target_rate` while loading.
    audio, _ = librosa.load(audio_file, sr=target_rate)
    # Tell the processor the rate explicitly so a mismatch with the rate its
    # feature extractor expects is reported instead of passing silently.
    input_values = processor(
        audio, sampling_rate=target_rate, return_tensors='pt'
    ).input_values
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # Only one clip is passed in, so decode the first (and only) batch entry.
    transcription = processor.decode(predicted_ids[0])
    return transcription

def save_transcriptions_to_csv(audio_folder, csv_file, filename_column=None):
    """Transcribe the audio files and store the results in `data` and on disk.

    The transcriptions are written to the 'Model Transcriptions' column of the
    module-level `data` frame, which is then saved to `csv_file` without the
    index.

    Args:
        audio_folder: Directory that contains the audio files.
        csv_file: Destination path for the CSV written from `data`.
        filename_column: Optional name of the column in `data` holding each
            row's audio file name. When given, every row is transcribed from
            its own file (only the base name of the entry is used, joined onto
            `audio_folder`). When omitted, the ".mp3" files found in
            `audio_folder` are used in sorted order.

    Raises:
        ValueError: If `filename_column` is omitted and the number of ".mp3"
            files differs from the number of rows in `data`.
    """
    if filename_column is not None:
        # Explicit pairing: row i is transcribed from the file named in row i.
        audio_names = [os.path.basename(str(name)) for name in data[filename_column]]
    else:
        # os.listdir gives no ordering guarantee, so sort for a deterministic,
        # repeatable pairing with the rows of `data`.
        # NOTE(review): this assumes the CSV rows are listed in sorted file-name
        # order; pass `filename_column` to pair rows and files explicitly.
        audio_names = sorted(
            name for name in os.listdir(audio_folder) if name.endswith(".mp3")
        )
        # Fail before the expensive transcription loop rather than at the
        # column assignment afterwards.
        if len(audio_names) != len(data):
            raise ValueError(
                f"Found {len(audio_names)} .mp3 files in {audio_folder!r} "
                f"but the CSV has {len(data)} rows"
            )

    transcriptions = [
        process_audio(os.path.join(audio_folder, name)) for name in audio_names
    ]

    data['Model Transcriptions'] = transcriptions
    data.to_csv(csv_file, index=False)

# Task 1: Save transcriptions to the CSV file
# NOTE(review): the output path is the same CSV that was loaded as input, so the
# source file is rewritten with an extra 'Model Transcriptions' column — confirm
# this is intended.
audio_folder_path = "cv-other-train"  # Change this to your audio folder path
save_transcriptions_to_csv(audio_folder_path, csv_file_path)

# Task 2: Calculate metrics
# Lowercase both sides so letter case alone never counts as a mismatch.
ground_truth = data['text'].str.lower()  # 'text' column is the ground truth
model_transcriptions = data['Model Transcriptions'].str.lower()

# These metrics treat every distinct transcription string as a class label, so
# a prediction only counts when the whole sentence matches exactly.
# zero_division=0 keeps the value scikit-learn already substitutes for labels
# whose score is undefined, without emitting UndefinedMetricWarning.
accuracy = accuracy_score(ground_truth, model_transcriptions)
precision = precision_score(
    ground_truth, model_transcriptions, average='weighted', zero_division=0
)
recall = recall_score(
    ground_truth, model_transcriptions, average='weighted', zero_division=0
)
f1 = f1_score(
    ground_truth, model_transcriptions, average='weighted', zero_division=0
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Accuracy: 0.3426294820717131
Precision: 0.350597609561753
Recall: 0.3426294820717131
F1 Score: 0.34528552456839307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Target folder for the exported model and processor
output_dir = "saved_wav2vec2_model"

# Make sure the folder is present; an already existing folder is not an error
os.makedirs(output_dir, exist_ok=True)

# Persist the model first, then the processor, into the same folder
for artifact in (model, processor):
    artifact.save_pretrained(output_dir)

print("Model and processor saved successfully at:", output_dir)


Model and processor saved successfully at: saved_wav2vec2_model
