In [1]:
import os
import pandas as pd
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the new pre-trained model and processor
new_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
new_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Load CSV file with audio file names and ground truth transcriptions
csv_file_path = "cv-other-train.csv"  # Change this to your CSV file path
data = pd.read_csv(csv_file_path)

def process_audio(audio_file, processor, model):
    audio, _ = librosa.load(audio_file, sr=16000)
    input_values = processor(audio, return_tensors='pt').input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription

def save_transcriptions_to_csv(audio_folder, csv_file, processor, model):
    transcriptions = []
    for audio_file in os.listdir(audio_folder):
        if audio_file.endswith(".mp3"):  # Assuming all audio files are in mp3 format
            transcription = process_audio(os.path.join(audio_folder, audio_file), processor, model)
            transcriptions.append(transcription)

    data['Model Transcriptions'] = transcriptions
    data.to_csv(csv_file, index=False)

# Task 1: Save transcriptions to the CSV file
audio_folder_path = "cv-other-train"  # Change this to your audio folder path
save_transcriptions_to_csv(audio_folder_path, csv_file_path, new_processor, new_model)

# Task 2: Calculate metrics
ground_truth = data['text']  # Using the 'text' column as ground truth

# Convert model predictions to lowercase
model_transcriptions = data['Model Transcriptions'].str.lower()
# Convert ground truth labels to lowercase for consistency
ground_truth = data['text'].str.lower()

# Calculate metrics
accuracy = accuracy_score(ground_truth, model_transcriptions)
precision = precision_score(ground_truth, model_transcriptions, average='weighted')
recall = recall_score(ground_truth, model_transcriptions, average='weighted')
f1 = f1_score(ground_truth, model_transcriptions, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


  from .autonotebook import tqdm as notebook_tqdm
preprocessor_config.json: 100%|████████████████████████████████████████████████████████| 159/159 [00:00<00:00, 158kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
tokenizer_config.json: 100%|██████████████████████████████████████████████████████████████████| 163/163 [00:00<?, ?B/s]
config.json: 100%|████████████████████████████████████████████████████████████████████████████| 843/843 [00:00<?, ?B/s]
vocab.json: 100%|█████████████████████████████████████████████████████████████████████████████| 291/291 [00:00<?, ?B/s]
special_tokens_map.json: 100%|██████████████████████████████████████████████████████████████| 85.0/85.0 [00:00<?, ?B/s]
pytorch_model.bin: 100%|██████████████████████████████████████████████████████████| 1

Accuracy: 0.4262948207171315
Precision: 0.4302788844621514
Recall: 0.4262948207171315
F1 Score: 0.42762284196547146


In [2]:
# Define the directory where you want to save the model
output_dir = "saved_wav2vec2_model\wav2vec2-large-960h"

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Save the model and processor
new_model.save_pretrained(output_dir)
new_processor.save_pretrained(output_dir)

print("Model and processor saved successfully at:", output_dir)


Model and processor saved successfully at: saved_wav2vec2_model\wav2vec2-large-960h


In [3]:
print("Ground Truth Sample:", ground_truth.head())
print("Model Transcriptions Sample:", model_transcriptions.head())


Ground Truth Sample: 0    he had to spit some tobacco out of his mouth
1           it took her a while to get used to it
2                 you will need some rubber boots
3    you can speak a label to click on an element
4                  the priest collapsed backwards
Name: text, dtype: object
Model Transcriptions Sample: 0    he had to spit some tobacco out of his mouth
1                   to gut awable to get you stip
2                 you will need some robber boots
3      you can speak label to click on an element
4                  the priest collapsed backwards
Name: Model Transcriptions, dtype: object


In [1]:
import pandas as pd
import nltk


def load_csv_data(csv_file_path):
    return pd.read_csv(csv_file_path)

def evaluate_from_csv(csv_file_path):
    total_wer = 0
    total_cer = 0
    total_words = 0
    total_chars = 0
    
   
    data = load_csv_data(csv_file_path)
    
    for index, row in data.iterrows():

        ground_truth = row['text']
        model_transcription = row['Model Transcriptions']
       
        if isinstance(ground_truth, str) and isinstance(model_transcription, str):
           
            ground_truth = ground_truth.lower()
            model_transcription = model_transcription.lower()
            
            
            wer = word_error_rate(model_transcription, ground_truth)
            total_wer += wer

           
            cer = character_error_rate(model_transcription, ground_truth)
            total_cer += cer

            
            num_words = len(ground_truth.split())
            num_chars = len(ground_truth)
            total_words += num_words
            total_chars += num_chars
    
    avg_wer = total_wer / len(data)
    avg_cer = total_cer / len(data)
    wer_rate = total_wer / total_words
    cer_rate = total_cer / total_chars

    print("Word Error Rate (WER) per word: {:.2f}%".format(wer_rate * 100))
    print("Character Error Rate (CER) per character: {:.2f}%".format(cer_rate * 100))

def word_error_rate(hypothesis, reference):
    hypothesis_words = hypothesis.split()
    reference_words = reference.split()
    dist = nltk.edit_distance(hypothesis_words, reference_words)
    wer = dist / len(reference_words)
    return wer

def character_error_rate(hypothesis, reference):
    dist = nltk.edit_distance(hypothesis, reference)
    cer = dist / len(reference)
    return cer


evaluate_from_csv("cv-other-train-copy2.csv")


Word Error Rate (WER) per word: 1.74%
Character Error Rate (CER) per character: 0.14%
