In [7]:
import os
import pandas as pd
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import noisereduce as nr

# Load the new pre-trained model and processor
new_processor1 = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
new_model1 = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

# Function to process audio files
def process_audio(audio_file, processor, model):
    audio, _ = librosa.load(audio_file, sr=16000)
    
    # Remove noise using noisereduce
    reduced_noise = nr.reduce_noise(y=audio, sr=16000)
    
    input_values = processor(reduced_noise, return_tensors='pt').input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription

# Function to save transcriptions to CSV
def save_transcriptions_to_csv(audio_folder, csv_file, processor, model):
    transcriptions = []
    for audio_file in os.listdir(audio_folder):
        if audio_file.endswith(".mp3"):  # Assuming all audio files are in mp3 format
            audio_file = os.path.join(audio_folder, audio_file)
            transcription = process_audio(audio_file, processor, model)
            transcriptions.append(transcription)

    data['Model Transcriptions'] = transcriptions
    data.to_csv(csv_file, index=False)

# Task 1: Clean audio file names
audio_folder_path = "cv-other-train"  # Change this to your audio folder path
#for audio_file in os.listdir(audio_folder_path):
#    if audio_file.endswith(".mp3"):
 #       cleaned_filename = clean_audio_filename(audio_file)
 #       os.rename(os.path.join(audio_folder_path, audio_file), os.path.join(audio_folder_path, cleaned_filename))

# Task 2: Save transcriptions to the CSV file
csv_file_path = "cv-other-train.csv"  # Change this to your CSV file path
data = pd.read_csv(csv_file_path)
save_transcriptions_to_csv(audio_folder_path, csv_file_path, new_processor, new_model)

# Task 3: Calculate metrics
ground_truth = data['text']  # Using the 'text' column as ground truth

# Convert model predictions to lowercase
model_transcriptions = data['Model Transcriptions'].str.lower()
# Convert ground truth labels to lowercase for consistency
ground_truth = data['text'].str.lower()

# Calculate metrics
accuracy = accuracy_score(ground_truth, model_transcriptions)
precision = precision_score(ground_truth, model_transcriptions, average='weighted')
recall = recall_score(ground_truth, model_transcriptions, average='weighted')
f1 = f1_score(ground_truth, model_transcriptions, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.maske

Accuracy: 0.4940239043824701
Precision: 0.50199203187251
Recall: 0.4940239043824701
F1 Score: 0.49667994687915
