In [37]:
import os
import librosa

def load_audio_files(folder_path):
    audio_files = []
    for file in os.listdir(folder_path):
        if file.endswith(".wav") or file.endswith(".mp3") :  # Assuming all audio files are in .wav format, adjust as needed
            file_path = os.path.join(folder_path, file)
            audio, sr = librosa.load(file_path, sr=16000)  # Load audio file with original sampling rate
            audio_files.append((audio, sr, file))  # Store audio data, sampling rate, and filename
    return audio_files

# Example usage:
folder_path = "cv-other-train"
audio_files = load_audio_files(folder_path)
print("Loaded", len(audio_files), "audio files.")


Loaded 251 audio files.


In [38]:
import os
import librosa
import noisereduce as nr
import numpy as np
from scipy.io import wavfile

def preprocess_audio(input_file, output_file):
    # Load audio file
    audio, sampling_rate = librosa.load(input_file, sr=16000)
    
    # Apply noise reduction
    noisy_part = audio[:]
    reduced_noise = nr.reduce_noise(y=noisy_part, sr=sampling_rate)
    
    # Increase volume
    max_amp = np.max(np.abs(reduced_noise))
    amplified_audio = 0.8 * (reduced_noise / max_amp)
    
    # Save processed audio
    wavfile.write(output_file, sampling_rate, amplified_audio.astype(np.float32))

def preprocess_audio_folder(input_folder, output_folder):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate over files in the input folder
    for file in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, file)
        output_file_path = os.path.join(output_folder, file)
        
        # Process audio and save to output folder
        preprocess_audio(input_file_path, output_file_path)

# Example usage:
input_folder = "cv-other-train"
output_folder = "processed-audios2"
preprocess_audio_folder(input_folder, output_folder)
print("Preprocessing completed.")


  sig_mult_above_thresh = (abs_sig_stft - sig_stft_smooth) / sig_stft_smooth


Preprocessing completed.


In [21]:
len(os.listdir(output_folder))


251

In [39]:
import os
import pandas as pd
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Initialize Wav2Vec2 model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

def transcribe_audio(audio_file_path):
    # Load audio file
    audio_input, _ = sf.read(audio_file_path)
    
    # Tokenize audio input
    input_values = tokenizer(audio_input, return_tensors="pt").input_values
    
    # Transcribe audio
    with torch.no_grad():
        logits = model(input_values).logits
    
    # Decode transcription
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0]
    
    return transcription

def add_transcriptions_to_excel(audio_folder, input_excel_path, output_excel_path):
    # Load input Excel file
    input_df = pd.read_csv(input_excel_path)
    
    # Add a new column for transcriptions
    input_df['transcription'] = ''
    
    # Transcribe each audio file and add transcription to the DataFrame
    for index, row in input_df.iterrows():
        audio_file_path = os.path.join(audio_folder, row['filename'])
        transcription = transcribe_audio(audio_file_path).lower()
        input_df.at[index, 'transcription'] = transcription
    
    # Save the DataFrame with transcriptions to a new Excel file
    input_df.to_excel(output_excel_path, index=False)

# Example usage:
audio_folder =  "processed-audios2"
input_excel_path = "cv-other-train.csv"
output_excel_path = "excel-output.xlsx"
add_transcriptions_to_excel(audio_folder, input_excel_path, output_excel_path)
print("Transcriptions added to Excel file.")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2

In [35]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import jiwer

def calculate_metrics(input_excel_path):
    # Load the Excel file with transcriptions
    df = pd.read_excel(input_excel_path)
    
    # Replace missing values in the 'transcription' column with an empty string
    df['transcription'].fillna('', inplace=True)
    
    # Calculate accuracy, precision, recall, f1-score, WER, and CER for each row
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    wers = []
    cers = []
    for index, row in df.iterrows():
        ground_truth = [c for c in row['text'].lower()]  # Ground truth text
        transcription = [c for c in str(row['transcription']).lower()]  # Transcribed text
        
        # Ensure the same length for ground truth and transcription
        max_len = max(len(ground_truth), len(transcription))
        ground_truth = ground_truth + [' '] * (max_len - len(ground_truth))
        transcription = transcription + [' '] * (max_len - len(transcription))
        
        # Calculate metrics
        accuracy = accuracy_score(ground_truth, transcription)
        precision = precision_score(ground_truth, transcription, average='weighted')
        recall = recall_score(ground_truth, transcription, average='weighted')
        f1 = f1_score(ground_truth, transcription, average='weighted')
        
        # Calculate WER and CER
        wer = jiwer.wer(''.join(ground_truth), ''.join(transcription))
        cer = jiwer.cer(''.join(ground_truth), ''.join(transcription))
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        wers.append(wer)
        cers.append(cer)
    
    # Add accuracy, precision, recall, f1-score, WER, and CER columns to the DataFrame
    df['accuracy'] = accuracies
    df['precision'] = precisions
    df['recall'] = recalls
    df['f1_score'] = f1_scores
    df['wer'] = wers
    df['cer'] = cers
    
    # Save the DataFrame with metrics to the Excel file
    df.to_excel(input_excel_path, index=False)

# Example usage:
input_excel_path = "excel-output.xlsx"
calculate_metrics(input_excel_path)
print("Metrics added to Excel file.")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Metrics added to Excel file.


In [36]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import jiwer

def calculate_metrics(input_excel_path):
    # Load the Excel file with transcriptions
    df = pd.read_excel(input_excel_path)
    
    # Replace missing values in the 'transcription' column with an empty string
    df['transcription'].fillna('', inplace=True)
    
    # Initialize lists to store individual metrics values
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    wers = []
    cers = []
    
    for index, row in df.iterrows():
        ground_truth = [c for c in row['text'].lower()]  # Ground truth text
        transcription = [c for c in str(row['transcription']).lower()]  # Transcribed text
        
        # Ensure the same length for ground truth and transcription
        max_len = max(len(ground_truth), len(transcription))
        ground_truth = ground_truth + [' '] * (max_len - len(ground_truth))
        transcription = transcription + [' '] * (max_len - len(transcription))
        
        # Calculate metrics
        accuracy = accuracy_score(ground_truth, transcription)
        precision = precision_score(ground_truth, transcription, average='weighted')
        recall = recall_score(ground_truth, transcription, average='weighted')
        f1 = f1_score(ground_truth, transcription, average='weighted')
        
        # Calculate WER and CER
        wer = jiwer.wer(''.join(ground_truth), ''.join(transcription))
        cer = jiwer.cer(''.join(ground_truth), ''.join(transcription))
        
        # Append individual metric values to lists
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        wers.append(wer)
        cers.append(cer)
    
    # Add accuracy, precision, recall, f1-score, WER, and CER columns to the DataFrame
    df['accuracy'] = accuracies
    df['precision'] = precisions
    df['recall'] = recalls
    df['f1_score'] = f1_scores
    df['wer'] = wers
    df['cer'] = cers
    
    # Save the DataFrame with metrics to the Excel file
    df.to_excel(input_excel_path, index=False)
    
    # Calculate average metrics values
    avg_accuracy = sum(accuracies) / len(accuracies)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1_score = sum(f1_scores) / len(f1_scores)
    avg_wer = sum(wers) / len(wers)
    avg_cer = sum(cers) / len(cers)
    
    return avg_accuracy, avg_precision, avg_recall, avg_f1_score, avg_wer, avg_cer

# Example usage:
input_excel_path = "excel-output.xlsx"
avg_accuracy, avg_precision, avg_recall, avg_f1_score, avg_wer, avg_cer = calculate_metrics(input_excel_path)
print("Average Accuracy:", avg_accuracy)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-score:", avg_f1_score)
print("Average WER:", avg_wer)
print("Average CER:", avg_cer)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Average Accuracy: 0.5770842589502627
Average Precision: 0.5808069215590537
Average Recall: 0.5770842589502627
Average F1-score: 0.5728860911012961
Average WER: 0.28849002369918714
Average CER: 0.14633002464093306


In [None]:
import whisper
import csv
from collections import defaultdict

def calculate_metrics(ground_truth, prediction):
  """
  Calculates accuracy, precision, recall, F1-score, WER, and CER.
  """
  true_positives = 0
  false_positives = 0
  false_negatives = 0

  for word_gt, word_pred in zip(ground_truth.split(), prediction.split()):
    if word_gt == word_pred:
      true_positives += 1
    else:
      false_positives += 1
      false_negatives += 1

  if true_positives + false_positives == 0:
    precision = 0
  else:
    precision = true_positives / (true_positives + false_positives)

  if true_positives + false_negatives == 0:
    recall = 0
  else:
    recall = true_positives / (true_positives + false_negatives)

  if precision + recall == 0:
    f1_score = 0
  else:
    f1_score = 2 * (precision * recall) / (precision + recall)

  accuracy = true_positives / (true_positives + false_positives + false_negatives)

  # Calculate WER (Word Error Rate)
  wer = sum(ground_truth != prediction for ground_truth, prediction in zip(ground_truth.split(), prediction.split())) / len(ground_truth.split())

  # Calculate CER (Character Error Rate)
  cer = sum(c1 != c2 for c1, c2 in zip(ground_truth, prediction)) / len(ground_truth)

  return {
      "accuracy": accuracy,
      "precision": precision,
      "recall": recall,
      "f1_score": f1_score,
      "wer": wer,
      "cer": cer
  }

def transcribe_folder(model, audio_folder, csv_file):
  """
  Transcribes all audio files in a folder and saves results to a CSV.
  """
  metrics = defaultdict(list)  # Store overall metrics

  with open(csv_file, "r", newline="") as csv_in, open(csv_file, "a", newline="") as csv_out:
    reader = csv.DictReader(csv_in)
    writer = csv.DictWriter(csv_out, fieldnames=reader.fieldnames + ["transcription", "accuracy", "precision", "recall", "f1_score", "wer", "cer"])
    writer.writeheader()

    for row in reader:
      filename = row["filename"]
      ground_truth = row["text"].lower()  # Convert ground truth to lowercase for case-insensitive comparison

      try:
        audio_path = os.path.join(audio_folder, filename)
        transcription = model.transcribe(audio_path)["text"].lower()

        writer.writerow({**row, "transcription": transcription})

        metrics["filename"].append(filename)
        individual_metrics = calculate_metrics(ground_truth, transcription)
        metrics.update((metric, metrics[metric] + [individual_metrics[metric]]) for metric in individual_metrics)

      except Exception as e:
        print(f"Error transcribing {filename}: {e}")

  # Print overall metrics
  print("Overall Metrics:")
  for metric, values in metrics.items():
    if metric in ["wer", "cer"]:
      # Average WER/CER across files
      average = sum(values) / len(values)
      print(f"{metric}: {average:.4f}")
    else:
      # Print average or other appropriate statistic depending on the metric
      average = sum(values) / len(values)
      print(f"{metric}: {average:.4f}")



model = whisper.load_model("medium")  # Change "medium" to a different model size if needed
audio_folder = "cv-other-train"  # Replace with your audio folder path
csv_file = "/kaggle/input/audio-to-text/cv-other-train.csv"  # Replace with your CSV file path

transcribe_folder(model, audio_folder, csv_file)
