In [1]:
import pandas as pd
import os

from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
import torch
from transformers import  pipeline


import json
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ROOT_FOLDER = ''
# folder_path = os.path.join(ROOT_FOLDER, 'data/Rec/')
# audio_files = os.listdir(folder_path)
# Transcript_df = pd.read_csv(os.path.join(ROOT_FOLDER, 'data/ground_truth_gp4_zero_shot.csv'))

In [3]:
# Transcript_df

In [5]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="English", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="English", task="transcribe")

In [6]:
vanilla_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    torch_dtype=torch.float32,
)

Device set to use cpu


In [44]:
# Read JSON file
ROOT_FOLDER = '.'
with open(os.path.join(ROOT_FOLDER, 'data/Processed_Files', "Transcript.json"), "r", encoding="utf-8") as f:
    transcript_json = json.load(f)

In [None]:
# --- File path ---
file = '20241208090446_9873656524.mp3_lc.wav'
audio_file_path = os.path.join(ROOT_FOLDER, 'data/Processed_Files/', file)

# --- Transcription ---
prediction = vanilla_pipe(audio_file_path)
print("Predicted transcript:", prediction["text"])

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


Predicted transcript:  Hello, good morning sir, I am speaking with Mrs. Idhan Jain. Sir, this is a Kang Shah calling from one card on the recorded line. I am speaking with Mrs. Idhan Jain. Yeah, thanks for confirming, sir. This call is regarding your latest statement updates of Federal Bank One Credit Card. As you can see, your statement is generated on November 20 and for the same due date was December 7. And you have made a payment of your 10,000 rupees. Thank you so much for the payment sir and I'll request you to make the payment of remaining amount you as early as possible. Hello. So this is a confirmation call about your payment that we have received your confirmation call that we have received your payment. This call is regarding that no issues. Thank you so much.


In [10]:
# --- Ground truth transcript ---
ground_truth = transcript_json[file]
print("Ground truth:", ground_truth)

# --- Load WER metric ---
wer_metric = evaluate.load("wer")

# --- Compute WER ---
wer_score = wer_metric.compute(
    predictions=[prediction["text"]],
    references=[ground_truth]
)
print(f"WER: {wer_score:.4f}")

Ground truth: hello good morning sir am i speaking with mister siddhaant jain sir this is akanksha calling from one card on a recorded line am i speaking with mister siddhaant jain yeah thanks for confirming sir this call is regarding your latest statement updates of federal bank one credit card as i can see your statement is generated on november twenty and for the same due date was december seven and you have made a payment of your ten thousand rupees thank you so much for the payment sir and i request you to make the payment of remaining amount due as early as possible hello sir this is a confirmation call about your payment that we have received your sir this is confirmation call that we have received your payment this call is regarding that okay sir no issues thank you so much
WER: 0.3776


## Train Test split

In [40]:
from sklearn.model_selection import train_test_split

processed_path = os.path.join(ROOT_FOLDER, "data", "Processed_Files")
all_files = [f for f in os.listdir(processed_path) if f != 'Transcript.json' ]

train_files, test_files = train_test_split(all_files, test_size=0.2, random_state=42)

df_train = pd.DataFrame({"filename": train_files, "split": "train"})
df_test = pd.DataFrame({"filename": test_files, "split": "test"})
df = pd.concat([df_train, df_test], ignore_index=True)

# Save CSV in the same folder
output_csv = os.path.join(processed_path, "file_splits.csv")
df.to_csv(output_csv, index=False)



In [41]:
df.shape

(1626, 2)

In [45]:
import os
import librosa

# Path to processed files
processed_path = os.path.join(ROOT_FOLDER, "data", "Processed_Files")

total_duration = 0.0
file_durations = {}

# Loop through all files
for fname in os.listdir(processed_path):
    fpath = os.path.join(processed_path, fname)
    
    if os.path.isfile(fpath) and fname.lower().endswith((".wav", ".mp3", ".flac", ".ogg")):
        try:
            duration = librosa.get_duration(filename=fpath)
            file_durations[fname] = duration
            total_duration += duration
        except Exception as e:
            print(f"Skipping {fname}, error: {e}")

# Convert to hours
total_hours = total_duration / 3600

print(f"✅ Total audio duration: {total_hours:.2f} hours")
print(f"✅ Number of files: {len(file_durations)}")


	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=fpath)


✅ Total audio duration: 14.46 hours
✅ Number of files: 1626
