In [None]:
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
import os
import pandas as pd
import json
from datasets import Dataset, Audio
from typing import Any, List, Dict, Union
import torch

from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

import evaluate

# Checkpoint shared by the model and the processor. Defining it once keeps the
# two loads from drifting apart when the checkpoint is switched.
# Variant tried earlier: "openai/whisper-small.en" (English-only checkpoint).
MODEL_ID = "openai/whisper-small"

model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
# No `language=` is passed (an earlier variant used language="English"), so only
# the task is fixed on the processor.
processor = WhisperProcessor.from_pretrained(MODEL_ID, task="transcribe")

# Generate transcriptions, and clear any forced decoder token ids stored in the
# checkpoint's generation config.
model.generation_config.task = 'transcribe'
model.generation_config.forced_decoder_ids = None

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ROOT_FOLDER = ''

processed_path = os.path.join(ROOT_FOLDER, "data", "Processed_Files")

# Read the split manifest once and slice it per split. The non-inplace
# filter -> reset_index chain gives every split its own frame, so the column
# assignments further down do not write into a slice of the manifest.
df_splits = pd.read_csv(os.path.join(processed_path, "file_splits.csv"))
df_train = df_splits[df_splits['split'] == 'train'].reset_index(drop=True)
df_val = df_splits[df_splits['split'] == 'val'].reset_index(drop=True)

# Uncomment for a quick smoke run on a handful of rows.
# df_train = df_train[:10]
# df_val = df_val[:10]

# Transcripts: a JSON object mapping filename -> transcript text.
with open(os.path.join(processed_path, "Transcript.json"), "r", encoding="utf-8") as f:
    transcript_json = json.load(f)

for split_name, split_df in (("train", df_train), ("val", df_val)):
    # Add the text column by mapping filename -> transcript.
    split_df["text"] = split_df["filename"].map(transcript_json)
    # Add the path to the audio file.
    split_df["path"] = split_df["filename"].map(lambda name: os.path.join(processed_path, name))

    # A filename absent from the JSON yields NaN text; surface it here rather
    # than letting it show up as an obscure failure in a later cell.
    n_missing = int(split_df["text"].isna().sum())
    if n_missing:
        print(f"WARNING: {n_missing} {split_name} file(s) have no entry in Transcript.json")

df_train.head(1)

Unnamed: 0,filename,split,text,path
0,20241215150647_7686868599.mp3_lc_12.wav,train,hello hello sir am i audible hello han hello ...,data\Processed_Files\20241215150647_7686868599...


In [3]:
# Remove cases that were not transcribed correctly: their transcript contains a
# literal "{FL}" marker. `na=True` makes a missing transcript count as a match,
# so rows without text are dropped as well instead of turning the mask into an
# object Series that `~` cannot invert.
print(df_val.shape)
print(df_train.shape)
df_val = df_val[~df_val['text'].str.contains(r"\{FL\}", na=True)]
df_train = df_train[~df_train['text'].str.contains(r"\{FL\}", na=True)]
print(df_val.shape)
print(df_train.shape)

(226, 4)
(2558, 4)
(220, 4)
(2478, 4)


In [4]:
# Disabled: manual removal of individual rows from the train/val frames by
# index label.
# NOTE(review): the reason these specific rows were excluded is not recorded
# here, and the labels are only valid for the index the frames had when the
# list was made - confirm both before re-enabling.
# df_train.drop([  53,  151,  159,  239,  253,  263,  299,  448,  480,  483,  627,  716,
#         762,  765,  794,  807,  843,  847,  859,  895, 1000, 1010, 1025, 1131,
#        1260, 970], inplace = True)
# df_val.drop([  85 ], inplace = True)

In [5]:
# Preview the first validation row (columns: filename, split, text, path).
df_val.head(1)

Unnamed: 0,filename,split,text,path
0,20241217154703_8790066060.mp3_lc_3.wav,val,good afternoon i am i talking with mister red...,data\Processed_Files\20241217154703_8790066060...


In [6]:
# List the names of all configuration fields of the loaded model.
config_fields = model.config.to_dict()
print(config_fields.keys())

dict_keys(['vocab_size', 'num_mel_bins', 'd_model', 'encoder_layers', 'encoder_attention_heads', 'decoder_layers', 'decoder_attention_heads', 'decoder_ffn_dim', 'encoder_ffn_dim', 'dropout', 'attention_dropout', 'activation_dropout', 'activation_function', 'init_std', 'encoder_layerdrop', 'decoder_layerdrop', 'use_cache', 'num_hidden_layers', 'scale_embedding', 'max_source_positions', 'max_target_positions', 'classifier_proj_size', 'use_weighted_layer_sum', 'apply_spec_augment', 'mask_time_prob', 'mask_time_length', 'mask_time_min_masks', 'mask_feature_prob', 'mask_feature_length', 'mask_feature_min_masks', 'median_filter_width', 'return_dict', 'output_hidden_states', 'torchscript', 'torch_dtype', 'pruned_heads', 'tie_word_embeddings', 'chunk_size_feed_forward', 'is_encoder_decoder', 'is_decoder', 'cross_attention_hidden_size', 'add_cross_attention', 'tie_encoder_decoder', 'architectures', 'finetuning_task', 'id2label', 'label2id', 'task_specific_params', 'problem_type', 'tokenizer_cla

In [7]:
# Value compared against the tokenized transcript lengths in the length check below.
model.config.max_target_positions

448

In [8]:
# Important preprocessing: the model can only produce a limited number of label
# tokens (model.config.max_target_positions, which is 448 for this checkpoint),
# so transcripts that tokenize to more than that need to be removed.
# This cell only reports the offending rows; it does not drop them.

max_label_tokens = model.config.max_target_positions

df_token_len = df_train[['text']].copy()
def safe_token_length(x):
    """Return the number of token ids the tokenizer produces for `x`.

    Any exception raised while tokenizing is swallowed and reported as a
    length of 0, so problematic rows can be listed below instead of aborting
    the whole `apply`.
    """
    try:
        return len(processor.tokenizer(x).input_ids)
    except Exception:
        return 0   # if error, set length to 0

df_token_len['token_length'] = df_token_len['text'].apply(safe_token_length)
# Rows whose transcript could not be tokenized at all.
print(df_token_len[df_token_len['token_length'] == 0].index)
# Rows whose transcript is longer than the model's label limit.
print(df_token_len[df_token_len['token_length'] > max_label_tokens].index)

Index([], dtype='int64')
Index([], dtype='int64')


In [9]:
def prepare_dataset(batch):
    """Turn one dataset row into model inputs.

    Despite the parameter name, this handles a single example: it reads one
    audio dict and keeps only the first entry of the extracted features.

    Args:
        batch: Row with a decoded audio dict under "path" (keys "array" and
            "sampling_rate") and the transcript string under "text".

    Returns:
        The same row with "input_features" (features of the audio) and
        "labels" (token ids of the transcript) added.
    """
    # Already decoded to a dict: {"array": ..., "sampling_rate": ...}
    audio = batch['path']

    # The number of mel bins is fixed by the feature extractor that was loaded
    # with the checkpoint; the call itself has no `n_mels` option (an extra
    # keyword is silently ignored), so none is passed. For a checkpoint that
    # needs 128 mel bins (the earlier note mentions whisper large v3), load
    # that checkpoint's processor instead.
    batch['input_features'] = processor.feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
    ).input_features[0]

    # Tokenize text
    batch['labels'] = processor.tokenizer(batch["text"]).input_ids
    return batch

In [10]:
# Disabled check: shape of the test-split rows whose transcript contains the
# "{FL}" marker.
# df = pd.read_csv(os.path.join(processed_path, "file_splits.csv"))
# df = df[df['split'] == 'test']
# df["text"] = df["filename"].map(transcript_json)
# df[df['text'].str.contains(r"\{FL\}")].shape

In [11]:
val_dataset = Dataset.from_pandas(df_val)
# Treat the "path" column as audio sampled at 16 kHz, so each row yields a
# decoded {"array", "sampling_rate"} dict when accessed.
val_dataset = val_dataset.cast_column("path", Audio(sampling_rate=16000))

# prepare_dataset works on one example at a time, so the map runs unbatched.
# `batch_size` only applies together with batched=True and is therefore omitted.
val_dataset = val_dataset.map(prepare_dataset, num_proc=1)

data_set_path = os.path.join(ROOT_FOLDER, "data", "data_sets", 'prepared_val')
val_dataset.save_to_disk(data_set_path)

# To reuse the prepared split instead of recomputing it:
# from datasets import load_from_disk
# val_dataset = load_from_disk(data_set_path)

Map: 100%|██████████| 220/220 [00:54<00:00,  3.42s/ examples]

: 

In [13]:
# Row/column count of the training frame after the "{FL}" filtering.
df_train.shape

(2478, 4)