In [None]:
# Directory that is scanned for the .wav test clips to transcribe.
DATA_DIR = "/kaggle/input/shobdotori/Test"
# NOTE(review): OUTPUT_DIR is not referenced by any cell visible in this notebook;
# the CSV is written to OUTPUT_FILE relative to the working directory.
OUTPUT_DIR = "./"
# File name of the predictions CSV that is written and later read back.
OUTPUT_FILE = "submission.csv"

In [None]:
!pip install --upgrade --force-reinstall numpy==1.26.4 scikit-learn==1.3.2
!pip install protobuf==3.20.*
!pip install datasets==3.6.0 transformers==4.48.3 torchaudio accelerate evaluate

In [None]:
import os
import pandas as pd
import librosa
import numpy as np
import torch
from tqdm import tqdm  
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio
from transformers import GenerationConfig

In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_path = 'lucius-40/Whisper-bn-v2'

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The processor supplies the feature extractor and tokenizer used during inference.
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device)

# Put the model in evaluation mode; as the last expression of the cell this also
# displays the module returned by `eval()`.
model.eval()

In [None]:
print("Loading test files...\n")

# Names (not full paths) of the entries in DATA_DIR ending in '.wav', sorted.
test_files = sorted(
    name for name in os.listdir(DATA_DIR) if name.endswith('.wav')
)

print(f"Found {len(test_files)} test files")

# An empty listing is only reported here; nothing is raised.
if test_files:
    print("\nTest data loaded successfully!")
else:
    print("\nWARNING: No test files found! Check DATA_DIR")

In [None]:


# Transcribe every file in `test_files` and write the results to OUTPUT_FILE.
#
# Each file is processed independently: a failure on one clip is recorded in
# `errors`, reported, and replaced by an empty transcription so that the
# submission still contains one row per test file.

print("Generating test predictions...\n")
print(f"Processing {len(test_files)} test files...\n")

predictions = []
errors = []

for audio_file in tqdm(test_files, desc="Transcribing"):
    audio_path = os.path.join(DATA_DIR, audio_file)

    try:
        # Load and resample audio to 16kHz (the rate declared to the feature
        # extractor below).
        audio_array, sr = librosa.load(audio_path, sr=16000)

        # Extract features and move them to the same device as the model.
        input_features = processor.feature_extractor(
            audio_array,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features
        input_features = input_features.to(device)

        # Generate transcription; gradients are not needed for inference.
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                language="bn",
                task="transcribe",
                max_length=225,
                num_beams=4,
            )

        # Decode prediction; one clip is passed per call, so take the first item.
        transcription = processor.tokenizer.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

    except Exception as e:
        # Deliberate best-effort: keep going so one bad file cannot abort the
        # whole run, but record and surface the failure.
        error_msg = f"Error processing {audio_file}: {str(e)}"
        errors.append(error_msg)
        print(f"\n{error_msg}")

        # Fall back to an empty prediction for this file.
        transcription = ""

    predictions.append({
        'audio': audio_file,
        'text': transcription
    })

print(f"  Total predictions: {len(predictions)}")
print(f"  Errors: {len(errors)}")

# Name the columns explicitly so the header row is written even when
# `predictions` is empty; otherwise the CSV would be empty and could not be
# read back with pd.read_csv.
submission_df = pd.DataFrame(predictions, columns=['audio', 'text'])
submission_df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')

print(f"Output file ’{OUTPUT_FILE}’ generated successfully!")

In [None]:
# Reload the submission file as a sanity check of what was actually written.
# keep_default_na=False keeps empty transcriptions (written as "" for files that
# failed) as empty strings instead of turning them into NaN on load.
dfs = pd.read_csv(OUTPUT_FILE, keep_default_na=False)

In [None]:
# Show the reloaded submission table as this cell's output.
dfs