In [13]:
import pandas as pd
import os

from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
import torch
from transformers import  pipeline

import json
import evaluate

## Processed data stats

In [19]:
import os
import librosa

# Project root; '' makes every path relative to the current working directory.
# Defined in this cell so it also runs on a fresh kernel (Restart & Run All),
# where no earlier cell has set it yet.
ROOT_FOLDER = ''

# Path to processed files
processed_path = os.path.join(ROOT_FOLDER, "data", "Processed_Files")

total_duration = 0.0   # running sum of clip lengths, in seconds
file_durations = {}    # filename -> duration in seconds

# Loop through all entries, keeping only regular files with an audio extension
for fname in os.listdir(processed_path):
    fpath = os.path.join(processed_path, fname)

    if os.path.isfile(fpath) and fname.lower().endswith((".wav", ".mp3", ".flac", ".ogg")):
        try:
            # `path=` replaces the deprecated `filename=` alias, which librosa
            # warns will be removed in version 1.0.
            duration = librosa.get_duration(path=fpath)
            file_durations[fname] = duration
            total_duration += duration
        except Exception as e:
            # Best effort: an unreadable file is reported and left out of the totals.
            print(f"Skipping {fname}, error: {e}")

# Convert to hours
total_hours = total_duration / 3600

print(f"✅ Total audio duration: {total_hours:.2f} hours")
print(f"✅ Number of files: {len(file_durations)}")


	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=fpath)


✅ Total audio duration: 14.46 hours
✅ Number of files: 1626


## Train / Validation / Test Split

In [2]:
from sklearn.model_selection import train_test_split
import os
import pandas as pd

ROOT_FOLDER = ''

processed_path = os.path.join(ROOT_FOLDER, "data", "Processed_Files")

# The split manifest is written into the same folder as the samples.
output_csv = os.path.join(processed_path, "file_splits.csv")

# Files in the folder that are not samples. The manifest must be excluded too,
# otherwise re-running this cell would assign "file_splits.csv" to a split.
non_sample_files = {'Transcript.json', os.path.basename(output_csv)}

# os.listdir returns entries in arbitrary order; sorting makes the split
# reproducible for a given random_state. Only regular files are kept so that
# sub-directories never end up in a split.
all_files = sorted(
    f for f in os.listdir(processed_path)
    if f not in non_sample_files and os.path.isfile(os.path.join(processed_path, f))
)

# First split: Train (85%) vs Temp (15%)
train_files, temp_files = train_test_split(all_files, test_size=0.15, random_state=42)

# Second split: Temp (15%) into Validation (7.5%) and Test (7.5%)
val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

# Build DataFrames
df_train = pd.DataFrame({"filename": train_files, "split": "train"})
df_val   = pd.DataFrame({"filename": val_files, "split": "val"})
df_test  = pd.DataFrame({"filename": test_files, "split": "test"})

# Combine all
df = pd.concat([df_train, df_val, df_test], ignore_index=True)

# Save CSV in the same folder
df.to_csv(output_csv, index=False)

# Share of files per split, as a quick check of the 85 / 7.5 / 7.5 target
print(df["split"].value_counts(normalize=True))


split
train    0.849834
val      0.075083
test     0.075083
Name: proportion, dtype: float64


## Copy test-set files

In [18]:
ROOT_FOLDER = ''
# Read the split manifest and keep only the rows assigned to the test split.
df_test = (
    pd.read_csv(os.path.join(ROOT_FOLDER, 'data', 'Processed_Files/file_splits.csv'))
    .loc[lambda frame: frame['split'] == 'test']
)

In [19]:
# Sanity check: (rows, columns) of the test subset
df_test.shape

(226, 2)

In [11]:
from pathlib import Path
import shutil
import pandas as pd

# Copies every file listed in df_test['filename'] from the processed folder
# into a separate folder. df_test must already be in scope with a 'filename' column.

src_dir = Path(os.path.join(ROOT_FOLDER, 'data', 'Processed_Files/'))
dst_dir = Path(os.path.join(ROOT_FOLDER, 'data', 'Processed_Files_2/'))
# Create the destination (and any missing parents); no error if it already exists
dst_dir.mkdir(parents=True, exist_ok=True)

missing = []   # filenames listed in df_test but absent from src_dir
copied = 0     # number of files actually copied

for name in df_test['filename'].astype(str):
    src_path = src_dir / name
    if not src_path.is_file():
        missing.append(name)
        continue
    # copy2 keeps timestamps/metadata; given a directory as the destination,
    # the file is placed inside it under its original name
    shutil.copy2(src_path, dst_dir)
    copied += 1

print(f"Copied {copied} files to {dst_dir}")
if missing:
    print("Missing in source:", missing)


Copied 226 files to data\Processed_Files_2
