<a href="https://colab.research.google.com/github/MikeCorv/WhisperFineTuning/blob/main/GoogleFleursAudio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- BLOCK 1: SETUP ---
!pip install transformers librosa soundfile accelerate


In [None]:
# Colab helper module used to attach Google Drive to the runtime.
from google.colab import drive

print("Mounting Google Drive...")
# SAVE_PATH (defined in the configuration cell) lives under this mount point.
drive.mount("/content/drive")

In [None]:
!pip install datasets==3.6.0

In [None]:
import datasets

# Report which `datasets` release the kernel actually imported.
installed_version = datasets.__version__
print(f"Current library version: {installed_version}")

In [None]:
from datasets import (
    Audio,
    concatenate_datasets,
    load_dataset,
)

# --- Configuration ---
# Dataset identifier handed to load_dataset.
DATASET_ID = "google/fleurs"
# Dataset configuration name (second argument of load_dataset).
LANGUAGE = "it_it"
# Destination directory for the processed splits, on the mounted Drive.
SAVE_PATH = "/content/drive/MyDrive/fleurs_it_processed"

print(f"Downloading {DATASET_ID}...")

In [None]:
# Load the "train" split of the configured dataset / language.
raw_train = load_dataset(
    DATASET_ID,
    LANGUAGE,
    split="train",
    trust_remote_code=True,
)


In [None]:
# Dataset summary on the first line(s), row count on the last line.
print(raw_train, len(raw_train), sep="\n")

In [None]:
# Peek at the first training example to see the raw field layout.
first_train_example = raw_train[0]
print(first_train_example)

In [None]:
# Print each field of the first training example on its own line.
first_example = raw_train[0]
for feature in first_example:
    value = first_example[feature]
    print(f"{feature}: {value}")


In [None]:
# Print each entry of the first example's "audio" field on its own line.
audio_fields = raw_train[0]["audio"]
for feature in audio_fields:
    value = audio_fields[feature]
    print(f"{feature}: {value}")

In [None]:
# Load the "validation" split of the same dataset / language.
raw_val = load_dataset(
    DATASET_ID,
    LANGUAGE,
    split="validation",
    trust_remote_code=True,
)

In [None]:
# Peek at the first validation example.
first_val_example = raw_val[0]
print(first_val_example)

In [None]:
# Pool the train and validation rows into a single dataset (train first);
# a fresh test split is carved out of this pool later in the notebook.
splits_to_merge = [raw_train, raw_val]
dataset = concatenate_datasets(splits_to_merge)

In [None]:
# Whisper is trained exclusively on 16 kHz audio, so the audio column is
# cast to that sampling rate.

print("Resampling to 16kHz...")
whisper_audio = Audio(sampling_rate=16_000)
dataset = dataset.cast_column("audio", whisper_audio)

In [None]:
import re
import unicodedata

# Apostrophe look-alikes (curly quotes, modifier apostrophe) mapped to "'".
_APOSTROPHE_VARIANTS = re.compile("[\u2018\u2019\u02bc]")
# Everything outside the kept alphabet: ASCII letters, digits, the Italian
# grave/acute vowels, the apostrophe and whitespace.
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9àèéìòùÀÈÉÌÒÙ'\s]")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text):
    """Normalise a transcript into a lowercase training label.

    Args:
        text: Raw transcript string; ``None`` or an empty string is accepted.

    Returns:
        The cleaned, lowercased string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # Compose "e" + combining accent into a single "è"-style code point first;
    # otherwise the combining mark is stripped below and the accent is lost.
    text = unicodedata.normalize("NFC", text)
    # Keep elisions such as "l’acqua" intact instead of fusing them to "lacqua".
    text = _APOSTROPHE_VARIANTS.sub("'", text)
    text = _DISALLOWED_CHARS.sub("", text)
    # Deleting punctuation can leave several spaces in a row; collapse them.
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.lower().strip()

In [None]:
def prepare_dataset(batch):
    """Add a cleaned transcript and the clip length to one example.

    Reads ``batch["audio"]`` (its "array" and "sampling_rate" entries) and
    ``batch["transcription"]``; writes ``"sentence"`` and ``"duration"`` and
    returns the same mapping that was passed in.
    """
    clip = batch["audio"]
    # The audio entry is stored back under its own key unchanged.
    # NOTE(review): looks redundant for a plain dict - confirm whether the
    # dataset's row object relies on this write before removing it.
    batch["audio"] = clip
    batch["sentence"] = clean_text(batch["transcription"])
    sample_count = len(clip["array"])
    # Number of samples divided by samples-per-second gives seconds.
    batch["duration"] = sample_count / clip["sampling_rate"]
    return batch

In [None]:
print("Processing Audio...")
# Columns dropped from the output; prepare_dataset adds "sentence" and
# "duration" and keeps "audio".
unused_columns = [
    "id",
    "num_samples",
    "path",
    "transcription",
    "raw_transcription",
    "gender",
    "lang_id",
    "language",
    "lang_group_id",
]
dataset = dataset.map(
    prepare_dataset,
    remove_columns=unused_columns,
    num_proc=1,
    desc="Processing",
)

In [None]:
def filter_duration(batch, min_duration=1.0, max_duration=30.0):
    """Tell whether an example's clip length lies strictly inside the bounds.

    Args:
        batch: Mapping with a numeric ``"duration"`` entry (seconds, as
            computed by ``prepare_dataset``).
        min_duration: Exclusive lower bound in seconds (default 1.0).
        max_duration: Exclusive upper bound in seconds (default 30.0).

    Returns:
        ``True`` when ``min_duration < duration < max_duration``.
    """
    return min_duration < batch["duration"] < max_duration

In [None]:
# Row count before the duration filter is applied.
original_count = len(dataset)
print(f"Original Count: {original_count}")

In [None]:
# Keep only the rows accepted by filter_duration, then report what is left.
dataset = dataset.filter(filter_duration)
filtered_count = len(dataset)
print(f"Filtered Count: {filtered_count}")

In [None]:
print("2. Splitting Test Set...")
# 2. Cut off 10% of the data to use for testing later.
# A fixed seed makes the shuffle - and therefore which rows land in the test
# set - identical on every run, so saved results stay comparable.
# NOTE(review): rows are assigned at random; if the corpus holds several
# recordings of the same sentence they can land on both sides of the split -
# confirm whether a grouped split is needed.
SPLIT_SEED = 42
final_split = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [None]:
print(f"3. Saving to Google Drive: {SAVE_PATH}...")
# 3. The most important step: write the split produced by train_test_split
# to SAVE_PATH, which points into the mounted Drive folder, so the processed
# data can be read back from there later.
final_split.save_to_disk(SAVE_PATH)