In [None]:
import os
import shutil
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import pandas as pd
import re
from pydub import AudioSegment
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Load error corrections from an Excel file
def charger_corrections(chemin_fichier):
    """Load the error -> correction table from an Excel workbook.

    The workbook must contain the columns ``erreur`` (text to look for) and
    ``correction`` (text to substitute).

    Args:
        chemin_fichier: Path to the Excel file.

    Returns:
        dict[str, str]: Mapping of each error string to its correction.

    Raises:
        KeyError: If either required column is missing from the sheet.
    """
    df = pd.read_excel(chemin_fichier)

    colonnes_manquantes = sorted({'erreur', 'correction'} - set(df.columns))
    if colonnes_manquantes:
        raise KeyError(
            f"Missing column(s) in {chemin_fichier}: {', '.join(colonnes_manquantes)}"
        )

    # Blank cells are read as NaN; a NaN key or value is not a string and
    # would fail later when handed to the regex substitution, so incomplete
    # rows are ignored here.
    df = df.dropna(subset=['erreur', 'correction'])

    # Cells holding numbers come back as int/float; normalise everything to
    # str so the table can be used directly for text substitution.
    return {
        str(erreur): str(correction)
        for erreur, correction in zip(df['erreur'], df['correction'])
    }

# Replace errors in text with corrections (case-insensitive)
def nettoyer_texte(texte, corrections):
    """Apply every correction to *texte*, matching errors case-insensitively.

    Both the error and its correction are treated as literal text: the error
    is regex-escaped, and the correction is inserted verbatim (backslashes in
    it are not interpreted as regex escapes or group references).

    Args:
        texte: The text to clean.
        corrections: Mapping of error string -> replacement string.
            Corrections are applied in the mapping's iteration order.

    Returns:
        The text with all corrections applied.
    """
    for erreur, correction in corrections.items():
        # An empty pattern matches between every character and would insert
        # the correction everywhere; such an entry is never intended.
        if not erreur:
            continue
        # Passing a callable makes re.sub use the replacement literally
        # instead of parsing it as a template (where "\1", "\d", ... are
        # either group references or errors).
        texte = re.sub(
            re.escape(erreur),
            lambda _match, remplacement=correction: remplacement,
            texte,
            flags=re.IGNORECASE,
        )
    return texte

# Ensure the audio is in the required format (16 kHz, mono, WAV)
def preprocess_audio(file_path):
    """Return a path to a 16 kHz mono WAV version of *file_path*.

    If the input is already a ``.wav`` file at 16 kHz mono it is used as is.
    Otherwise the audio is converted and written to a new temporary WAV file.

    Args:
        file_path: Path to the source audio file.

    Returns:
        tuple: ``(path, converted)``. ``path`` is the file to transcribe and
        ``converted`` is True when ``path`` is a newly created temporary file
        that the caller should delete once done. On any failure the result is
        ``(None, False)``.
    """
    import tempfile  # only needed here; kept local so the block is self-contained

    output_audio_file = None
    try:
        AudioSegment.ffmpeg = "/opt/homebrew/bin/ffmpeg" # remove as needed
        AudioSegment.ffprobe = "/opt/homebrew/bin/ffprobe" # remove as needed
        audio = AudioSegment.from_file(file_path)
        print(f"Original sample rate: {audio.frame_rate}, channels: {audio.channels}")

        # Decide from the actual audio properties, not just the extension,
        # whether the source can be used unchanged.
        already_in_format = (
            file_path.lower().endswith('.wav')
            and audio.frame_rate == 16000
            and audio.channels == 1
        )

        # Convert to mono and resample to 16 kHz
        audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)
        print(f"Processed sample rate: {audio.frame_rate}, channels: {audio.channels}")

        if already_in_format:
            return file_path, False

        # Export to a temporary file OUTSIDE the source directory. Writing the
        # .wav next to the source would make a directory watcher see it as a
        # newly dropped recording and queue it for transcription as well.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        fd, output_audio_file = tempfile.mkstemp(prefix=f"{stem}_", suffix=".wav")
        os.close(fd)  # pydub opens the path itself; only the name is needed
        audio.export(output_audio_file, format="wav")
        return output_audio_file, True
    except Exception as e:
        print(f"Error processing audio file {file_path}: {e}")
        # Do not leave a half-written temporary file behind.
        if output_audio_file is not None:
            try:
                os.remove(output_audio_file)
            except OSError:
                pass
        return None, False

class AudioFileHandler(FileSystemEventHandler):
    """Transcribe audio files as they appear in the watched directory.

    For each new ``.wav`` / ``.mp3`` / ``.m4a`` file the handler:
    preprocesses it, writes a status file next to it while work is in
    progress, runs the speech-recognition pipeline, applies the text
    corrections, saves the transcript to ``output_dir`` and finally moves the
    source file to ``processed_dir``.
    """

    # Extensions (lower-case) that are treated as audio to transcribe.
    AUDIO_EXTENSIONS = ('.wav', '.mp3', '.m4a')

    def __init__(self, pipeline, corrections, output_dir, processed_dir):
        """Store the collaborators used by :meth:`process_file`.

        Args:
            pipeline: Callable invoked as ``pipeline(path, return_timestamps=True)``
                and returning a mapping with a ``'text'`` entry.
            corrections: Mapping of error string -> correction string.
            output_dir: Directory where transcripts are written.
            processed_dir: Directory the source audio is moved to on success.
        """
        super().__init__()
        self.pipeline = pipeline
        self.corrections = corrections
        self.output_dir = output_dir
        self.processed_dir = processed_dir

    def on_created(self, event):
        """Process newly created files; directory events are ignored."""
        if not event.is_directory:
            self.process_file(event.src_path)

    @staticmethod
    def _remove_quietly(path):
        """Delete *path*; return True if it was removed, never raise."""
        try:
            os.remove(path)
            return True
        except FileNotFoundError:
            return False
        except OSError as e:
            print(f"Could not remove {path}: {e}")
            return False

    def process_file(self, file_path):
        """Transcribe one audio file; non-audio paths are ignored.

        Errors are reported on stdout rather than raised, so one bad file
        does not stop the watcher. The status file and any temporary
        converted audio are removed whether or not transcription succeeds;
        the source file is only moved to ``processed_dir`` on success.
        """
        if not file_path.lower().endswith(self.AUDIO_EXTENSIONS):
            return

        # Events are handled one after another, so by the time this one is
        # reached the file may already have been moved or deleted (e.g. a
        # temporary file created while an earlier recording was processed).
        if not os.path.isfile(file_path):
            print(f"Skipping {file_path}: file no longer exists")
            return

        print(f"Processing file: {file_path}")

        preprocessed_file = None
        converted = False
        status_file = None
        try:
            # Preprocess the audio file
            preprocessed_file, converted = preprocess_audio(file_path)
            if preprocessed_file is None:
                return

            # Get the audio length for status estimation
            audio_length = AudioSegment.from_file(preprocessed_file).duration_seconds
            estimated_time = int(audio_length / 60)

            # Generate a sanitized name for status file
            sanitized_basename = re.sub(r'[^\w\-_\. ]', '_', os.path.basename(file_path))
            status_file = os.path.join(
                os.path.dirname(file_path),
                f"transcription started for {sanitized_basename} - estimated run time {estimated_time} mins.txt"
            )
            with open(status_file, "w", encoding="utf-8") as f:
                f.write(f"Transcription for {os.path.basename(file_path)} started.\n")
                f.write(f"Estimated time: {estimated_time} mins.\n")
            print(f"Created status file: {status_file}")

            # Transcribe the audio file using Whisper v3 pipeline
            result = self.pipeline(preprocessed_file, return_timestamps=True)

            # Extract and clean the transcription text
            transcription_text = nettoyer_texte(result['text'], self.corrections)

            # Save the transcription; explicit UTF-8 so accented characters
            # are written correctly whatever the platform default encoding is.
            transcript_file = os.path.join(self.output_dir, os.path.basename(file_path) + '.txt')
            with open(transcript_file, "w", encoding="utf-8") as f:
                f.write(transcription_text)
            print(f"Transcription saved to: {transcript_file}")

            # Move the processed file to another directory
            shutil.move(file_path, os.path.join(self.processed_dir, os.path.basename(file_path)))

        except RuntimeError as e:
            print(f"RuntimeError during transcription for {file_path}: {e}")
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
        finally:
            # Clean up on success AND failure so a crashed transcription does
            # not leave a stale "transcription started" marker or a temporary
            # converted file behind.
            if converted and preprocessed_file is not None:
                self._remove_quietly(preprocessed_file)
            if status_file is not None and self._remove_quietly(status_file):
                print(f"Deleted status file: {status_file}")

if __name__ == "__main__":
    # Directories for monitoring, output, and processed files
    watch_dir = "../audio_files_drop"
    output_dir = "../transcripts"
    processed_dir = "../processed_audios"

    # Create all three folders up front, including the watched one, so the
    # observer is never scheduled on a directory that does not exist.
    for directory in (watch_dir, output_dir, processed_dir):
        os.makedirs(directory, exist_ok=True)

    # Load corrections from Excel file first: it is cheap, and a missing or
    # malformed file is then reported before the large model is loaded.
    corrections_file = "../scripts/whisper_errors.xlsx"
    corrections = charger_corrections(corrections_file)

    # Device and model setup: half precision on GPU, full precision on CPU
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "openai/whisper-large-v3"

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )

    # Set up file observer (top level of watch_dir only)
    observer = Observer()
    event_handler = AudioFileHandler(pipe, corrections, output_dir, processed_dir)
    observer.schedule(event_handler, path=watch_dir, recursive=False)

    observer.start()
    print(f"Watching directory: {watch_dir}")

    try:
        # Keep the main thread alive; the observer works in its own thread.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl-C / kernel interrupt is the normal way to stop watching.
        pass
    finally:
        # Always shut the observer thread down, whatever ended the loop.
        observer.stop()
        observer.join()


Device set to use cpu


Watching directory: ../audio_files_drop
Processing file: /Volumes/@AEE/transcriber/audio_files_drop/08_ELEVEUR05_1124.MP3
Original sample rate: 44100, channels: 2
Processed sample rate: 16000, channels: 1
Created status file: /Volumes/@AEE/transcriber/audio_files_drop/transcription started for 08_ELEVEUR05_1124.MP3 - estimated run time 46 mins.txt


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Transcription saved to: ../transcripts/08_ELEVEUR05_1124.MP3.txt
Deleted status file: /Volumes/@AEE/transcriber/audio_files_drop/transcription started for 08_ELEVEUR05_1124.MP3 - estimated run time 46 mins.txt
Processing file: /Volumes/@AEE/transcriber/audio_files_drop/08_ELEVEUR05_1124.wav
Error processing audio file /Volumes/@AEE/transcriber/audio_files_drop/08_ELEVEUR05_1124.wav: [Errno 2] No such file or directory: '/Volumes/@AEE/transcriber/audio_files_drop/08_ELEVEUR05_1124.wav'
Processing file: /Volumes/@AEE/transcriber/audio_files_drop/09_CHASSEUR02_1124.MP3
Original sample rate: 44100, channels: 2
Processed sample rate: 16000, channels: 1
Created status file: /Volumes/@AEE/transcriber/audio_files_drop/transcription started for 09_CHASSEUR02_1124.MP3 - estimated run time 61 mins.txt




Transcription saved to: ../transcripts/09_CHASSEUR02_1124.MP3.txt
Deleted status file: /Volumes/@AEE/transcriber/audio_files_drop/transcription started for 09_CHASSEUR02_1124.MP3 - estimated run time 61 mins.txt
Processing file: /Volumes/@AEE/transcriber/audio_files_drop/10_ELEVEUR06_1124.MP3
Original sample rate: 44100, channels: 2
Processed sample rate: 16000, channels: 1
Created status file: /Volumes/@AEE/transcriber/audio_files_drop/transcription started for 10_ELEVEUR06_1124.MP3 - estimated run time 31 mins.txt




Transcription saved to: ../transcripts/10_ELEVEUR06_1124.MP3.txt
Deleted status file: /Volumes/@AEE/transcriber/audio_files_drop/transcription started for 10_ELEVEUR06_1124.MP3 - estimated run time 31 mins.txt
Processing file: /Volumes/@AEE/transcriber/audio_files_drop/11_ELEVEUR07_1124.MP3
Original sample rate: 44100, channels: 2
Processed sample rate: 16000, channels: 1
Created status file: /Volumes/@AEE/transcriber/audio_files_drop/transcription started for 11_ELEVEUR07_1124.MP3 - estimated run time 91 mins.txt


