In [2]:
import os
import re
import speech_recognition as sr
from moviepy.video.io.VideoFileClip import VideoFileClip
from pydub import AudioSegment
from pydub.silence import split_on_silence

In [3]:
def detect_silence(audio, silence_thresh=-80, min_silence_len=500):
    """Split ``audio`` on silence and lay the resulting chunks end to end.

    The audio is cut with ``split_on_silence`` and every returned chunk is
    turned into a ``(start, end)`` pair, in seconds, where each chunk starts
    exactly where the previous one ended (the first one starts at 0).

    NOTE(review): despite the function name, the spans describe the chunks
    returned by ``split_on_silence`` placed back to back; the gaps between
    them are not represented, so the values are offsets on that compacted
    timeline rather than positions in the original recording.

    Args:
        audio: Audio object accepted by ``split_on_silence``.
        silence_thresh: Threshold forwarded to ``split_on_silence``.
        min_silence_len: Minimum silence length forwarded to
            ``split_on_silence``.

    Returns:
        List of ``(start, end)`` tuples in seconds, one per chunk.
    """
    chunks = split_on_silence(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

    spans = []
    cursor = 0
    for chunk in chunks:
        # Each span begins at the running total and extends by this chunk's length.
        chunk_end = cursor + chunk.duration_seconds
        spans.append((cursor, chunk_end))
        cursor = chunk_end
    return spans

In [5]:
def _build_timestamped_lines(transcript, segments, fallback_time):
    """Format each sentence of ``transcript`` as ``[seconds] Speaker: text``.

    Args:
        transcript: Recognized text; sentences are separated on ``'.'``.
        segments: List of ``(start, end)`` pairs in seconds; the i-th sentence
            is stamped with the midpoint of the i-th pair.
        fallback_time: Timestamp (seconds) used for sentences that have no
            matching segment.

    Returns:
        List of formatted lines, one per non-empty sentence.
    """
    # A trailing or doubled '.' yields empty pieces; skip them so they neither
    # produce blank "Speaker:" lines nor consume a segment.
    sentences = [piece.strip() for piece in transcript.split('.')]
    sentences = [sentence for sentence in sentences if sentence]

    lines = []
    for index, sentence in enumerate(sentences):
        if index < len(segments):
            start_time, end_time = segments[index]
            # Segment bounds are already expressed in seconds: plain midpoint.
            timestamp = (start_time + end_time) / 2.0
        else:
            timestamp = fallback_time
        lines.append(f"[{timestamp:.3f}] Speaker: {sentence}")
    return lines


def transcribe_video(video_path, audio_path="output.wav",
                     output_file_path="Transcripted_Text.txt"):
    """Transcribe the speech of a video and save it with rough timestamps.

    The audio track is written to ``audio_path`` as WAV, converted to mono
    16 kHz, sent to Google Speech Recognition, and the recognized text is
    written to ``output_file_path`` as one ``[seconds] Speaker: sentence``
    line per sentence. Timestamps are approximate: each sentence gets the
    midpoint of the segment with the same index returned by
    ``detect_silence``; sentences beyond the last segment get the total audio
    duration.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path of the intermediate WAV file (overwritten).
        output_file_path: Path of the transcript text file (overwritten).

    Returns:
        None. Progress and failures are reported with ``print``; on failure
        the function returns early and no transcript file is written.
    """
    print("Starting transcription...")

    # Extract audio from the video
    print("Extracting audio from video...")
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # A video without an audio track exposes no audio clip.
            print("The video has no audio track; nothing to transcribe.")
            return
        audio_clip.write_audiofile(audio_path)
    finally:
        # Always release the resources held by the clip.
        video_clip.close()

    # Check if audio file was created
    if not os.path.exists(audio_path) or os.path.getsize(audio_path) == 0:
        print("Audio extraction failed or the audio file is empty!")
        return
    print(f"Audio file saved to: {audio_path}")

    # Load audio with pydub
    print("Loading audio file...")
    audio = AudioSegment.from_wav(audio_path)

    # Convert to mono and set frame rate
    audio = audio.set_channels(1).set_frame_rate(16000)
    # export() hands back the open output file; close it before the file is
    # reopened for recognition below.
    audio.export(audio_path, format="wav").close()

    silence_segments = detect_silence(audio, silence_thresh=-80, min_silence_len=500)
    print("Detected silence segments:", silence_segments)

    # Initialize the recognizer
    recognizer = sr.Recognizer()

    # Transcribe the audio
    try:
        print("Transcribing audio...")
        with sr.AudioFile(audio_path) as source:
            audio_data = recognizer.record(source)
            transcript = recognizer.recognize_google(audio_data)
            print("Transcription successful!")
            print("Transcript:", transcript)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand the audio.")
        return
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return
    except Exception as e:
        # Deliberate best-effort: report anything unexpected and stop.
        print(f"An unexpected error occurred during transcription: {e}")
        return

    # Generate the transcription text with timestamps
    print("Generating transcription with timestamps...")
    transcription_with_timestamps = _build_timestamped_lines(
        transcript, silence_segments, audio.duration_seconds)

    # Save the transcription to a text file; UTF-8 so non-ASCII text is
    # written the same way regardless of the platform default encoding.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(transcription_with_timestamps))

    print(f"Transcription saved to: {output_file_path}")

# Run the transcription pipeline on a sample video.
# NOTE(review): this is an absolute, machine-specific Windows path; change it
# (or make it configurable) before running the notebook on another machine.
video_path = "C:\\Users\\Madhu\\audio1.mp4"
transcribe_video(video_path)


Starting transcription...
Extracting audio from video...
MoviePy - Writing audio in output.wav


                                                                                                                       

MoviePy - Done.
Audio file saved to: output.wav
Loading audio file...
Detected silence segments: [(0, 326.428), (326.428, 329.765), (329.765, 331.568), (331.568, 335.534), (335.534, 337.84999999999997), (337.84999999999997, 340.207), (340.207, 341.693), (341.693, 343.078), (343.078, 345.37899999999996), (345.37899999999996, 347.51599999999996), (347.51599999999996, 349.655), (349.655, 350.628), (350.628, 352.178), (352.178, 353.099), (353.099, 355.149), (355.149, 357.83), (357.83, 359.231), (359.231, 361.028), (361.028, 364.677), (364.677, 368.96000000000004), (368.96000000000004, 370.973), (370.973, 372.447), (372.447, 374.574), (374.574, 378.584), (378.584, 380.85), (380.85, 384.87), (384.87, 386.996), (386.996, 389.251), (389.251, 390.286), (390.286, 392.162), (392.162, 395.68699999999995), (395.68699999999995, 405.13199999999995), (405.13199999999995, 418.37199999999996), (418.37199999999996, 430.97299999999996), (430.97299999999996, 446.57399999999996), (446.57399999999996, 458.94