<a href="https://colab.research.google.com/github/KrishnaPothula/PyDub-and-Whisper-Audio-Processing-and-NLP/blob/main/Segmenting_and_Transcribing_Audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pydub

In [None]:
pip install openai-whisper

In [None]:
# No installation needed: `json` ships with the Python standard library and
# is imported directly below. (There is no PyPI package to install for it, so
# a `pip install json` cell only produces an error.)

In [None]:
# Import the required libraries
from pydub import AudioSegment
from whisper import Whisper
import json

In [None]:
# Load the audio file
audio = AudioSegment.from_file("Test.mp3")

# Load a pretrained Whisper checkpoint.
# The Whisper class is not meant to be built from a bare integer: its
# constructor expects a model-dimensions object and the result carries no
# trained weights. load_model() fetches (and caches) a checkpoint and returns
# a model that is ready to transcribe.
from whisper import load_model

# NOTE: the global is deliberately still called `whisper` because the
# transcription cell below calls `whisper.transcribe(...)` on it.
whisper = load_model("base")  # other checkpoint names include "tiny", "small", "medium", "large"



In [None]:
# Define a function to detect silence in the audio
def detect_silence(audio, min_silence_len=1000, silence_thresh=-16):
    """Return the silent stretches of *audio* as ``(start_ms, end_ms)`` tuples.

    Args:
        audio: A pydub ``AudioSegment`` (anything supporting ``len()`` is
            enough for the short-input guard).
        min_silence_len: Minimum duration, in milliseconds, a quiet stretch
            must last to be reported.
        silence_thresh: Loudness, in dBFS, at or below which audio counts as
            silent.
            NOTE(review): -16 dBFS is a fairly loud cut-off; real recordings
            usually need a lower value (for example -40) -- confirm against
            the actual input.

    Returns:
        A list of ``(start, end)`` tuples in milliseconds, in the order
        reported by pydub. Empty when no qualifying silence exists.
    """
    # A clip shorter than the minimum silence length cannot contain a
    # qualifying silence, so there is nothing to scan.
    if len(audio) < min_silence_len:
        return []

    # AudioSegment has no find()/rfind(); pydub's silence module provides the
    # actual detector. It is imported under an alias because this function
    # shares the detector's name.
    from pydub import silence as pydub_silence

    ranges = pydub_silence.detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )
    # pydub reports [start, end] lists; callers here unpack tuples.
    return [(start, end) for start, end in ranges]



In [None]:
# Define a function to segment the audio by speaker
def segment_audio(audio, silences):
    """Split *audio* at the given silences, alternating speaker labels.

    The audio between consecutive silences becomes one segment. Labels
    alternate starting with ``"caller"``: every silence that actually
    separates two pieces of audio is treated as a change of speaker.

    Args:
        audio: A sliceable sequence supporting ``len()`` (a pydub
            ``AudioSegment`` in this notebook).
        silences: Iterable of ``(start, end)`` positions, expected in
            ascending order, in the same units used to slice *audio*.

    Returns:
        A list of ``(segment, speaker)`` tuples, where ``speaker`` is
        ``"caller"`` or ``"callee"``. Empty segments are never emitted.
    """
    segments = []
    prev_end = 0
    speaker = "caller"
    for start, end in silences:
        # A leading silence, or two silences that touch or overlap, leave no
        # audio in between. Emitting an empty segment there would also flip
        # the speaker label even though nobody spoke.
        if start > prev_end:
            segments.append((audio[prev_end:start], speaker))
            speaker = "callee" if speaker == "caller" else "caller"
        # Never move backwards if a silence ends before the previous one did.
        prev_end = max(prev_end, end)
    # Whatever follows the final silence belongs to the current speaker.
    if prev_end < len(audio):
        segments.append((audio[prev_end:], speaker))
    return segments



In [None]:
# Define a function to transcribe the audio segments using Whisper
def transcribe_segments(segments):
    """Transcribe each audio segment with the module-level Whisper model.

    Args:
        segments: Iterable of ``(segment, speaker)`` pairs, where ``segment``
            is a pydub ``AudioSegment``.

    Returns:
        A list of ``(text, speaker)`` tuples in input order, where ``text`` is
        the transcript string. Zero-length segments yield an empty string
        without invoking the model.
    """
    transcriptions = []
    for segment, speaker in segments:
        if len(segment) == 0:
            # Nothing to transcribe; keep the entry so speakers stay aligned.
            transcriptions.append(("", speaker))
            continue
        # transcribe() returns a result dict; the transcript lives under
        # "text". Storing the whole dict would break the later text.strip().
        result = whisper.transcribe(_segment_to_waveform(segment))
        transcriptions.append((result["text"], speaker))
    return transcriptions


def _segment_to_waveform(segment):
    """Convert a pydub AudioSegment to the float32 waveform Whisper accepts."""
    # Imported lazily; numpy is already a dependency of openai-whisper.
    import numpy as np

    # Whisper works on mono 16 kHz audio. Forcing 16-bit samples makes the
    # scale factor below exact.
    pcm = segment.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    samples = np.array(pcm.get_array_of_samples(), dtype=np.float32)
    # Scale signed 16-bit integers into the [-1.0, 1.0) float range.
    return samples / 32768.0



In [None]:
# Define a function to create a JSON file from the transcribed segments
def create_json_file(transcriptions, filename):
    """Write the transcribed conversation to *filename* as JSON.

    Args:
        transcriptions: Iterable of ``(text, speaker)`` string pairs.
        filename: Path of the JSON file to create or overwrite.

    The file holds a single object with a ``"messages"`` list. Each entry
    carries the whitespace-stripped ``"text"`` and the capitalized
    ``"speaker"`` label, in input order.
    """
    messages = [
        {"text": spoken.strip(), "speaker": who.capitalize()}
        for spoken, who in transcriptions
    ]
    payload = {"messages": messages}
    with open(filename, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, indent=4)



In [None]:
# Run the full pipeline on the loaded `audio`:
#   1. find the silent ranges (using detect_silence's default thresholds),
#   2. cut the audio at those ranges, alternating caller/callee labels,
#   3. transcribe every segment,
#   4. write the labelled transcript to conversation.json.
silences = detect_silence(audio)
segments = segment_audio(audio, silences)
transcriptions = transcribe_segments(segments)
create_json_file(transcriptions, "conversation.json")

# Print a message to indicate the task is done
# (only reached if none of the calls above raised).
print("The conversation has been segmented and transcribed successfully. The JSON file has been created.")
