


## Importing Required Libraries




In [None]:
! pip install pandas numpy pytube pydub openai-whisper --quiet langchain_experimental langchain_openai

# Download Video and Extract Audio

In [None]:
from pytube import YouTube

In [None]:
# Handle for the source video; the video and audio streams are selected from it in the next cells.
link= YouTube("https://www.youtube.com/watch?v=Sby1uJ_NFIY")

In [None]:
# Stream chosen by pytube's get_highest_resolution(); the transcription cells
# further down read only the audio download, not this video file.
video= link.streams.get_highest_resolution()

In [None]:
# Save the video stream into the "video" directory; the cell displays the returned file path.
video.download(output_path='video')

'/content/video/Sarvam AI Wants To Leverage AI In Health & Education Says Co Founder Vivek Raghavan With OpenHathi.mp4'

In [None]:
# Audio-only stream of the same video; this is the file Whisper transcribes later.
audio=link.streams.get_audio_only()

In [None]:
# Save the audio stream into the "audio" directory; the cell displays the returned file path.
audio.download(output_path='audio')

'/content/audio/Sarvam AI Wants To Leverage AI In Health & Education Says Co Founder Vivek Raghavan With OpenHathi.mp4'

# Transcription of Audio

### Explanation of the Chosen Model: Whisper

**Whisper** is an advanced speech recognition model developed by OpenAI, known for its high accuracy and versatility. Here's a detailed explanation of why Whisper is an excellent choice for transcription, along with techniques used to enhance the quality of the transcription:

#### Why Whisper?

1. **High Accuracy**:
   - Whisper is trained on a large and diverse dataset, which helps it achieve high accuracy in various languages and accents. It leverages a transformer-based architecture, which is known for its effectiveness in handling sequential data like audio.

2. **Robustness to Noise**:
   - Whisper is designed to perform well even in noisy environments, making it suitable for real-world applications where audio quality can be variable.

3. **Multilingual Support**:
   - Whisper supports transcription in multiple languages, making it a versatile tool for global applications.

4. **Automatic Punctuation and Formatting**:
   - The model is capable of adding punctuation and formatting to the transcriptions, which significantly improves readability and usability.

#### Techniques to Enhance the Quality of Transcription

While Whisper is inherently powerful, several techniques can further enhance the transcription quality:

1. **Preprocessing the Audio**:
   - **Noise Reduction**: Applying noise reduction to the audio before transcription can improve accuracy. A dedicated library such as `noisereduce` can suppress background noise, while `pydub` is useful for simpler clean-up such as high-/low-pass filtering.
   - **Normalization**: Normalizing the audio volume ensures consistent input levels, which can help the model perform better.

2. **Chunking the Audio**:
   - **Short Segments**: Splitting the audio into shorter segments (e.g., 15 seconds) can help manage long audio files more effectively and reduce errors due to model limitations on input length.
   - **Speech-Aware Segmentation**: Using voice activity detection (VAD) to split the audio at pauses rather than at fixed intervals ensures that each chunk contains meaningful speech and that words are not cut in half, improving the model's focus and accuracy.

3. **Language Model Integration**:
   - **Custom Language Models**: Integrating domain-specific language models can help the transcription system better handle jargon, proper names, and context-specific terms. This involves fine-tuning Whisper or using external language models to post-process the transcription.

4. **Post-processing the Transcription**:
   - **Spell Check and Grammar Correction**: Using tools like `nltk` or `spaCy` for post-processing can help correct spelling and grammar errors in the transcription.
   - **Manual Review**: For critical applications, manual review and correction of the transcriptions can ensure the highest accuracy.




In [None]:

import whisper
# Load the "base" Whisper checkpoint.
model = whisper.load_model("base")

# Transcribe the downloaded audio track, forcing English transcription.
result = model.transcribe(
    "/content/audio/Sarvam AI Wants To Leverage AI In Health & Education Says Co Founder Vivek Raghavan With OpenHathi.mp4",
    language="en",
    task="transcribe",
)

# Persist only the plain transcript text; segment timings are not written here.
with open("transcript.txt", "w") as f:
    f.write(result["text"])


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 63.5MiB/s]


# Time-Align Transcript with Audio

In [None]:
import whisper
import pandas as pd
# Load the model
model = whisper.load_model("base")  # You can choose a different model

# Transcribe the audio file; the result carries per-segment timing
result = model.transcribe(
    "/content/audio/Sarvam AI Wants To Leverage AI In Health & Education Says Co Founder Vivek Raghavan With OpenHathi.mp4",
    language="en",
    task="transcribe",
)

# Keep the start time, end time and text of every segment (one row each)
segments = [
    {'start': piece['start'], 'end': piece['end'], 'text': piece['text']}
    for piece in result['segments']
]

# Write the time-aligned transcript to CSV without the index column
df = pd.DataFrame(segments)
df.to_csv('transcription_segments.csv', index=False)



# Semantic Chunking of Data

### Using LangChain with OpenAI embeddings

In [None]:
!pip install --quiet langchain_experimental langchain_openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.5/199.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.9/307.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.2/121.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
# Read the full transcript text back from disk; this single long string is
# what the semantic splitter below breaks into chunks.
with open("/content/transcript.txt") as f:
    transcript = f.read()

In [None]:
#create text splitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

In [None]:
import getpass
import os

# Never hard-code the secret in the notebook, where it would be saved and
# shared with the file. Reuse a key that is already exported in the
# environment; otherwise prompt for it without echoing it to the output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Splitter that embeds the text with OpenAI embeddings and places chunk
# boundaries using the "percentile" breakpoint threshold type.
text_splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
)

In [None]:
# Split the whole transcript (passed as a one-element list) into documents.
docs = text_splitter.create_documents([transcript])

In [None]:
import pickle
# Serialize the documents produced by the semantic splitter.
# NOTE(review): the naive-approach section further down writes to this same
# file name, so this output is overwritten if both sections are run.
with open('semantic_chunks.pkl', 'wb') as file:
    pickle.dump(docs, file)

### Using a naive approach

In [None]:
import nltk
import whisper
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

def _build_chunk(chunk_id, start, end, sentences):
    """Return the output record for one finished chunk."""
    return {
        "chunk_id": chunk_id,
        "chunk_length": end - start,
        "text": " ".join(sentences),
        "start_time": start,
        "end_time": end,
    }


def chunk_audio_text(
    max_chunk_length=15,
    segments=None,
    audio_path="/content/audio/Sarvam AI Wants To Leverage AI In Health & Education Says Co Founder Vivek Raghavan With OpenHathi.mp4",
    model_name="base",
    sentence_splitter=None,
):
    """Group transcript sentences into time-aligned chunks of bounded duration.

    Each transcript segment is split into sentences, and the segment's time
    span is shared among its sentences in proportion to their character
    counts. Consecutive sentences are then packed into a chunk until adding
    the next one would make the chunk's span (end - start) exceed
    ``max_chunk_length``.

    Args:
        max_chunk_length: Maximum chunk span in seconds. A single sentence
            longer than this still becomes its own (over-long) chunk.
        segments: Optional iterable of dicts with ``"start"``, ``"end"`` and
            ``"text"`` keys (the shape of Whisper's ``result["segments"]``).
            When omitted, ``audio_path`` is transcribed with Whisper.
        audio_path: Audio file to transcribe when ``segments`` is omitted.
        model_name: Whisper checkpoint to load when ``segments`` is omitted.
        sentence_splitter: Callable mapping a string to a list of sentences.
            Defaults to NLTK's ``sent_tokenize``.

    Returns:
        A list of dicts with keys ``chunk_id`` (1-based), ``chunk_length``,
        ``text``, ``start_time`` and ``end_time``. Empty if there is no text.
    """
    if segments is None:
        # Only pay for loading the model and transcribing when the caller
        # has not already supplied segments.
        model = whisper.load_model(model_name)
        transcription_result = model.transcribe(
            audio_path, language="en", task="transcribe"
        )
        segments = transcription_result["segments"]
    if sentence_splitter is None:
        sentence_splitter = sent_tokenize

    chunks = []
    chunk_sentences = []  # sentences collected for the chunk being built
    chunk_start = None
    chunk_end = None

    for segment in segments:
        segment_start = segment["start"]
        segment_duration = segment["end"] - segment_start

        # Normalise whitespace and drop empty pieces so that blank segments
        # neither divide by zero nor create empty chunks.
        sentences = [piece.strip() for piece in sentence_splitter(segment["text"])]
        sentences = [piece for piece in sentences if piece]
        total_chars = sum(len(piece) for piece in sentences)
        if not total_chars:
            continue

        chars_seen = 0
        for sentence in sentences:
            # Place the sentence inside the segment by cumulative character
            # share, so later sentences start where earlier ones ended.
            sentence_start = segment_start + segment_duration * chars_seen / total_chars
            chars_seen += len(sentence)
            sentence_end = segment_start + segment_duration * chars_seen / total_chars

            # Close the current chunk if this sentence would push its span
            # past the limit; never emit a chunk without text.
            if chunk_sentences and sentence_end - chunk_start > max_chunk_length:
                chunks.append(
                    _build_chunk(len(chunks) + 1, chunk_start, chunk_end, chunk_sentences)
                )
                chunk_sentences = []

            if not chunk_sentences:
                chunk_start = sentence_start
            chunk_sentences.append(sentence)
            chunk_end = sentence_end

    if chunk_sentences:
        chunks.append(
            _build_chunk(len(chunks) + 1, chunk_start, chunk_end, chunk_sentences)
        )

    return chunks

# Build the chunks with the default 15-second limit.
audio_text_chunks = chunk_audio_text()

# Show every chunk record on its own line.
for chunk in audio_text_chunks:
    print(chunk)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 164MiB/s]


{'chunk_id': 1, 'chunk_length': 19.382857142857144, 'text': "Congratulations to you Mr. Raghavan for that. Thank you so much for joining us.  Over to you.  Hi everybody. How are you?  Okay I am not hearing this at all. It's like a post lunch energy downer or something.  Let's hear it. Are you guys awake?", 'start_time': 0, 'end_time': 19.382857142857144}
{'chunk_id': 2, 'chunk_length': 16.16, 'text': "All right you better be because we have a superstar guest here.  You heard the 41 million dollars and I didn't hear honestly anything she said after that.  So we're going to ask for about 40 million dollars from him by the end of this conversation.", 'start_time': 21.84, 'end_time': 38.0}
{'chunk_id': 3, 'chunk_length': 16.479999999999997, 'text': "But let's get started. I want to introduce Vivek and Pratius, she's co-founder who's not here.  We wanted to start with a playing a video of what OpenHathe does. I encourage all of you to go  to the website, www.severalm.ai and check it out.", 

In [None]:
import pickle

# Pickle the list of naive chunk records.
# NOTE(review): this is the same file name the LangChain section writes its
# documents to, so that earlier file is replaced here - confirm this is intended.
file_path = 'semantic_chunks.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(audio_text_chunks, file)

print(f"Semantic chunks saved to {file_path}")
