In [None]:
!pip install moviepy
!pip install assemblyai
!pip install librosa
!pip install pydub
!pip install noisereduce
!pip install soundfile
!pip install huggingface_hub
!pip install transformer
!pip install evaluate
!pip install jiwer
!pip install sentencepiece
!pip install tensorflow_io

In [None]:
from moviepy.editor import *
import librosa
import numpy as np
import matplotlib.pyplot as plt
import os
import assemblyai as aai
import pandas as pd
from pydub import AudioSegment
from pydub.silence import split_on_silence
import noisereduce as nr
import soundfile as sf

In [None]:
def video_to_audio(
    folder_path='C:/Users/Harsh Patel/Desktop/Vintel/videos',
    output_dir='C:/Users/Harsh Patel/Desktop/Vintel/audios',
    extensions=('.mp4', '.mov', '.avi', '.mkv'),
):
    """Extract the audio track of every video in a folder and save it as MP3.

    Args:
        folder_path: Directory scanned (non-recursively) for video files.
        output_dir: Directory that receives the MP3 files; created if missing.
        extensions: File-name suffixes treated as video, matched
            case-insensitively.

    Returns:
        List of the MP3 paths that were written, in processing order.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist (from ``os.listdir``).
    """
    wanted_suffixes = tuple(ext.lower() for ext in extensions)
    # Sorted so the processing order is reproducible; os.listdir order is arbitrary.
    entries = sorted(os.listdir(folder_path))
    os.makedirs(output_dir, exist_ok=True)

    written_paths = []
    for filename in entries:
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        if not filename.lower().endswith(wanted_suffixes):
            continue

        print(f"Processing video file: {filename}")

        # Open the video file and extract audio
        videoclip = VideoFileClip(file_path)
        try:
            audioclip = videoclip.audio
            if audioclip is None:
                # Silent videos expose no audio clip; skip instead of crashing.
                print(f"Skipping {filename}: no audio track found")
                continue

            # splitext removes only the final extension, so "talk.part1.mp4" and
            # "talk.part2.mp4" no longer collapse onto the same output name.
            stem = os.path.splitext(filename)[0]
            audio_output_path = os.path.join(output_dir, f"audiofile_{stem}.mp3")

            # Write the audio file to mp3 format
            audioclip.write_audiofile(audio_output_path, codec="libmp3lame")
        finally:
            # Always release the underlying reader, even if encoding fails.
            videoclip.close()

        print(f"Audio saved as: {audio_output_path}")
        written_paths.append(audio_output_path)

    return written_paths

In [None]:
import os
from datasets import Dataset, DatasetDict
import soundfile as sf

def load_and_preprocess_data(audio_dir, transcript_dir):
    """Load paired audio clips and transcripts into a column-oriented dict.

    Audio files (``.mp3``) and transcript files (``.txt``) are each listed in
    sorted file-name order and paired by position. ``os.listdir`` returns
    entries in arbitrary order, so sorting is what keeps clip *i* matched with
    transcript *i* from run to run.
    NOTE(review): this assumes the two folders use file names that sort in the
    same order - confirm against the actual naming scheme.

    Args:
        audio_dir: Directory containing the ``.mp3`` files.
        transcript_dir: Directory containing the ``.txt`` transcripts.

    Returns:
        Dict with three equally long lists: ``'label'`` (all ``1``),
        ``'audio'`` (arrays returned by ``librosa.load`` with ``sr=16000``,
        ``mono=True``) and ``'transcript'`` (full file contents as strings).
        If the folders hold different numbers of files, the surplus is dropped.
    """
    audio_files = sorted(
        name for name in os.listdir(audio_dir) if name.endswith('.mp3')
    )
    transcript_files = sorted(
        name for name in os.listdir(transcript_dir) if name.endswith('.txt')
    )

    # Only as many pairs as the shorter listing provides.
    num_entries = min(len(audio_files), len(transcript_files))

    data = {'label': [], 'audio': [], 'transcript': []}

    # zip stops at the shorter list, so surplus files are never decoded or read.
    for audio_file, transcript_file in zip(audio_files, transcript_files):
        audio_path = os.path.join(audio_dir, audio_file)
        processed_audio, _ = librosa.load(audio_path, sr=16000, mono=True)
        data['audio'].append(processed_audio)

        transcript_path = os.path.join(transcript_dir, transcript_file)
        with open(transcript_path, 'r') as content:
            data['transcript'].append(content.read())

    # Every example gets the same placeholder label.
    data['label'] = [1] * num_entries

    return data

# Read every audio clip / transcript pair from the sample-data folders.
audio_data1 = load_and_preprocess_data(
    '/content/sample_data/audios',
    '/content/sample_data/transcripts',
)

# Slice each column at position 8: the first eight examples become the
# training split and whatever follows becomes the test split.
train_dict = dict((column, values[:8]) for column, values in audio_data1.items())
test_dict = dict((column, values[8:]) for column, values in audio_data1.items())

# Wrap both splits as Hugging Face datasets inside a single DatasetDict.
train_split = Dataset.from_dict(train_dict)
test_split = Dataset.from_dict(test_dict)
dataset1 = DatasetDict({'train': train_split, 'test': test_split})

print(dataset1)

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline

# Load the processor and the CTC model from the same hub checkpoint name.
# The processor carries the feature extractor and tokenizer used below.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Recording to transcribe.
audio_path = "/audios/audio_1.mp3"

# Build the speech-recognition pipeline from the explicit model parts.
# chunk_length_s=3 asks the pipeline to work through the audio in 3-second chunks.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=3,
)

# Run inference on the file and show the recognised text.
transcription = asr_pipeline(audio_path)
print("Transcription:", transcription['text'])

In [None]:
from transformers import LEDTokenizer, LEDForConditionalGeneration

# Fetch the LED weights and the matching tokenizer from one checkpoint name.
# Note: this rebinds `model`, which the speech-recognition cell also assigns.
model_name = "allenai/led-large-16384"
model = LEDForConditionalGeneration.from_pretrained(model_name)
tokenizer = LEDTokenizer.from_pretrained(model_name)

# Text to summarize: a spoken-video transcript about cosine similarity, stored verbatim in a triple-quoted string
text = """
Our cat teacher is going to teach you cosine similarity and cosine distance how it is used in data science. 
We'll look at some theory and then we'll move into python code. Let's say you are data scientist working for some financial company where on a Google Drive you have bunch of financial documents. Now, you don't know for what is the company associated with each of these documents, but when you open the document, when you read it, you can kind of figure out that this is probably about Apple. 
Why? Because iPhone is mentioned so many times. So when you're reading about Apple financial report, they might mention Galaxy as well because that's their main competitor. But here the ratio of iPhone to Galaxy is iPhone have iPhone is mentioned three times, Galaxy is mentioned one time. So, you know, overall you see iPhone occurrence more, much more than galaxy. 
If your Samsung document, of course, Samsung company document will have more mention of Galaxy than iPhone. 
Now, let's say new document comes in and you don't know what is the company associated with it. 
Here you can again count iPhone and Galaxy, and you find that iPhone is six times mentioned in the document. Galaxy is mentioned two times. Looking at the situation as a data scientist, you can figure out a simple formula that whenever a ratio of iPhone to galaxy is three to one, it should be an Apple document. So now you can auto annotate Apple as a company. This is a common problem in financial institute where you have a document and you want to tag some metadata to it, and we can use some automation, some rules, some coding to auto annotate. You know, you can manually annotate it, but using this particular formula, you can automate the annotation process. Unfortunately, things in real life are quite different. In our document, you know, there might be a mention of iPad. Google Pixel is another competitor. So now how do you come up with your formula? Maybe you say, okay, iPhone to galaxy ratio has to be three to one and iPhone to pixel ratio has to be three to one and iPad to pixel ratio has to be two to one. Wow, that's too complicated. My baby got confused. Well, vector mathematics comes at rescue. This thing can be presented as a vector here. And using vector mathematics, we can figure out document similarity. Let me go back to the simple case that we saw before. How do you represent this as a vector? On x axis, I have iPhone word count. Y axis has galaxy word count. So you see three, two, one. And this is the vector. Vector has magnitude and direction both. And when I have my yellow document, iPhone is six galaxies, two. Now look at the angle between these two arrows, blue and yellow, the angle is zero. So the angle determines the document similarity. If the angle is zero, it means documents are very similar. You might have another green document. You know, iPhone five, Galaxy one time here, the angle is still not that high. You know, it's a little bit of an angle. So you can say these documents are still similar. 
But when you have a Samsung document where iPhone is mentioned only once, Galaxy is four, you see the angle is much bigger. So when you have a bigger angle, you can say these documents are not similar. Meaning, if for yellow arrow, I know companies Apple. For blue arrow, I can say companies definitely not Apple. Versus if I have a green arrow where the theta two angle is much, you know, much closer to yellow arrow, you can say green and yellow documents are similar, whereas blue is not similar. Let's say the angle between these two is 17 degree. You can use this angle to define document similarity. One thing you can say is document similarities is 17 degree. Hmm. I mean, when I say that, it doesn't sound that obvious or intuitive. It's not a good way to represent similarity. What if I can present the similarity between some number between, let's say, zero to one? When I say document similarity is 0.9, which means they're 90% similar, you know, if the document similarity is one, they are very similar. If it is zero, they are very different. Okay, so how can you transform this degree angle into a range? Because that range seems like a good convention. You just take cosine. If you don't know about sine cosine. I made a video previously in the same series. Watch that. But cosine of 17 is 0.95. Now see, my convention is much more obvious, much more easier. I can say document similarity here between green and yellow vector is 0.95. Hooray. All right, now, cosine similarity is nothing but a cosine of an angle between the two vectors. And the great part about math is we looked at very simple scenario of two dimensional vector in real life. You will have 100 dimension vector, and these dimensions are basically the features. So your document might have 100 features and you can present those as a vector and you can still do the math. See, that's the beauty of math. 
You cannot visualize 100 dimension, but the math will continue to work. Okay, the academic formula is between a and b. These are the two vectors. And the cosine similarity between these two vectors is dot product, which is a dot b divided by magnitude of a magnitude of b, and a dot b is usually b cos theta. So you know it is cosine similarity is nothing but cos theta. All right, now if you have arrows pointing in the same direction, then the cosine similarity is one, which means those arrows, those vectors, those documents are quite similar. If they are at 90 degree means the similarity is zero, they are very different. And if it is 180 degree, similarity is minus one, which means they represent opposite concept. Now, cosine distance, a very simple concept, just it is used to represent the same thing. It's one minus cosine similarity. So here when two arrows are pointing in the same direction, when you're talking about same vector, the cosine distance will be zero. So when you say distance is zero, which means they are similar, they are closer. That's the only idea behind cosine distance. When you have two vectors at 90 degree cosine distance is one, which means they are very different. And cosine distances are represented only in a positive space. That's why I'm not talking about that 180 degree case that we saw in a previous slide. Let's write some Python code. Now we'll be using Python's sklearn module for importing cosine similarity method here. And when you have this method, it expects two vectors. And those two vectors are going to be the case that we looked at for apple document was three, one and six and two. So let's see what is the similarity between three, one and six and two. So three, one, six and two. Now this method expects two dimensional array. So you have to put one additional array here. I mean that's the signature of the method. You see similarity is one, which means the documents are very similar. 
And if you find a cosine distance of the same thing is going to be zero cosine distance. So this is two dimensional, okay, see one e raise two -16 this is very close to zero. Okay, now if you find out a distance between, let's say, these two documents, you know, 3132 see they are iPhone three times the galaxy one times, iPhone three times galaxy two times. Still they both are Apple documents. So there is a 0.96 per similarity. One similarity means they're very, very similar. Now let's look at some real documents. I'm going to create some variable variables here with financial document string. So you can see the first one is iPhone, Apple document, Apple document, Samsung. Samsung. Okay. And what you can do is you can create a pandas data frame. So I'm going to import pandas here and we create a pandas data frame and the data in the data frame, we are going to hard code some arrays. Okay, so what are those records? In the first document, you see, iPhone came one, two, three times, galaxies one time. Okay, so iPhone three times, galaxy one time. Similarly, I have pre counted the iPhone and galaxy word count in remaining documents. And that's, that's what it looks like now in the index, want to supply the document as an index. So instead of 0123, I will say, okay, doc one, doc two, doc three, and my data frame will look like this. So it says document one has iPhone three times, galaxy one times, and so on. You can clearly see, first two documents are Apple documents, and second and third documents are Samsung documents. Now, when you do something like this, it returns you first document, but it is two dimensional array or a data frame. And our cosine similarity function expects that data frame. And that's why we are doing this, you know, otherwise I would have done simply this. I want to compare, let's say document one and document two. So I would have done this, right. 
But see it, it expects two dimensional array and that's the only reason I am doing this kind of range. Okay, we are getting some error. Let's see. Oh yeah, this is a syntax error. Actually it has to be this way. See, they're very similar. .94 so we can say both are Apple documents. But if you do the same thing with, let's say, document one and document three, they are not that similar. See. .6 why .6 well, let's compare doc one and Doc three. See three, one and one three. There's some similarity, you know, but if you do doc three and four, let's say, then they will be more closer to one. See point 98. And now for the same documents, if I do cosine distance, it will be 0.01. So quotient distance is one minus similarity. So if you add these two numbers, they will add up to one. That's all pretty much I had. If you want to do the same thing in Tensorflow, then Tensorflow has this particular function that you can use. I hope you like this video. If you did, please share it with your friends. Thank you.
"""

# Prepare the input for LED. No "summarize: " task prefix is added: that is a
# T5 convention and LED would simply treat it as part of the document.
input_text = text.strip()

# Truncate at LED's 16384-token window instead of 512 tokens, so the whole
# transcript reaches the model, and keep the attention mask alongside the ids.
inputs = tokenizer(input_text, return_tensors="pt", max_length=16384, truncation=True)
input_ids = inputs["input_ids"]

# LED attends locally by default; for summarization, global attention is placed
# on the first token only. new_zeros keeps the dtype/device of input_ids.
global_attention_mask = input_ids.new_zeros(input_ids.shape)
global_attention_mask[:, 0] = 1

# Generate summary (max_length caps the length of the generated summary)
summary_ids = model.generate(
    input_ids,
    attention_mask=inputs["attention_mask"],
    global_attention_mask=global_attention_mask,
    max_length=250,
    num_beams=2,
    length_penalty=2.0,
    early_stopping=True,
)

# Decode the generated summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summary:", summary)
