In [1]:
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

In [2]:
import pandas as pd
import numpy as np
import speech_recognition as sr 
import os 
from pydub import AudioSegment
from pydub.silence import split_on_silence
import moviepy.editor as mp



In [3]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [4]:
from sklearn.pipeline import Pipeline
import socket

In [5]:
# a function that splits the audio file into chunks
# and applies speech recognition
def get_large_audio_transcription(path):
    """
    Splitting the large audio file into chunks
    and apply speech recognition on each of these chunks
    """
    r = sr.Recognizer()
    # open the audio file using pydub
    sound = AudioSegment.from_wav(path)  
    # split audio sound where silence is 500 miliseconds or more and get chunks
    chunks = split_on_silence(sound,
        # experiment with this value for your target audio file
        min_silence_len = 500,
        # adjust this per requirement
        silence_thresh = sound.dBFS-14,
        # keep the silence for 1 second, adjustable as well
        keep_silence=1000,
    )
    folder_name = "audio-chunks"
    # create a directory to store the audio chunks
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    # process each chunk 
    for i, audio_chunk in enumerate(chunks, start=1):
        # export audio chunk and save it in
        # the `folder_name` directory.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # recognize the chunk
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
            # try converting it to text
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                print("Error:", str(e))
            else:
                text = f"{text.capitalize()}. "
                print(chunk_filename, ":", text)
                whole_text += text
    # return the text for all chunks detected
    return whole_text

In [6]:
def step1():
    clip = mp.VideoFileClip(r"best.3gpp")
    clip.audio.write_audiofile(r"converted.wav")
    # create a speech recognition object
#     r = sr.Recognizer()
    path = "converted.wav"
    socket.getaddrinfo('localhost', 8080)
    text = get_large_audio_transcription(path)
    return text

In [7]:
def step2(text):
    model = TFAutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# T5 uses a max_length of 512 so we cut the article to 512 tokens.
    inputs = tokenizer("summarize: " + text, return_tensors="tf", max_length=512, truncation=True)
    outputs = model.generate(
        inputs['input_ids'], max_length=150, min_length=100, length_penalty=5.0, num_beams=2,early_stopping=True 
    )
    summary = tokenizer.decode(outputs[0])
    print(summary)
    
    str(tokenizer.decode(outputs[0]))
    
    displacy.render(nlp(str(text)), jupyter=True, style='ent')

In [8]:
# def step3()

In [9]:
# pipe = Pipeline([('step1' , step1),
#     ('step2',step2)])
# pipe_transform = pipe.fit_transform('rahul.3gpp')

In [10]:
text = step1()
# 

MoviePy - Writing audio in converted.wav


                                                                        

MoviePy - Done.
audio-chunks\chunk1.wav : The first end up on the light austria when i grow up i could combine together there i could see myself becoming a farm broken the factory or anything like that even though my parents wanted me to stay in a normal life but that was not mine by visually different types of the response to something special person in hindi for sending day when i went to school remember was 11 years old and ensure that are came entry about the meraka this documentary skyscrapers the high rise the huge threat to the 693 wise no of this stuff that's why i want to be around i want to be america. 
audio-chunks\chunk2.wav : I read that this is the blueprint. 
audio-chunks\chunk3.wav : This is exactly what i wanted to become a party building champion i want to get into movies in imo the big millions of which famous just like right path rate it helps that i know where i was going on i am your feet on the way i am going i know where i was going i am going to take a particle

In [11]:
step2(text)

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


</s><s>When i grow up i could combine together there i could see myself becoming a farm broken the factory or anything like that even though my parents wanted me to stay in a normal life but that was not mine by visually different types of the response to something special person in hindi for sending day. I read that this is the blueprint. This is exactly what i wanted to become a party building champion. I want to get into movies in imo the big millions of which famous just like right path rate it helps that i know where i was going on.
