In [1]:
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.preprocessing import FunctionTransformer

In [2]:
import pandas as pd
import numpy as np
import speech_recognition as sr 
import os 
from pydub import AudioSegment
from pydub.silence import split_on_silence
import moviepy.editor as mp



In [3]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [4]:
from sklearn.pipeline import Pipeline
import socket

In [5]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [6]:
# a function that splits the audio file into chunks
# and applies speech recognition
def get_large_audio_transcription(path):
    """
    Splitting the large audio file into chunks
    and apply speech recognition on each of these chunks
    """
    r = sr.Recognizer()
    # open the audio file using pydub
    sound = AudioSegment.from_wav(path)  
    # split audio sound where silence is 500 miliseconds or more and get chunks
    chunks = split_on_silence(sound,
        # experiment with this value for your target audio file
        min_silence_len = 500,
        # adjust this per requirement
        silence_thresh = sound.dBFS-14,
        # keep the silence for 1 second, adjustable as well
        keep_silence=500,
    )
    folder_name = "audio-chunks"
    # create a directory to store the audio chunks
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    # process each chunk 
    for i, audio_chunk in enumerate(chunks, start=1):
        # export audio chunk and save it in
        # the `folder_name` directory.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # recognize the chunk
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
            # try converting it to text
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                print("Error:", str(e))
            else:
                text = f"{text.capitalize()}. "
                print(chunk_filename, ":", text)
                whole_text += text
    # return the text for all chunks detected
    return whole_text

In [7]:
def step1(video):
    clip = mp.VideoFileClip(r"rahul.3gpp")
    clip.audio.write_audiofile(r"converted.wav")
    # create a speech recognition object
#     r = sr.Recognizer()
    path = "converted.wav"
    socket.getaddrinfo('localhost', 8080)
    text = get_large_audio_transcription(path)
    return text

In [8]:
def step2(t):
    model = TFAutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# T5 uses a max_length of 512 so we cut the article to 512 tokens.
    inputs = tokenizer("summarize: " + t, return_tensors="tf", max_length=512, truncation=True)
    outputs = model.generate(
        inputs['input_ids'], max_length=150, min_length=100, length_penalty=5.0, num_beams=4,early_stopping=True 
    )
    summary = tokenizer.decode(outputs[0])
    print(summary)
    
    str(tokenizer.decode(outputs[0]))
    
    displacy.render(nlp(str(t)), jupyter=True, style='ent')

In [9]:
# def step3(t):
#     tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
#     model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

#     nlp = pipeline("ner", model=model, tokenizer=tokenizer)
#     ner_results = nlp(t)
#     print(ner_results)

In [10]:
step1_transformer = FunctionTransformer(step1)
step2_transformer = FunctionTransformer(step2)

In [11]:
pipe = Pipeline([('step1' , step1_transformer),
    ('step2',step2_transformer)])
# pipe_transform = pipe.fit_transform('rahul.3gpp')

In [12]:
# text1 = step1()

In [13]:
# text2 = step2(text1)

In [14]:
# step3(text2)

In [16]:
pipe.fit_transform('rahul.3gpp')

MoviePy - Writing audio in converted.wav


                                                                                                                       

MoviePy - Done.
Error: 
audio-chunks\chunk2.wav : How to be principal. 
audio-chunks\chunk3.wav : Only. 
audio-chunks\chunk4.wav : Million ways to be fixed. 
audio-chunks\chunk5.wav : Each one of us has to find our own path. 
audio-chunks\chunk6.wav : Cute picture of boy born in a typical middle class family. 
audio-chunks\chunk7.wav : Who is that boy. 
audio-chunks\chunk8.wav : Some points in the early years. 
audio-chunks\chunk9.wav : Find me what are you doing want to do. 
audio-chunks\chunk10.wav : Telling myself in the household was no accident the first is a. 
audio-chunks\chunk11.wav : I know that sometimes inspiration disturb you in the face. 
audio-chunks\chunk12.wav : School days. 
audio-chunks\chunk13.wav : How is getting more into my own world of cricket. 
audio-chunks\chunk14.wav : Winning went to school tournament held back winning the world cup. 
audio-chunks\chunk15.wav : School taking for my greatest mission. 
audio-chunks\chunk16.wav : Alone sometimes it comes from th

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


</s><s>School taking for my greatest mission. My own reason to support along the way. Despite not company musically that might help me build on our future alone with my girlfriend. My mum is to remain focused on my eyes on the definition of it and it was then that i realise that you don't have to be number one in the world you just have tobe number one yourself reaching that it is the highest peak. Let the mountain climber and answers in the next mountain to climb on 7th august.


In [None]:
# text1