In [1]:
# pip install yt-dlp moviepy pydub nltk rake-nltk scikit-learn numpy

# Speech Recognition

In [15]:
# sudo apt update
# sudo apt install ffmpeg
# ffmpeg -version
# ffprobe -version

In [9]:
import yt_dlp as youtube_dl
from moviepy.editor import AudioFileClip

# Download YouTube video
ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': 'audio.%(ext)s',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }],
}
url = "https://www.youtube.com/watch?v=X8MZWCGgIb8"

with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([url])

# Extract Audio
audio_path = 'audio_output.wav'
audio_clip = AudioFileClip("audio.wav")
audio_clip.write_audiofile(audio_path)


[youtube] Extracting URL: https://www.youtube.com/watch?v=X8MZWCGgIb8
[youtube] X8MZWCGgIb8: Downloading webpage
[youtube] X8MZWCGgIb8: Downloading ios player API JSON
[youtube] X8MZWCGgIb8: Downloading web creator player API JSON
[youtube] X8MZWCGgIb8: Downloading m3u8 information
[info] X8MZWCGgIb8: Downloading 1 format(s): 251
[download] Destination: audio.webm
[download] 100% of    5.32MiB in 00:00:00 at 10.39MiB/s  
[ExtractAudio] Destination: audio.wav
Deleting original file audio.webm (pass -k to keep)
MoviePy - Writing audio in audio_output.wav


                                                                      

MoviePy - Done.


In [15]:
import time
import speech_recognition as sr

def recognize_speech_with_retry(audio_file, retries=3, delay=5):
    recognizer = sr.Recognizer()
    for attempt in range(retries):
        try:
            with sr.AudioFile(audio_file) as source:
                audio = recognizer.record(source)
                text = recognizer.recognize_google(audio)
                return text
        except sr.RequestError as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise
        except Exception as e:
            print(f"An error occurred: {e}")
            break
    return None

In [16]:
from pydub import AudioSegment
import os

def split_audio(file_path, chunk_length_ms=60000):
    audio = AudioSegment.from_wav(file_path)
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    chunk_files = []
    
    for i, chunk in enumerate(chunks):
        chunk_filename = f"chunk_{i}.wav"
        chunk.export(chunk_filename, format="wav")
        chunk_files.append(chunk_filename)
    
    return chunk_files

# Split the audio into chunks
chunks = split_audio("audio_output.wav")

# Process each chunk individually
all_text = []
for chunk in chunks:
    text = recognize_speech_with_retry(chunk)
    if text:
        all_text.append(text)
    os.remove(chunk)  # Clean up the chunk file after processing

final_text = " ".join(all_text)
print("Extracted Text:", final_text)


Extracted Text: two computer engineers and best friends to decide to play the game of flip the coin on tapes computer with the computer being the opponent and the computer plays the first move but it doesn't know what it was not true what is the quantum computer and quantum computers are advanced machines inspired by Quantum Physics study of the behaviour of atoms and particles so quantum computers operate by studying in controlling behaviour of these particles within away completely different computers or even supercomputers it is an upgraded and not exactly the next generation questions because whether you choose to flip the coin or not the outcome of still be there between both possibilities just like a mixture of lemon juice and water lemon juice is very little messages from one location to another it can be difficult for people without condom and certainly this type of unique and Unbreakable corruption is already tested by banks and companies like JP morgan's etc can be used in th

# Text Processing

In [18]:
import nltk
import re

nltk.download('punkt')
nltk.download('stopwords')

# Text Preprocessing
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    tokens = nltk.word_tokenize(text)  # Tokenization
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word not in stopwords]  # Remove stopwords
    return " ".join(tokens)

processed_text = preprocess_text(text)
print("Processed Text:", processed_text)

Processed Text: quantum computers future upcoming generation work still taking technology business world companies like google microsoft competing build quantum computing tools progress benefits quantum computation realised long search large


[nltk_data] Downloading package punkt to /home/jarin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jarin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Keyword Extraction

In [19]:
from rake_nltk import Rake

# Initialize RAKE
r = Rake()

# Extract keywords from the text
r.extract_keywords_from_text(processed_text)

# Get ranked phrases
keywords = r.get_ranked_phrases()

print("Keywords:", keywords)


Keywords: ['quantum computers future upcoming generation work still taking technology business world companies like google microsoft competing build quantum computing tools progress benefits quantum computation realised long search large']


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [processed_text]

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Get feature names and scores
feature_names = vectorizer.get_feature_names_out()
scores = X.T.sum(axis=1).A1

# Create a dictionary of keywords and their scores
keywords = dict(zip(feature_names, scores))

# Sort and print keywords
sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)
print("Keywords with Scores:", sorted_keywords)


Keywords with Scores: [('quantum', 0.5222329678670935), ('benefits', 0.17407765595569785), ('build', 0.17407765595569785), ('business', 0.17407765595569785), ('companies', 0.17407765595569785), ('competing', 0.17407765595569785), ('computation', 0.17407765595569785), ('computers', 0.17407765595569785), ('computing', 0.17407765595569785), ('future', 0.17407765595569785), ('generation', 0.17407765595569785), ('google', 0.17407765595569785), ('large', 0.17407765595569785), ('like', 0.17407765595569785), ('long', 0.17407765595569785), ('microsoft', 0.17407765595569785), ('progress', 0.17407765595569785), ('realised', 0.17407765595569785), ('search', 0.17407765595569785), ('taking', 0.17407765595569785), ('technology', 0.17407765595569785), ('tools', 0.17407765595569785), ('upcoming', 0.17407765595569785), ('work', 0.17407765595569785), ('world', 0.17407765595569785)]


# Text Classification

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups

# Example: Using 20 Newsgroups Dataset for Classification
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)

# Train a simple model
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

model = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
model.fit(X_train, y_train)
predicted_category = model.predict([processed_text])

print("Predicted Category:", newsgroups.target_names[predicted_category[0]])

Predicted Category: comp.graphics


# Vectorization Techniques

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize Text
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([processed_text] + newsgroups.data)

# Cosine Similarity
cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
similarities = cos_sim[0][1:]  # Exclude the first comparison with itself

print("Cosine Similarities:", similarities)


Cosine Similarities: [0.00235485 0.         0.01736099 ... 0.01278517 0.00950991 0.        ]


# Recommendation System

In [23]:
import numpy as np

# Recommend Docs Based on Similarity
top_indices = np.argsort(similarities)[-5:][::-1]  # Get top 5 most similar
recommended_videos = [(newsgroups.data[i], similarities[i]) for i in top_indices]

print("Recommended Videos:")
for video, sim in recommended_videos:
    print(f"Similarity: {sim}\nDoc Text: {video[:100]}...\n")

Recommended Videos:
Similarity: 0.08310863269725427
Doc Text: From: danj@welchgate.welch.jhu.edu (Dan Jacobson)
Subject: Re: Is there an FTP achive for USGS terra...

Similarity: 0.07108841793384753
Doc Text: From: peterbak@microsoft.com (Peter Bako)
Subject: JPEG file format?
Organization: Microsoft Corp.
L...

Similarity: 0.06337028515567467
Doc Text: From: mangoe@cs.umd.edu (Charley Wingate)
Subject: Re: Yeah, Right
Lines: 30

Benedikt Rosenau write...

Similarity: 0.05237382670904888
Doc Text: From: ingles@engin.umich.edu (Ray Ingles)
Subject: Re: Yeah, Right
Organization: University of Michi...

Similarity: 0.051638653385994784
Doc Text: From: jenk@microsoft.com (Jen Kilmer)
Subject: Re: sex education
Organization: Microsoft Corporation...

