## module 1,2,3 combined

This will record continuous audio and transcribe it in a series of 10-second audio clips until a stop event occurs.

In [2]:
import pyaudio
import wave
import speech_recognition as sr
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from transformers import BertTokenizer, BertModel, BartForConditionalGeneration, BartTokenizer
import torch
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

# Download necessary NLTK data.
# 'punkt' / 'punkt_tab' back word_tokenize(): newer NLTK releases look up
# 'punkt_tab' rather than 'punkt', so both are requested to work on either.
# 'stopwords' backs stopwords.words("english").
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Recording parameters shared by the capture loop and the WAV writer below.
FORMAT = pyaudio.paInt16  # Sample format passed to PyAudio (16-bit integer samples)
CHANNELS = 1  # Number of input channels (mono)
RATE = 44100  # Sample rate (Hz)
CHUNK = 1024  # Frames read from the stream per stream.read() call
RECORD_SECONDS = 10  # Duration of the recording (seconds)
OUTPUT_FILENAME = "recorded_audio.wav"  # WAV file the captured audio is saved to

# Pipeline: record a clip -> save as WAV -> transcribe -> pick frequent words
# -> filter them with BERT against a keyword dataset -> fetch the Wikipedia
# page of each surviving keyword -> summarise it with BART.


def _record_frames(seconds):
    """Capture ``seconds`` of microphone audio.

    Returns:
        A tuple ``(frames, sample_width)`` where ``frames`` is the list of raw
        byte chunks read from the input stream and ``sample_width`` is the
        size in bytes of one sample in ``FORMAT``.

    The stream and the PyAudio instance are always released, even when
    opening or reading the stream raises.
    """
    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            frames_per_buffer=CHUNK)
        try:
            print("Recording...")
            frames = [stream.read(CHUNK)
                      for _ in range(int(RATE / CHUNK * seconds))]
            print("Recording finished.")
        finally:
            stream.stop_stream()
            stream.close()
        return frames, audio.get_sample_size(FORMAT)
    finally:
        audio.terminate()


def _write_wav(path, frames, sample_width):
    """Write the recorded ``frames`` to ``path`` as a WAV file."""
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


def _transcribe(path):
    """Transcribe the WAV file at ``path`` with ``recognize_google``.

    Returns:
        The recognised text, or ``None`` when the audio could not be
        understood or the recognition request failed (a message is printed
        in both cases).
    """
    r = sr.Recognizer()
    with sr.AudioFile(path) as source:
        audio_data = r.record(source)  # Read the entire audio file
        try:
            return r.recognize_google(audio_data)
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand the audio")
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
    return None


def extract_keywords_from_tokens(text, model, tokenizer, num_keywords=5):
    """Pick the tokens of ``text`` most similar to its [CLS] embedding.

    Each token's last-layer hidden state is scored by its dot product with
    the [CLS] token's hidden state; the ``num_keywords`` best-scoring tokens
    are kept if they are not ``[CLS]`` itself and appear in the module-level
    ``valid_keywords`` set (which must be defined before this is called).

    Args:
        text: Input text to tokenize.
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        num_keywords: Maximum number of top-scoring tokens to consider.

    Returns:
        A list of token strings, possibly empty.
    """
    # Truncate so long inputs cannot exceed the 512 positions BERT supports.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Single-sequence batch: drop the batch axis explicitly.
    token_embeddings = last_hidden_states[0]
    cls_embedding = token_embeddings[0]

    similarities = torch.matmul(token_embeddings, cls_embedding)

    # topk() raises if k exceeds the number of tokens, so clamp it.
    k = min(num_keywords, similarities.size(0))
    top_indices = similarities.topk(k).indices.tolist()

    return [tokens[i] for i in top_indices
            if tokens[i] != '[CLS]' and tokens[i] in valid_keywords]


def _fetch_wikipedia_text(keyword, timeout=10):
    """Return the paragraph text of the English Wikipedia page for ``keyword``.

    Returns an empty string (after printing a message) when the request
    fails or the server answers with an error status, so one bad keyword
    does not abort the remaining ones.
    """
    from urllib.parse import quote  # local import: only needed here

    page_title = quote(keyword, safe="")
    search_url = f"https://en.wikipedia.org/wiki/{page_title}"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    try:
        response = requests.get(search_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Could not fetch {search_url}: {e}")
        return ""

    soup = BeautifulSoup(response.content, "html.parser")
    return " ".join(p.get_text() for p in soup.find_all("p")).strip()


def generate_summary(text, model, tokenizer):
    """Summarise ``text`` with a seq2seq ``model`` using 4-beam search.

    The input is truncated to 1024 tokens and the generated summary is
    limited to 150 tokens. Returns the decoded summary string.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=1024)
    summary_ids = model.generate(inputs['input_ids'],
                                 attention_mask=inputs['attention_mask'],
                                 num_beams=4, max_length=150, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


try:
    # --- Record and save -------------------------------------------------
    frames, _sample_width = _record_frames(RECORD_SECONDS)
    _write_wav(OUTPUT_FILENAME, frames, _sample_width)
    print(f"Audio recorded and saved as {OUTPUT_FILENAME}")

    # --- Speech recognition ----------------------------------------------
    text = _transcribe(OUTPUT_FILENAME)

    # --- Text processing -------------------------------------------------
    # Only runs when this recording produced a transcription; otherwise the
    # later stages would work on a stale or missing transcription file.
    keywords = []
    if text:
        print("Transcription: " + text)
        # Save the transcription (overwrites any previous run's file).
        with open("transcription.txt", "w", encoding="utf-8") as f:
            f.write(text)

        # Tokenize, drop punctuation, lowercase
        words = [word.lower() for word in word_tokenize(text) if word.isalnum()]

        # Remove stop words
        stop_words = set(stopwords.words("english"))
        filtered_words = [word for word in words if word not in stop_words]

        # Select the top N most frequent words (adjust N as needed)
        word_freq = Counter(filtered_words)
        N = 10
        keywords = word_freq.most_common(N)

        print("Top keywords:")
        for keyword, freq in keywords:
            print(f"{keyword}: {freq}")

        # One keyword per line; the same text is fed to BERT below.
        keyword_text = "".join(f"{keyword}\n" for keyword, _ in keywords)
        with open("keywords.txt", "w", encoding="utf-8") as file:
            file.write(keyword_text)

    # --- BERT filtering, Wikipedia lookup, BART summary --------------------
    if keywords:
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Loading the dataset of keywords
        datapath = r"C:\Users\Lenovo\Documents\Rohit_AI_ML\SummariseIT\dataset.csv"
        df = pd.read_csv(datapath)
        # Flatten every column into one set of valid keywords. astype(str)
        # keeps .str.strip() from failing on non-string columns.
        # NOTE(review): tokens from the uncased tokenizer are lowercase while
        # these entries keep their original case - confirm the dataset is
        # lowercase, otherwise nothing will match.
        valid_keywords = set()
        for column in df.columns:
            valid_keywords.update(df[column].dropna().astype(str).str.strip().tolist())

        extracted_keywords = extract_keywords_from_tokens(keyword_text, bert_model, bert_tokenizer)

        print("Extracted keywords:")
        if extracted_keywords:
            # Load BART once instead of once per keyword.
            bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
            bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

        for idx, keyword in enumerate(extracted_keywords, start=1):
            print(f"Keyword {idx}: {keyword}")

            extracted_text = _fetch_wikipedia_text(keyword)
            if not extracted_text:
                print(f"No Wikipedia text found for {keyword}; skipping summary.")
                continue

            summary = generate_summary(extracted_text, bart_model, bart_tokenizer)
            print(f"Summary of {keyword}:")
            print(summary)

except OSError as e:
    print(f"OSError encountered: {e}")


ModuleNotFoundError: No module named 'pyaudio'