## Importing the necessary libraries

In [16]:
import speech_recognition as sr
import pyttsx3

### Initializing The Recognizer

In [17]:
# Shared Recognizer instance; the recording cell below calls r.record() and
# r.recognize_google() on it.
r = sr.Recognizer()

## Now we are going to record the audio and convert this audio file into a text file

In [18]:
import pyaudio
import wave

# Recording parameters
FORMAT = pyaudio.paInt16  # Audio format
CHANNELS = 1  # Number of channels
RATE = 44100  # Sample rate (Hz)
CHUNK = 1024  # Chunk size (number of frames per buffer)
RECORD_SECONDS = 10  # Duration of the recording (seconds)
OUTPUT_FILENAME = "recorded_audio.wav"  # Output filename

try:
    # Initialize PyAudio. It gets its own name so it is not confused with the
    # AudioData object that `audio` holds once the file has been read back.
    recorder = pyaudio.PyAudio()
    try:
        # Open the input stream
        stream = recorder.open(format=FORMAT, channels=CHANNELS,
                               rate=RATE, input=True,
                               frames_per_buffer=CHUNK)
        try:
            print("Recording...")

            frames = []

            # Record data, one CHUNK-sized buffer at a time
            for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
                frames.append(stream.read(CHUNK))

            print("Recording finished.")
        finally:
            # Always stop and close the stream, even if a read fails part-way
            stream.stop_stream()
            stream.close()

        # Look up the sample width while the PyAudio instance is still alive
        sample_width = recorder.get_sample_size(FORMAT)
    finally:
        # Always release PyAudio, even if opening or reading the stream failed
        recorder.terminate()

    # Save the recorded data as a WAV file
    with wave.open(OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

    print(f"Audio recorded and saved as {OUTPUT_FILENAME}")
    with sr.AudioFile(OUTPUT_FILENAME) as source:
        audio = r.record(source)  # Read the entire audio file

        try:
            # Recognize the speech using Google Web Speech API
            text = r.recognize_google(audio)
            print("Transcription: " + text)
            # Save the transcription, replacing any previous contents. The file
            # therefore holds only the latest transcription, which is what the
            # earlier append-then-overwrite sequence effectively produced.
            with open("transcription.txt", "w") as f:
                f.write(text)

        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand the audio")
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
except OSError as e:
    # Covers audio-device failures as well as file I/O errors raised above
    print(f"OSError encountered: {e}")

Recording...
Recording finished.
Audio recorded and saved as recorded_audio.wav
Transcription: kya aap Humse Pyar Karte Hain DU u love MI no main tumse pyar nahin karta Nahin main tumse pyar nahin karta no


## Checking Stop Words and removing them from the Transcription file

## Tokenization

 This `tokenizer` divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences. It must be trained on a large collection of plaintext in the target language before it can be used.

`Stop words` are commonly used words (such as “the”, “a”, “an”, or “in”) that a `Search Engine` has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

`Script` to extract the keywords from the transcription file

In [14]:
import nltk
# Fetch the NLTK data packages used by the keyword-extraction cell:
# 'punkt_tab' (tokenizer data) and 'stopwords' (read there via
# stopwords.words("english")). Each call is a no-op when the package is current.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Load the transcription text from disk
with open("transcription.txt", "r") as file:
    text = file.read()

# Tokenize, keep only alphanumeric tokens, and normalise them to lowercase
words = [token.lower() for token in word_tokenize(text) if token.isalnum()]

# Discard English stop words
stop_words = set(stopwords.words("english"))
filtered_words = list(filter(lambda word: word not in stop_words, words))

# Tally how often each remaining word occurs
word_freq = Counter(filtered_words)

# Keep the N most frequent words (adjust N as needed)
N = 10
keywords = word_freq.most_common(N)

# Format each entry once so the console and the file show identical lines
keyword_lines = [f"{keyword}: {freq}" for keyword, freq in keywords]

# Print the keywords
print("Top keywords:")
for line in keyword_lines:
    print(line)

# Save the keywords to a text file, one "word: count" entry per line
with open("keywords.txt", "w") as file:
    file.writelines(line + "\n" for line in keyword_lines)


Top keywords:
pyar: 3
nahin: 3
main: 2
tumse: 2
karta: 2
kya: 1
aap: 1
humse: 1
karte: 1
hain: 1


# Work for tomorrow: categorize the keywords

In [None]:
# start here

## This is used to generate a dataset for training the module

In [27]:
import csv
import random
import string

# Generate random words
def generate_word():
    """Return a random word made of 3 to 10 lowercase ASCII letters.

    The length is drawn with ``random.randint(3, 10)`` (both ends inclusive)
    and each letter is drawn independently from ``string.ascii_lowercase``,
    so letters may repeat. Requires the ``string`` module to be imported
    alongside ``random``.
    """
    length = random.randint(3, 10)
    return ''.join(random.choices(string.ascii_lowercase, k=length))

# Define the number of categories and entries per category.
# generate_dataset() writes one CSV column per category and one data row per
# entry, so these values give 100 columns and 10000 data rows plus a header row.
num_categories = 100
entries_per_category = 10000

# Create a list of unique words for a category by sampling a base list without repetition
def generate_unique_words(base_list, num_entries):
    """Return up to ``num_entries`` words sampled from ``base_list`` without replacement.

    Args:
        base_list: Sequence of candidate words. It is not modified.
        num_entries: Number of words requested.

    Returns:
        A new list in random order. When ``num_entries`` exceeds
        ``len(base_list)`` the request is clamped, so the result is a shuffled
        copy of the whole base list rather than a ``ValueError`` from
        ``random.sample``.

    Raises:
        ValueError: If ``num_entries`` is negative (raised by ``random.sample``).
    """
    # A sample without replacement cannot be larger than its population
    sample_size = min(num_entries, len(base_list))
    return random.sample(base_list, sample_size)

# Base word lists for different themes.
# generate_dataset() draws from one of these only when the theme name
# ('Technology', 'Physics', 'Biology' or 'Finance') appears in a category name.
# Note: the technology list has 11 entries; the other lists have 10 each.
base_words_technology = ['Algorithm', 'Processor', 'Network', 'Software', 'Database', 'API', 'Cloud', 'AI', 'MachineLearning', 'BigData', 'Cybersecurity']
base_words_physics = ['Quantum', 'Particle', 'Relativity', 'Force', 'Energy', 'Atom', 'Wave', 'Photon', 'Neutron', 'Electron']
base_words_biology = ['Gene', 'Cell', 'Evolution', 'Ecosystem', 'Mutation', 'Protein', 'Enzyme', 'DNA', 'RNA', 'Genome']
base_words_finance = ['Stock', 'Bond', 'Investment', 'Interest', 'Inflation', 'Market', 'Budget', 'Asset', 'Liability', 'Dividend']

# Helper: map a category name to its themed base word list
def _theme_base_words(category):
    """Return the base word list whose theme name occurs in ``category``, else None.

    Themes are checked in the order Technology, Physics, Biology, Finance. The
    module-level lists are only looked up when their theme actually matches.
    """
    if 'Technology' in category:
        return base_words_technology
    if 'Physics' in category:
        return base_words_physics
    if 'Biology' in category:
        return base_words_biology
    if 'Finance' in category:
        return base_words_finance
    return None


# Function to generate the dataset
def generate_dataset(num_categories, entries_per_category, output_path='large_dataset.csv'):
    """Write a CSV with one column per category and one row per entry.

    The header row is ``Category_1`` .. ``Category_<num_categories>``. A column
    whose name contains a theme name ('Technology', 'Physics', 'Biology',
    'Finance') is filled by cycling through a shuffled copy of that theme's
    base word list; every other column gets the placeholder ``Word_<row>``
    (1-based row number).

    NOTE: with the ``Category_<n>`` names generated here no theme name ever
    matches, so every cell currently holds the ``Word_<row>`` placeholder.

    Args:
        num_categories: Number of columns to write.
        entries_per_category: Number of data rows to write (header excluded).
        output_path: Destination file; defaults to 'large_dataset.csv' in the
            working directory. An existing file is overwritten.
    """
    # Generate header with category names
    categories = [f'Category_{i+1}' for i in range(num_categories)]

    # Build one word pool per column up front instead of re-sampling for every
    # cell. The sample size is the base list's own length, which random.sample
    # can always satisfy (asking for entries_per_category words could not be).
    word_pools = []
    for category in categories:
        base_words = _theme_base_words(category)
        if base_words:
            word_pools.append(generate_unique_words(base_words, len(base_words)))
        else:
            # No theme matched (or the theme list is empty): use the placeholder
            word_pools.append(None)

    with open(output_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(categories)

        # Generate rows: themed columns cycle through their pool, others get Word_<row>
        for i in range(entries_per_category):
            placeholder = f'Word_{i+1}'
            writer.writerow([
                pool[i % len(pool)] if pool else placeholder
                for pool in word_pools
            ])

# Generate the large CSV file (written to 'large_dataset.csv' in the working
# directory, overwriting any existing file of that name)
generate_dataset(num_categories, entries_per_category)
