<a href="https://colab.research.google.com/github/Guhan2348519/SPR_labs/blob/main/2348519_SPR_lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Record audio using PyAudio
def record_audio(filename="output.wav", duration=5):
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    RECORD_SECONDS = duration
    WAVE_OUTPUT_FILENAME = filename

    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("Speak something...")  # Prompt the user to start speaking

    frames = []

    for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("Recording finished.")  # Inform the user recording has ended

    stream.stop_stream()
    stream.close()
    p.terminate()

    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

In [None]:
# Preprocess audio using Librosa (extract MFCC features)
def preprocess_audio(filename):
    print("Recognizing...")  # Indicate that recognition is in progress
    try:
        audio, sr = librosa.load(filename, sr=16000)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        # Visualize MFCCs
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(mfccs, sr=sr, x_axis='time')
        plt.colorbar()
        plt.title('MFCC')
        plt.tight_layout()
        plt.show()

        return mfccs
    except Exception as e:
        print("Error during recognition:", str(e))
        return None

In [None]:
import pyaudio
import wave
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tkinter import Tk, filedialog

In [None]:

# Recognize speech using a pre-trained KNN model
def recognize_speech(mfccs, knn):
    try:
        # Predict the speech-to-text conversion from the MFCCs
        predicted_text = knn.predict([mfccs.mean(axis=1)])  # Example prediction logic
        print(f"Speech recognized: '{predicted_text[0]}'")
        print("Speech successfully converted to text!")
    except Exception as e:
        print(f"Error in recognizing speech: {e}")
        print("Speech Recognition could not understand audio. Please try speaking more clearly.")

In [None]:
def train_model():
    # Simulating the training process
    X = np.random.rand(100, 13)  # Example MFCC features
    y = np.array(["hello", "lights", "off", "on"] * 25)  # Example labels

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)

    # Test the model
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Training complete. Model accuracy: {accuracy * 100:.2f}%")

    return knn

In [None]:
# Function to upload an audio file
def upload_audio_file():
    root = Tk()
    root.withdraw()  # Hide the main window
    filename = filedialog.askopenfilename(title="Select an Audio File", filetypes=[("Audio Files", "*.wav *.mp3")])
    return filename

def main():
    # Train the model (for demonstration purposes)
    knn = train_model()

    # Provide user with two options
    print("Choose an option:")
    print("1. Record speech using microphone")
    print("2. Upload an audio file")

    choice = input("Enter 1 or 2: ")

    if choice == '1':
        # Record audio via microphone
        record_audio()
        filename = "output.wav"  # Recorded audio will be saved as output.wav
    elif choice == '2':
        # Upload audio file
        filename = upload_audio_file()
        if not filename:
            print("No file selected. Exiting.")
            return
    else:
        print("Invalid choice. Exiting.")
        return

    # Preprocess audio and extract MFCCs
    mfccs = preprocess_audio(filename)

    # Recognize speech based on MFCCs
    if mfccs is not None:
        recognize_speech(mfccs, knn)

if __name__ == "__main__":
    main()

In [4]:

import os
import librosa

# Define the path to the downloaded dataset
train_audio_path = '/root/.cache/kagglehub/datasets/antfilatov/mini-speech-commands/versions/1/mini_speech_commands/'  # Adjust this path if needed

# Define the labels (commands) based on the dataset structure
labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'stop', 'go']  # Modify based on your dataset

# Initialize lists to store all wave samples and labels
all_wave = []
all_label = []

# Loop through each label to load and process audio files
for label in labels:
    print(f'Processing label: {label}')
    # List all .wav files in the label's directory
    waves = [f for f in os.listdir(os.path.join(train_audio_path, label)) if f.endswith('.wav')]

    for wav in waves:
        # Load the audio file
        samples, sample_rate = librosa.load(os.path.join(train_audio_path, label, wav), sr=16000)

        # Resample the audio to 8000 Hz
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)

        # Check if the length of samples is 8000
        if len(samples) == 8000:
            all_wave.append(samples)
            all_label.append(label)
        else:
            print(f'Skipped {wav}: Length is {len(samples)} (expected 8000)')

# Optionally, check how many samples were collected for each label
print(f'Total samples collected: {len(all_wave)}')

Processing label: yes
Skipped b36c27c2_nohash_0.wav: Length is 7168 (expected 8000)
Skipped 5eb5fc74_nohash_1.wav: Length is 7168 (expected 8000)
Skipped b5aacf2c_nohash_0.wav: Length is 6486 (expected 8000)
Skipped 41285056_nohash_2.wav: Length is 7510 (expected 8000)
Skipped 3ac2e76f_nohash_0.wav: Length is 7059 (expected 8000)
Skipped 4f781a59_nohash_0.wav: Length is 7431 (expected 8000)
Skipped 52bfbce8_nohash_0.wav: Length is 6144 (expected 8000)
Skipped 6f5b4d3d_nohash_0.wav: Length is 5202 (expected 8000)
Skipped e98cb283_nohash_1.wav: Length is 4459 (expected 8000)
Skipped 748cb308_nohash_2.wav: Length is 7431 (expected 8000)
Skipped e4200516_nohash_0.wav: Length is 6316 (expected 8000)
Skipped 434a267c_nohash_0.wav: Length is 4438 (expected 8000)
Skipped 84999496_nohash_2.wav: Length is 6688 (expected 8000)
Skipped cb164eea_nohash_0.wav: Length is 7168 (expected 8000)
Skipped 3ab9ba07_nohash_0.wav: Length is 7851 (expected 8000)
Skipped dbb7723a_nohash_0.wav: Length is 6144 (e

In [None]:
# Initialize lists to store all wave samples and labels
all_wave = []
all_label = []

# Loop through each label to load and process audio files
for label in labels:
    print(f'Processing label: {label}')
    waves = [f for f in os.listdir(os.path.join(train_audio_path, label)) if f.endswith('.wav')]

    for wav in waves:
        # Load the audio file
        samples, sample_rate = librosa.load(os.path.join(train_audio_path, label, wav), sr=16000)

        # Resample the audio to 8000 Hz
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)

        # Check if the length of samples is 8000
        if len(samples) == 8000:
            all_wave.append(samples)
            all_label.append(label)
        else:
            print(f'Skipped {wav}: Length is {len(samples)} (expected 8000)')

# Encode labels
le = LabelEncoder()
y = le.fit_transform(all_label)

# Reshape all_wave to fit Conv1D input requirements
all_wave = np.array(all_wave).reshape(-1, 8000, 1)

# Split into train and validation sets
x_tr, x_val, y_tr, y_val = train_test_split(all_wave, y, stratify=y, test_size=0.2, random_state=777, shuffle=True)

# Print shapes of the training and validation sets
print(f'Training data shape: {x_tr.shape}, Validation data shape: {x_val.shape}')
print(f'Training labels shape: {y_tr.shape}, Validation labels shape: {y_val.shape}')

# Define the model architecture using Conv1D
model = keras.Sequential([
    layers.Conv1D(32, kernel_size=5, activation='relu', input_shape=(8000, 1)),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(64, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(128, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(labels), activation='softmax')  # Output layer for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

# Train the model
history = model.fit(x_tr, y_tr, epochs=10, batch_size=32, validation_data=(x_val, y_val))

Processing label: yes
Skipped b36c27c2_nohash_0.wav: Length is 7168 (expected 8000)
Skipped 5eb5fc74_nohash_1.wav: Length is 7168 (expected 8000)
Skipped b5aacf2c_nohash_0.wav: Length is 6486 (expected 8000)
Skipped 41285056_nohash_2.wav: Length is 7510 (expected 8000)
Skipped 3ac2e76f_nohash_0.wav: Length is 7059 (expected 8000)
Skipped 4f781a59_nohash_0.wav: Length is 7431 (expected 8000)
Skipped 52bfbce8_nohash_0.wav: Length is 6144 (expected 8000)
Skipped 6f5b4d3d_nohash_0.wav: Length is 5202 (expected 8000)
Skipped e98cb283_nohash_1.wav: Length is 4459 (expected 8000)
Skipped 748cb308_nohash_2.wav: Length is 7431 (expected 8000)
Skipped e4200516_nohash_0.wav: Length is 6316 (expected 8000)
Skipped 434a267c_nohash_0.wav: Length is 4438 (expected 8000)
Skipped 84999496_nohash_2.wav: Length is 6688 (expected 8000)
Skipped cb164eea_nohash_0.wav: Length is 7168 (expected 8000)
Skipped 3ab9ba07_nohash_0.wav: Length is 7851 (expected 8000)
Skipped dbb7723a_nohash_0.wav: Length is 6144 (e

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m  7/180[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:47[0m 2s/step - accuracy: 0.1030 - loss: 2.1149