In [16]:
import os
import numpy as np
import librosa
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Function to extract MFCC features from audio files
def extract_mfcc_features(file_path, sr=16000, n_mfcc=13, mfcc_max_pad_len=100):
    """Load an audio file and return its MFCCs as a fixed-width, flattened vector.

    The MFCC matrix is truncated or right-padded along its second axis so it
    has exactly ``mfcc_max_pad_len`` columns, then flattened to 1-D.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        sr: Sample rate passed to both ``librosa.load`` and the MFCC call.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        mfcc_max_pad_len: Number of columns kept after padding/truncation.

    Returns:
        The flattened MFCC array, or ``None`` when any step raised. The
        exception is printed rather than propagated.
    """
    try:
        signal, _ = librosa.load(file_path, sr=sr)
        coeffs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
        n_columns = coeffs.shape[1]
        if n_columns <= mfcc_max_pad_len:
            # Short (or exact) clip: append constant-mode padding on the right.
            missing = mfcc_max_pad_len - n_columns
            coeffs = np.pad(coeffs, pad_width=((0, 0), (0, missing)), mode='constant')
        else:
            # Long clip: keep only the leading columns.
            coeffs = coeffs[:, :mfcc_max_pad_len]
        return coeffs.flatten()
    except Exception as e:
        # Best-effort: report the problem and let the caller skip this file.
        print(f"Error loading {file_path}: {str(e)}")
        return None

# Function to load data from a folder containing audio files
def load_data_from_folder(folder_path, extensions=('.mp3',)):
    """Build a feature matrix from the audio files directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).
        extensions: File-name suffix, or iterable of suffixes, to accept.
            Matching is case-insensitive. Defaults to ``('.mp3',)``.

    Returns:
        ``np.array`` built from the non-``None`` results of
        ``extract_mfcc_features``, one entry per accepted file, in sorted
        file-name order. The array is empty when nothing matched or every
        file failed to load.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded downstream clustering) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(suffixes):
            continue
        features = extract_mfcc_features(os.path.join(folder_path, file_name))
        if features is not None:
            data.append(features)
    return np.array(data)

# Load data
# The clips folder is a machine-specific absolute path. Set the
# AUDIO_CLIPS_DIR environment variable to point at another location without
# editing the notebook; the original path remains the default.
audio_folder = os.environ.get(
    'AUDIO_CLIPS_DIR',
    r'C:\Users\gowtham.veepujerla\Downloads\Text to audio\data\cv-corpus-11.0-delta-2022-09-21\en\clips',
)
# X holds one flattened MFCC vector per clip that loaded successfully.
X = load_data_from_folder(audio_folder)


In [15]:
# Cluster the feature matrix X with K-Means, choosing the number of clusters
# that maximises the mean silhouette score. X must come from the data-loading
# cell, so run that cell first on a fresh kernel.
if len(X) == 0:
    print("No audio files found in the specified folder.")
elif len(X) < 3:
    # The silhouette score needs at least 2 clusters and strictly more
    # samples than clusters, so fewer than 3 samples cannot be scored.
    print(f"Need at least 3 samples to cluster, found {len(X)}.")
else:
    # Perform dimensionality reduction if needed
    # pca = PCA(n_components=50)
    # X_pca = pca.fit_transform(X)

    # Candidate cluster counts: 2..10, capped at n_samples - 1 so that both
    # KMeans (k <= n_samples) and silhouette_score (k <= n_samples - 1) accept
    # every candidate on small datasets.
    candidate_cluster_counts = range(2, min(10, len(X) - 1) + 1)

    # Perform clustering with a range of cluster numbers
    silhouette_scores = []
    for num_clusters in candidate_cluster_counts:
        # n_init is pinned because its default differs between scikit-learn
        # releases; together with random_state this keeps runs reproducible.
        kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
        clusters = kmeans.fit_predict(X)
        silhouette_avg = silhouette_score(X, clusters)
        silhouette_scores.append(silhouette_avg)

    # Find the optimal number of clusters (first candidate with the top score)
    optimal_num_clusters = candidate_cluster_counts[int(np.argmax(silhouette_scores))]

    # Perform clustering with the optimal number of clusters
    kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X)


No audio files found in the specified folder.


In [1]:
import os
import requests
import tarfile

In [2]:
# Define the URL of the LibriSpeech dataset
url = "https://www.openslr.org/resources/12/train-clean-100.tar.gz"

# Define the directory to save the downloaded data.
# The default is a machine-specific absolute path; set the
# LIBRISPEECH_DOWNLOAD_DIR environment variable to use another location
# without editing the notebook.
download_dir = os.environ.get(
    "LIBRISPEECH_DOWNLOAD_DIR",
    r"C:\Users\gowtham.veepujerla\Downloads\Text to audio\audio_files",
)

In [3]:
# Create the directory if it doesn't exist
os.makedirs(download_dir, exist_ok=True)

# Send a GET request to download the tar.gz file.
# stream=True defers the body so a later cell can write it to disk in chunks;
# the timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, stream=True, timeout=60)

# Fail here on an HTTP error (404, 5xx, ...) instead of saving the error page
# as if it were the archive and hitting a confusing tarfile error later.
response.raise_for_status()

In [4]:
# Save the downloaded tar.gz file
tar_file_path = os.path.join(download_dir, "train-clean-100.tar.gz")
with open(tar_file_path, "wb") as file:
    # 1 MiB chunks: 1 KiB chunks turn a large archive into a very long run of
    # tiny Python-level writes.
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        if chunk:
            file.write(chunk)

# The body was streamed, so release the underlying connection explicitly.
response.close()

In [5]:
# Extract the contents of the tar.gz file
with tarfile.open(tar_file_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The archive was downloaded from the network: use the "data"
        # extraction filter so members cannot escape download_dir (absolute
        # paths, "..", links pointing outside, device files).
        tar.extractall(download_dir, filter="data")
    else:
        # Older Python without extraction filters: behaves as before.
        tar.extractall(download_dir)

# Remove the downloaded tar.gz file
os.remove(tar_file_path)

print("LibriSpeech dataset downloaded and extracted successfully.")

LibriSpeech dataset downloaded and extracted successfully.


In [None]:
import os
import librosa
import numpy as np

# Function to preprocess audio files
def preprocess_audio(audio_dir, output_dir, extension=".flac"):
    """Extract, normalise and save MFCC features for every audio file in a tree.

    Walks ``audio_dir`` recursively. For each file whose name ends with
    ``extension`` (case-insensitive) it loads the audio at its native sample
    rate, computes 13 MFCCs, standardises the whole matrix with its global
    mean and standard deviation, and saves it as ``<stem>.npy`` in
    ``output_dir``. One progress line is printed per file.

    Args:
        audio_dir: Root directory searched with ``os.walk``.
        output_dir: Flat directory receiving the ``.npy`` files; created if
            missing. Files with the same name in different sub-folders write
            to the same output path, so the later one wins.
        extension: File-name suffix selecting the audio files. Defaults to
            ``".flac"``.

    Returns:
        None. Errors from loading or saving a file propagate to the caller.
    """
    os.makedirs(output_dir, exist_ok=True)
    wanted_suffix = extension.lower()

    # Iterate over all audio files in the directory
    for root, _, files in os.walk(audio_dir):
        for file in files:
            if not file.lower().endswith(wanted_suffix):
                continue
            file_path = os.path.join(root, file)
            # Load the audio file (sr=None keeps the file's own sample rate)
            audio, sr = librosa.load(file_path, sr=None)
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

            # Normalize the features. A constant matrix (e.g. from silence)
            # has zero spread; divide by 1 instead so the result is all
            # zeros rather than NaN/inf.
            spread = np.std(mfccs)
            if spread == 0:
                spread = 1.0
            normalized_mfccs = (mfccs - np.mean(mfccs)) / spread

            # Swap only the trailing extension; str.replace would also
            # rewrite any earlier ".flac" inside the file name.
            stem = file[:len(file) - len(extension)]
            output_file = os.path.join(output_dir, stem + ".npy")
            np.save(output_file, normalized_mfccs)
            print(f"Processed {file}")

# Define input and output directories.
# Both defaults are machine-specific absolute paths; set LIBRISPEECH_AUDIO_DIR
# and/or MFCC_OUTPUT_DIR to use other locations without editing the notebook.
# NOTE(review): confirm audio_dir matches the layout the archive actually
# unpacked to (it may sit under an extra top-level folder) - os.walk silently
# yields nothing for a missing directory.
audio_dir = os.environ.get(
    "LIBRISPEECH_AUDIO_DIR",
    r"C:\Users\gowtham.veepujerla\Downloads\Text to audio\audio_files\train-clean-100",
)
output_dir = os.environ.get(
    "MFCC_OUTPUT_DIR",
    r"C:\Users\gowtham.veepujerla\Downloads\Text to audio\Output",
)

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Preprocess the audio files
preprocess_audio(audio_dir, output_dir)

print("Preprocessing complete.")


Processed 103-1240-0000.flac
Processed 103-1240-0001.flac
Processed 103-1240-0002.flac
Processed 103-1240-0003.flac
Processed 103-1240-0004.flac
Processed 103-1240-0005.flac
Processed 103-1240-0006.flac
Processed 103-1240-0007.flac
Processed 103-1240-0008.flac
Processed 103-1240-0009.flac
Processed 103-1240-0010.flac
Processed 103-1240-0011.flac
Processed 103-1240-0012.flac
Processed 103-1240-0013.flac
Processed 103-1240-0014.flac
Processed 103-1240-0015.flac
Processed 103-1240-0016.flac
Processed 103-1240-0017.flac
Processed 103-1240-0018.flac
Processed 103-1240-0019.flac
Processed 103-1240-0020.flac
Processed 103-1240-0021.flac
Processed 103-1240-0022.flac
Processed 103-1240-0023.flac
Processed 103-1240-0024.flac
Processed 103-1240-0025.flac
Processed 103-1240-0026.flac
Processed 103-1240-0027.flac
Processed 103-1240-0028.flac
Processed 103-1240-0029.flac
Processed 103-1240-0030.flac
Processed 103-1240-0031.flac
Processed 103-1240-0032.flac
Processed 103-1240-0033.flac
Processed 103-