In [None]:
#import the necessary libraries
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import cv2 as cv
import librosa
import requests
import os
import zipfile
import tarfile
from tensorflow.keras.callbacks import TensorBoard
from datetime import datetime
from glob import glob
import shutil


In [None]:
def _fetch_to_file(url, destination, chunk_size=1 << 20):
    """Stream the body of ``url`` into the file ``destination``.

    The response is written chunk by chunk so the archive is never held in
    memory as a whole. Raises ``requests.HTTPError`` on a non-2xx status so
    an error page is not saved as if it were the archive.
    """
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(destination, 'wb') as archive:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    archive.write(chunk)


def _extract_tar(archive_path, mode):
    """Extract the tar archive at ``archive_path`` into the current working directory."""
    with tarfile.open(archive_path, mode) as archive:
        # The archives come from the network: use the 'data' extraction filter
        # where this Python provides it, so members cannot escape the target dir.
        if hasattr(tarfile, 'data_filter'):
            archive.extractall(filter='data')
        else:
            archive.extractall()


def download_data():
    """Download and unpack the Common Voice and LJSpeech archives.

    Each archive is saved into the current working directory, extracted
    there, and then deleted. The function is best-effort: a failed download
    or extraction is reported with a printed message and the next dataset is
    still attempted. Nothing is returned.
    """
    datasets = {
        'cv_corpus_url': 'https://download1514.mediafire.com/kkfvb0kznvwgHV0TJ_H9h5N6TXGzs59Y6Jn4HKNxAbhBV-1TgjNKx-AKkUouCiidWlvdDML7UHWGX0Akq_hmzAqNNP6Xww3H6h5lmh3xy4_-o_V8umkXOc0HbWbZRSfoXL-Kpju01uWmTRI8UBUlUiRyRx8bW_rqWmnlMaJNf3PmMsI/26zyiwi3d4c5nil/cv-corpus-15.0-delta-2023-09-08-en.tar.gz',
        'LJ_Speech_url': 'https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2',
    }

    # Per dataset key: archive file name, tarfile mode, label used in the
    # progress messages, and the "extracting" message itself.
    archives = {
        'cv_corpus_url': ('cv-corpus-15.0-delta-2023-09-08-en.tar.gz', 'r:gz',
                          'CVcorpus', 'extracting cvcorpus'),
        'LJ_Speech_url': ('LJSpeech-1.1.tar.bz2', 'r:bz2',
                          'LJSpeech', 'Extracting LJSpeech'),
    }

    for data, url in datasets.items():
        archive_name, mode, label, extracting_message = archives[data]
        archive_path = os.path.join(os.getcwd(), archive_name)

        print(f'Downloading {data}')
        print(url)

        try:
            _fetch_to_file(url, archive_path)
            print(f'Done downloading {label}')
            # The archive file is closed (and therefore fully flushed) before
            # it is reopened for extraction.
            print(extracting_message)
            _extract_tar(archive_path, mode)
            print(f'Done extracting {label}')
        except (requests.RequestException, tarfile.TarError, EOFError, OSError) as error:
            # Keep going with the next dataset, but say what went wrong.
            print(f'Unable to complete {label} request: {error}')
        finally:
            # The archive is only a temporary artifact; it may be missing if
            # the download failed before the file was created.
            if os.path.exists(archive_path):
                os.remove(archive_path)



In [None]:
# Fetch both speech archives and unpack them into the current working directory.
download_data()

In [None]:
# Destination folders for the raw audio clips.
# exist_ok=True keeps this cell re-runnable instead of raising FileExistsError.
os.makedirs(os.path.join(os.getcwd(), 'cv-corpus'), exist_ok=True)
os.makedirs(os.path.join(os.getcwd(), 'LJSpeech'), exist_ok=True)

In [None]:
def prepare_cv_corpus():
    """Move the extracted Common Voice clips into ``./cv-corpus``.

    Every entry of ``cv-corpus-15.0-delta-2023-09-08/en/clips`` (relative to
    the current working directory) is moved into ``cv-corpus``, after which
    the extracted ``cv-corpus-15.0-delta-2023-09-08`` tree is deleted.

    Raises:
        FileNotFoundError: if the extracted clips directory does not exist.
    """
    working_dir = os.getcwd()
    extracted_root = os.path.join(working_dir, 'cv-corpus-15.0-delta-2023-09-08')
    clips_dir = os.path.join(extracted_root, 'en', 'clips')
    target_dir = os.path.join(working_dir, 'cv-corpus')

    # List first so that a missing source fails before anything is created.
    clip_names = os.listdir(clips_dir)
    # Create the destination here so the function does not rely on a separate
    # cell having been run beforehand.
    os.makedirs(target_dir, exist_ok=True)

    for clip_name in clip_names:
        shutil.move(os.path.join(clips_dir, clip_name),
                    os.path.join(target_dir, clip_name))
    shutil.rmtree(extracted_root)



In [None]:
def prepare_LJSpeech():
    """Move the extracted LJSpeech recordings into ``./LJSpeech``.

    Every entry of ``LJSpeech-1.1/wavs`` (relative to the current working
    directory) is moved into ``LJSpeech``, after which the extracted
    ``LJSpeech-1.1`` tree is deleted.

    Raises:
        FileNotFoundError: if the extracted ``wavs`` directory does not exist.
    """
    working_dir = os.getcwd()
    extracted_root = os.path.join(working_dir, 'LJSpeech-1.1')
    wavs_dir = os.path.join(extracted_root, 'wavs')
    target_dir = os.path.join(working_dir, 'LJSpeech')

    # List first so that a missing source fails before anything is created.
    wav_names = os.listdir(wavs_dir)
    # Create the destination here so the function does not rely on a separate
    # cell having been run beforehand.
    os.makedirs(target_dir, exist_ok=True)

    for wav_name in wav_names:
        shutil.move(os.path.join(wavs_dir, wav_name),
                    os.path.join(target_dir, wav_name))
    shutil.rmtree(extracted_root)



In [None]:
# Locations of the raw audio folders and of the generated spectrogram images
class Paths:
    """Absolute folder locations, resolved against the working directory
    at the time this class body is executed."""

    # Raw audio folders.
    LJSpeech: str = os.path.join(os.getcwd(), 'LJSpeech')
    CVcorpus: str = os.path.join(os.getcwd(), 'cv-corpus')

    # Roots of the training and validation image trees.
    Spectrum_images_train: str = os.path.join(os.getcwd(), 'Spectrum_Images', 'train')
    Spectrum_images_valid: str = os.path.join(os.getcwd(), 'Spectrum_Images', 'valid')

    # One sub-folder per dataset under each root.
    CVcorpus_Spectrum_train_images: str = os.path.join(Spectrum_images_train, 'CVcorpus')
    LJSpeech_Spectrum_train_images: str = os.path.join(Spectrum_images_train, 'LJSpeech')
    CVcorpus_Spectrum_valid_images: str = os.path.join(Spectrum_images_valid, 'CVcorpus')
    LJSpeech_Spectrum_valid_images: str = os.path.join(Spectrum_images_valid, 'LJSpeech')

In [None]:
# Create the train/valid spectrogram folders, one per dataset.
# exist_ok=True keeps this cell re-runnable instead of raising FileExistsError.
os.makedirs(Paths.CVcorpus_Spectrum_train_images, exist_ok=True)
os.makedirs(Paths.LJSpeech_Spectrum_train_images, exist_ok=True)
os.makedirs(Paths.CVcorpus_Spectrum_valid_images, exist_ok=True)
os.makedirs(Paths.LJSpeech_Spectrum_valid_images, exist_ok=True)

In [None]:

# Helpers that render an audio clip as a spectrogram image and crop the result

class Prep_data():
    """Stateless helpers that turn an audio clip into a cropped spectrogram image.

    Both helpers are static methods: call them on the class, e.g.
    ``Prep_data.process_audio(...)``.
    """

    @staticmethod
    def process_audio(audio_path, save_location, filename):
        """Render the spectrogram of one audio file and save it as a JPEG.

        Args:
            audio_path: path of the audio file to load with ``librosa.load``.
            save_location: directory the image is written into.
            filename: image name without extension; ``.jpg`` is appended.

        The image is written to ``save_location/filename.jpg`` and that path
        is printed. Nothing is returned.
        """
        # The display submodule is imported explicitly: a bare `import librosa`
        # does not make `librosa.display` available on every librosa release.
        import librosa.display

        audio, fs = librosa.load(audio_path)
        # Take the magnitude explicitly; amplitude_to_db expects amplitudes,
        # not the complex STFT coefficients.
        D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)

        output_path = save_location + '/' + filename + '.jpg'
        figure = plt.figure(figsize=(10, 5))
        try:
            plt.axis('off')
            librosa.display.specshow(D, sr=fs, x_axis='time', y_axis='linear', cmap='viridis')
            plt.savefig(output_path, dpi=300, bbox_inches='tight', pad_inches=0, transparent=True)
        finally:
            # Always release the figure, even when plotting or saving fails,
            # so figures do not pile up when this is called in a loop.
            plt.close(figure)
        print(output_path)

    @staticmethod
    def crop_image(imagepath, box=(100, 50, 610, 535)):
        """Crop the image at ``imagepath`` in place.

        Args:
            imagepath: path of the image; it is overwritten with the crop.
            box: ``(x, y, w, h)`` pixel window to keep. The default is the
                window this notebook has always used.
                NOTE(review): presumably tuned to the saved spectrogram size — confirm.

        Raises:
            FileNotFoundError: if OpenCV cannot read ``imagepath``.
        """
        image = cv.imread(imagepath)
        if image is None:
            # cv.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {imagepath}')
        x, y, w, h = box
        cropped_image = image[y:y + h, x:x + w]  # rows are y, columns are x
        cv.imwrite(imagepath, cropped_image)



In [None]:
# Training split, Common Voice: render and crop a spectrogram for the first 300 listed clips.
cv_train_clips = os.listdir(Paths.CVcorpus)[:300]
for audios in cv_train_clips:
    filename = audios[:-4]  # drop the last four characters (assumes a 3-letter extension)
    clip_path = f'{Paths.CVcorpus}/{audios}'
    image_path = f'{Paths.CVcorpus_Spectrum_train_images}/{filename}.jpg'
    Prep_data.process_audio(clip_path, Paths.CVcorpus_Spectrum_train_images, filename)
    Prep_data.crop_image(image_path)


In [None]:
# Validation split, Common Voice: listed clips 400-499.
for audios in os.listdir(Paths.CVcorpus)[400:500]:
    # Derive the image name from the clip being processed. Without this the
    # loop would reuse whatever `filename` an earlier cell left behind and
    # write every spectrogram to the same file.
    filename = audios[:-4]
    Prep_data.process_audio(Paths.CVcorpus+f'/{audios}',Paths.CVcorpus_Spectrum_valid_images,filename)
    Prep_data.crop_image(Paths.CVcorpus_Spectrum_valid_images+'/'+filename+'.jpg')

In [None]:
# Training split, LJSpeech: render and crop a spectrogram for the first 300 listed recordings.
lj_train_clips = os.listdir(Paths.LJSpeech)[:300]
for audios in lj_train_clips:
    filename = audios[:-4]  # drop the last four characters (assumes a 3-letter extension)
    clip_path = f'{Paths.LJSpeech}/{audios}'
    image_path = f'{Paths.LJSpeech_Spectrum_train_images}/{filename}.jpg'
    Prep_data.process_audio(clip_path, Paths.LJSpeech_Spectrum_train_images, filename)
    Prep_data.crop_image(image_path)

In [None]:
# Validation split, LJSpeech: listed recordings 400-499.
for audios in os.listdir(Paths.LJSpeech)[400:500]:
    # Derive the image name from the recording being processed. Without this
    # the loop would reuse whatever `filename` an earlier cell left behind and
    # write every spectrogram to the same file.
    filename = audios[:-4]
    Prep_data.process_audio(Paths.LJSpeech+f'/{audios}',Paths.LJSpeech_Spectrum_valid_images,filename)
    Prep_data.crop_image(Paths.LJSpeech_Spectrum_valid_images+'/'+filename+'.jpg')

In [None]:
# Training images are rescaled to [0, 1] and randomly zoomed/flipped;
# validation images are only rescaled.
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1/255,
    zoom_range=0.5,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',
)
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1/255,
)

In [None]:
# Both splits are read from their folder trees with the same image size and
# binary labels (one class per sub-folder).
flow_options = dict(target_size=(300, 300), class_mode='binary')
train_images = train_datagen.flow_from_directory(Paths.Spectrum_images_train, **flow_options)
valid_images = valid_datagen.flow_from_directory(Paths.Spectrum_images_valid, **flow_options)

In [None]:
# Binary classifier: ImageNet-initialised VGG16 convolutional base (no top)
# followed by a small dense head ending in a single sigmoid unit.
vgg16_base = tf.keras.applications.vgg16.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(300, 300, 3),
)
model = tf.keras.models.Sequential([
    vgg16_base,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Configure training: SGD optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# TensorBoard callback writing to a fresh, timestamped folder under logs/.
log_dir = f"logs/{datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)



In [None]:
# Train for 20 epochs, validating on the held-out images and logging to TensorBoard.
model.fit(
    train_images,
    validation_data=valid_images,
    epochs=20,
    callbacks=[tensorboard_callback],
)

In [None]:
# Save the trained model to 'final_model.h5' (relative to the current working directory).
model.save('final_model.h5')