In [1]:
import torchaudio
import numpy as np
from pydub import AudioSegment
from IPython.display import Audio, display
import soundfile as sf
import librosa

# Path of the clip that every cell below augments or transcribes.
sound_path = "./sample_audio.wav"

# The same file is loaded once per library because each downstream step expects
# that library's own representation of the audio.
original_audio = AudioSegment.from_wav(sound_path)         # pydub segment: volume / fades
waveform, sample_rate = torchaudio.load(sound_path)        # torch tensor: speech recognition
waveform_sf, sample_rate_sf = sf.read(sound_path)          # array as read by soundfile

# sr=None makes librosa keep the file's native sampling rate. Without it librosa
# resamples to its 22050 Hz default, and because the old code re-assigned
# `sample_rate` on every load, the shared `sample_rate` ended up describing only
# the librosa array and no longer matched `waveform`.
waveform_librosa, sample_rate_librosa = librosa.load(sound_path, sr=None)




Increase or Decrease Volume

In [2]:
def manipulateVolume(vol_in_DB, audio=None):
    """Return a copy of an audio segment with its gain changed.

    Args:
        vol_in_DB: Gain to add, in decibels. Negative values make it quieter.
        audio: Segment to adjust. When omitted, the notebook-level
            ``original_audio`` is used, which keeps existing one-argument
            calls working while matching the explicit-argument style of
            ``fadeInAudio`` / ``fadeOutAudio``.

    Returns:
        The result of ``audio + vol_in_DB`` (for a pydub ``AudioSegment`` this
        is a new segment with the gain applied).
    """
    if audio is None:
        # Looked up lazily so the global is only required for the legacy call form.
        audio = original_audio
    return audio + vol_in_DB

Manipulate Pitch

In [3]:
import librosa
def manipulatePitch(waveform,sample_rate,num_of_seminodes):
    """Shift the pitch of ``waveform`` by ``num_of_seminodes`` steps.

    Thin wrapper around ``librosa.effects.pitch_shift``: ``sample_rate`` is
    forwarded as ``sr`` and ``num_of_seminodes`` as ``n_steps``.
    """
    shifted_waveform = librosa.effects.pitch_shift(
        waveform,
        sr=sample_rate,
        n_steps=num_of_seminodes,
    )
    return shifted_waveform

Fade in or Fade out

In [4]:
def fadeInAudio(original_audio,time_in_milsec):
    """Return ``original_audio`` with a fade-in applied.

    Delegates to the segment's own ``fade_in`` method, passing
    ``time_in_milsec`` through as the fade duration.
    """
    faded_segment = original_audio.fade_in(time_in_milsec)
    return faded_segment

def fadeOutAudio(original_audio,time_in_milsec):
    """Return ``original_audio`` with a fade-out applied.

    Delegates to the segment's own ``fade_out`` method, passing
    ``time_in_milsec`` through as the fade duration.
    """
    faded_segment = original_audio.fade_out(time_in_milsec)
    return faded_segment

Predict Audio

In [5]:
import torch
# Pretrained wav2vec2 ASR pipeline, its output alphabet, and the sampling rate
# the model expects its input to have.
Bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
labels = Bundle.get_labels()
model_sample_rate = Bundle.sample_rate

# Load the clip here so the waveform is paired with its own sampling rate.
# The notebook-level `sample_rate` is re-assigned by several loaders in the
# first cell and is not guaranteed to describe the torchaudio waveform.
asr_waveform, asr_sample_rate = torchaudio.load(sound_path)

if asr_sample_rate != model_sample_rate:
    # resample() returns a new tensor; the previous code dropped the result, so
    # the model was always fed the un-resampled audio.
    asr_waveform = torchaudio.functional.resample(
        asr_waveform, asr_sample_rate, model_sample_rate
    )

model = Bundle.get_model()
with torch.inference_mode():
    # `emission` is consumed later by predictText(), which decodes emission[0].
    # NOTE(review): a multi-channel file is presumably treated as a batch of
    # channels here, so only the first channel gets transcribed — confirm.
    emission, _ = model(asr_waveform)



Decoder

In [6]:
class GreedyDecoderCTC(torch.nn.Module):
    """Greedy CTC decoder: best label per frame, collapse repeats, drop blanks.

    Args:
        labels: Sequence mapping a class index to its output character.
        blank: Index of the CTC blank symbol. Defaults to 0, the value that
            was previously hard-coded.
    """

    def __init__(self, labels, blank=0):
        super().__init__()
        self.blank = blank
        self.labels = labels

    def forward(self, emission: torch.Tensor) -> str:
        """Decode one utterance.

        Args:
            emission: Per-frame scores over the labels; the arg-max is taken
                over the last dimension. Assumes a 2-D
                ``(num_frames, num_labels)`` tensor, as passed by the caller
                via ``emission[0]`` — TODO confirm for other callers.

        Returns:
            The decoded characters joined into a single string.
        """
        best_path = torch.argmax(emission, dim=-1)
        collapsed = torch.unique_consecutive(best_path, dim=-1)
        # tolist() yields plain ints, so the blank test and the label lookup do
        # not go through one 0-d tensor per frame.
        return "".join(
            self.labels[index] for index in collapsed.tolist() if index != self.blank
        )
    

In [7]:
def predictText():
    """Decode the notebook-level ``emission`` into a readable transcript.

    Builds a ``GreedyDecoderCTC`` over the global ``labels``, decodes the
    first entry of ``emission``, and replaces every ``|`` in the decoded
    text with a space.
    """
    decoder = GreedyDecoderCTC(labels=labels)
    raw_transcript = str(decoder(emission[0]))
    return raw_transcript.replace("|"," ")

Main Menu

In [8]:
def audioAugmentation():
    while(True):
        print("Audio Augmentation Menu")
        print("1. Increase or decrease volume")
        print("2. Increase or decrease pitch")
        print("3. Insert fade in effect")
        print("4. Insert fade out effect")
        print("5. Back to main menu")
        print(">> ")
        userInput = input()

        match userInput:
            case "1":
                while(True):
                    vol_in_db = int(input("Input volume DB: "))
                    if(vol_in_db > -10000 and vol_in_db<100):
                        break
                augmented_audio = manipulateVolume(vol_in_db)
                
                while(True):
                    filename = input("Input filename(.wav): ")
                    if(filename[-4:]==".wav"):
                        break
                
                augmented_audio.export(filename,format="wav")
            case "2":
                while(True):
                    pitch_rate = int(input("Input pitch rate: "))
                    if(pitch_rate > -10 and pitch_rate<10):
                        break
                augmented_audio = manipulatePitch(waveform_librosa, sample_rate,pitch_rate)
                
                while(True):
                    filename = input("Input filename(.wav): ")
                    if(filename[-4:]==".wav"):
                        break
                sf.write(filename,augmented_audio,sample_rate,format="wav")
            case "3":
                while(True):
                    fade_in_duration = int(input("Input fade in duration(in milsec): "))
                    if(fade_in_duration > 0):
                        break
                augmented_audio = fadeInAudio(original_audio,fade_in_duration)
                
                while(True):
                    filename = input("Input filename(.wav): ")
                    if(filename[-4:]==".wav"):
                        break
                augmented_audio.export(filename,format="wav") 
                        
            case "4":
                while(True):
                    fade_out_duration = int(input("Input fade out duration(in milsec): "))
                    if(fade_out_duration>0):
                        break
                augmented_audio = fadeOutAudio(original_audio,fade_out_duration)
                
                while(True):
                    filename = input("Input filename(.wav): ")
                    if(filename[-4:]==".wav"):
                        break
                augmented_audio.export(filename,format="wav")
               
            case "5":
                break
            case '_':
                print("Invalid Input")

In [9]:
while(True):
    print("Transcript Assistance")
    print("1. Audio Augmentation")
    print("2. Predict Text")
    print("3. Exit")
    print(">> ")
    userInput = input()

    match userInput:
        case '1':
            audioAugmentation()
        case '2':
            result = predictText()
            print("Transciption result: ",result)
        case '3':
            break
        case '_':
            print("Invalid Input")

Transcript Assistance
1. Audio Augmentation
2. Predict Text
3. Exit
>> 
