# This code works only on WAV files.

In [4]:
import numpy

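The function flags2segs() converts a per-window sequence of speaker flags into contiguous segments: consecutive windows that carry the same flag (the key value assigned to a speaker) are merged into one segment. It returns the start and end time of each segment together with the flag of the speaker for that segment.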

In [5]:
def flags2segs(Flags, window):
    '''
    Merge a per-window sequence of class flags into contiguous segments.

    Consecutive windows that carry the same flag are collapsed into a single
    segment. A segment is closed at the first window whose flag differs from
    the segment's flag, or at the last window of the sequence, whichever comes
    first; the closing time is that window's index multiplied by `window`.

    ARGUMENTS:
     - Flags:     a sequence of class flags (one per time window)
     - window:    window duration (in seconds)

    RETURNS:
     - segs:      numpy array of shape (n_segments, 2): segs[i, 0] is the
                  start and segs[i, 1] is the end of segment i (in seconds)
     - classes:   list of class flags: classes[i] is the flag of segment i

    NOTES:
     - The final boundary is (len(Flags) - 1) * window, so the last window
       never opens a segment of its own.
     - A sequence with fewer than two flags yields no segments: an empty
       (0, 2) array and an empty list are returned.
    '''
    n_flags = len(Flags)
    if n_flags == 0:
        # Nothing to segment; avoid indexing into an empty sequence.
        return (numpy.zeros((0, 2)), [])

    boundaries = []   # end time of every segment, in seconds
    classes = []      # flag of every segment, aligned with `boundaries`
    run_flag = Flags[0]
    for idx in range(1, n_flags):
        flag = Flags[idx]
        # Close the running segment on a flag change or at the last window.
        if flag != run_flag or idx == n_flags - 1:
            boundaries.append(idx * window)
            classes.append(run_flag)
            run_flag = flag

    segs = numpy.zeros((len(boundaries), 2))
    if boundaries:
        # Each segment ends at its own boundary and starts where the previous
        # one ended; the first segment starts at 0.
        segs[:, 1] = boundaries
        segs[1:, 0] = boundaries[:-1]
    return (segs, classes)

In [6]:
import json
import os

import requests
import scipy.io.wavfile as wav
import speech_recognition as sr
import pyAudioAnalysis.audioSegmentation as aS
from logmmse import logmmse_from_file

# Pipeline, per WAV file found in `directory`:
#   1. denoise the recording,
#   2. run speaker diarization and turn the result into segments,
#   3. recognise speech per segment and count product-list words per speaker,
#   4. pick the "primary" speaker (most product words; ties broken by the
#      loudest segment),
#   5. write that speaker's samples to a WAV file and send it for
#      transcription.

directory = '/Users/macbook/Desktop/Audio Recordings'

# Product vocabulary used to decide which speaker is the primary one.
# Loaded once because it does not change between files.
with open('product_list.json', 'r') as fp:
    product_list = json.load(fp)

# Settings for the transcription request sent at the end of each iteration.
# SECURITY NOTE(review): the API token is committed in source. It should be
# rotated and supplied from configuration or the environment instead.
headers = {'Authorization' : 'Token 3715119fd7753d33bedbd3c2832752ee7b0a10c7'}
data = {'user' : '310' ,'language' : 'HI'}
url = 'https://dev.liv.ai/liv_transcription_api/recordings/'

counter = 0
for filename in os.listdir(directory):

    # Only the first 10 WAV files are processed. Remove this check to iterate
    # over the whole directory.
    if counter == 10:
        break

    if not filename.endswith(".wav"):
        continue
    counter += 1

    main_filename = filename
    filename = os.path.join(directory, filename)
    print(filename)

    # Read the file for its sampling rate and its length in samples.
    sampling_rate, sig = wav.read(filename)
    rate = sampling_rate
    len_of_signal = len(sig)

    # Log-MMSE noise reduction: replaces the raw samples with a denoised
    # signal so that background noise disturbs the later steps less.
    sig = logmmse_from_file(filename)

    # Work on a 1-D signal: for multi-channel audio keep the first channel.
    sig = numpy.asarray(sig)
    if sig.ndim > 1:
        sig = sig[:, 0]

    # Short recordings may not provide enough material for diarization, so
    # the signal is repeated four times back to back.
    sig = numpy.concatenate([sig] * 4)

    # Speaker diarization; flags2segs merges the per-window result into
    # segments using a 0.2 second window.
    cls = aS.speaker_diarization(rate=sampling_rate, sig=sig, n_speakers=0)
    segs, flags = flags2segs(cls, 0.2)

    #########PROBLEM 1##########
    print('Number of people detected in the audio are: ', len(set(flags)))

    # Count, per speaker flag, how many recognised words are in the product
    # list. `result` maps speaker flag -> number of matches.
    result = {}
    r = sr.Recognizer()
    for (start, end), speaker in zip(segs, flags):
        with sr.AudioFile(filename) as source:
            audio = r.record(source, duration=(end - start), offset=start)
        try:
            text = r.recognize_google(audio)
        except Exception:
            # Deliberate best effort: a segment that cannot be recognised
            # (unintelligible audio, service error) contributes no words.
            text = ""
        for word in text.split():
            if word.lower() in product_list:
                result[speaker] = 1 + result.get(speaker, 0)

    # Speakers that reached the highest number of product-word matches.
    max_match_value = max(result.values()) if result else 0
    matched_flag = [speaker for speaker, hits in result.items()
                    if hits == max_match_value]

    if not matched_flag:
        # No product word was recognised for any speaker.
        segments = []
    else:
        if len(matched_flag) == 1:
            # A single best speaker: this is the primary speaker.
            primary_flag = matched_flag[0]
        else:
            # Several speakers tie: prefer the one owning the segment with
            # the highest mean absolute amplitude. Sums are accumulated in
            # float64 so integer sample types cannot overflow.
            amplitude = []
            for speaker in matched_flag:
                for (start, end), seg_flag in zip(segs, flags):
                    if seg_flag != speaker:
                        continue
                    chunk = sig[int(start * rate):int(end * rate)]
                    total = numpy.abs(chunk.astype(numpy.float64)).sum()
                    amplitude.append((total / (rate * (end - start)), speaker))
            primary_flag = max(amplitude)[1]

        # All segments that belong to the primary speaker.
        segments = [segs[i] for i, seg_flag in enumerate(flags)
                    if seg_flag == primary_flag]

    # Collect the primary speaker's samples. `len_of_signal` is the sample
    # count of the file as read, i.e. before the four-fold repetition, so
    # segments that end beyond that length are skipped.
    pieces = []
    for start, end in segments:
        first, last = int(start * rate), int(end * rate)
        if last <= len_of_signal:
            pieces.append(sig[first:last])
    arr = numpy.concatenate(pieces) if pieces else numpy.array([])

    # Nothing selected (no match at all): fall back to `rate` zero samples.
    if arr.size == 0:
        arr = numpy.array([0] * rate)

    ########## MAIN PROBLEM ###########
    # Write the primary speaker's audio. The path is the bare file name, so
    # the file is created in the current working directory.
    # NOTE(review): if the working directory is `directory`, this overwrites
    # the input recording - confirm this is intended.
    wav.write(main_filename, rate=rate, data=arr)

    # Send the written file to the transcription API and print the raw reply.
    # The handle is closed as soon as the request has completed.
    with open(main_filename, 'rb') as audio_fp:
        files = {'audio_file' : audio_fp}
        res = requests.post(url, headers = headers, data = data, files = files)
    print(res.text)

    print()
    print()
    print()

/Users/macbook/Desktop/Audio Recordings/new_1(1).wav




Number of people detected in the audio are:  3
{"app_session_id":"33e0d858-3d6c-46f1-b41e-6dacd8360af9","transcriptions":[{"utf_text":"सोनी","utf_text_en":"sony","confidence_per_word":[],"confidence_score":0.704957},{"utf_text":"शो मी","utf_text_en":"show me","confidence_per_word":[],"confidence_score":0.54414}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_1(2).wav




Number of people detected in the audio are:  2
{"app_session_id":"1ec78481-e380-4cc5-8e2e-23d7072a044d","transcriptions":[{"utf_text":"","utf_text_en":"","confidence_per_word":[],"confidence_score":0.874107},{"utf_text":" ","utf_text_en":"","confidence_per_word":[],"confidence_score":0.0}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_1.wav




Number of people detected in the audio are:  3
{"app_session_id":"47211f33-616c-423f-a17a-c71aa3ed951a","transcriptions":[{"utf_text":"मुझे मोबाइल लेना है","utf_text_en":"mujhe mobile lena hai","confidence_per_word":[],"confidence_score":0.746328},{"utf_text":"मुझे मोबाइल देना है","utf_text_en":"mujhe mobile daina hai","confidence_per_word":[],"confidence_score":0.727263}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_10(1).wav




Number of people detected in the audio are:  6
{"app_session_id":"b7809819-9581-41ee-aff7-0be92b5ef44e","transcriptions":[{"utf_text":"सर दे दो जो","utf_text_en":"sir de do jo","confidence_per_word":[],"confidence_score":0.536236},{"utf_text":"सर दे दो एक जो","utf_text_en":"sir de do ek jo","confidence_per_word":[],"confidence_score":0.532201}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_10(2).wav




Number of people detected in the audio are:  4
{"app_session_id":"6e269439-eb8c-4e71-9145-81138799fd8f","transcriptions":[{"utf_text":"","utf_text_en":"","confidence_per_word":[],"confidence_score":0.874107},{"utf_text":" ","utf_text_en":"","confidence_per_word":[],"confidence_score":0.0}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_10.wav




Number of people detected in the audio are:  4
{"app_session_id":"5c3b86cf-971e-44cb-8d92-41ec1c938a7e","transcriptions":[{"utf_text":"हम लेना है","utf_text_en":"hum lena hai","confidence_per_word":[],"confidence_score":0.671265},{"utf_text":"होम लेना है","utf_text_en":"home lena hai","confidence_per_word":[],"confidence_score":0.621145}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_11(1).wav




Number of people detected in the audio are:  2
{"app_session_id":"8ee3384b-37c0-4afc-b7e9-1ce94555b0df","transcriptions":[{"utf_text":"आपके पास रबर हैंड क्लोज है क्या","utf_text_en":"aapke paas rubber hand close hai kya","confidence_per_word":[],"confidence_score":0.699902},{"utf_text":"आपके पास रबर हैंड क्लोज हैं क्या","utf_text_en":"aapke paas rubber hand close hain kya","confidence_per_word":[],"confidence_score":0.694825}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_11(2).wav




Number of people detected in the audio are:  5
{"app_session_id":"381925e4-e5ee-430c-b10e-4d09f08804b9","transcriptions":[{"utf_text":"स्वीका परीक्षा","utf_text_en":"svika pariksha","confidence_per_word":[],"confidence_score":0.602762},{"utf_text":"हस्वीका परीक्षा","utf_text_en":"hasvika pariksha","confidence_per_word":[],"confidence_score":0.597349}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_11.wav




Number of people detected in the audio are:  2
{"app_session_id":"8aff210b-c4a3-45ca-af73-c4dbee18f303","transcriptions":[{"utf_text":"","utf_text_en":"","confidence_per_word":[],"confidence_score":0.763731},{"utf_text":"तो","utf_text_en":"to","confidence_per_word":[],"confidence_score":0.126525}],"recording_index":1,"is_last":true}



/Users/macbook/Desktop/Audio Recordings/new_12(1).wav




Number of people detected in the audio are:  4
{"app_session_id":"0abb027c-d1f3-48c9-bd29-026f30ce67da","transcriptions":[{"utf_text":"डी लिंक वायरलेस र","utf_text_en":"d link wireless r","confidence_per_word":[],"confidence_score":0.709305},{"utf_text":"डी लिंक वायरलेस रा","utf_text_en":"d link wireless ra","confidence_per_word":[],"confidence_score":0.667327}],"recording_index":1,"is_last":true}



