In [1]:
import pyaudio
import wave
import numpy as np
import keyboard
import time

class Recorder(object):
    '''A recorder class for recording audio to a WAV file.
    Records in mono by default.
    '''
    def __init__(self,FORMAT, CHANNEL, RATE, CHUNK,recordmode="Hit_to_talk",filename="Huangrui Chu.wav",VAD_setting=None,expected_time_silence=0.5):
        #VAD_setting is a dictionary with keys "mode","forgetfactor","onset_threshold",“dynamic_range”,“adjustment”
        self.format=FORMAT
        self.channels = CHANNEL
        self.rate = RATE
        self.frames_per_buffer = CHUNK
        self.recordmode=recordmode
        self.filename=filename
        self.VAD_setting=VAD_setting
        self.expected_time_silence=expected_time_silence
        self._pa = pyaudio.PyAudio()
    def _calc_energy(self,input_data):
        #to calculate Decibel
        #first translate the data type from byte to number
        if self.format==pyaudio.paInt16:
            #16-bit resolution
            output_data=np.frombuffer(np.array(input_data),np.int16)
            #then change the data format to in64 therefore when calculate the decibel, it would outside the uplimit value of int16
            output_data=np.array(output_data,dtype='int64')
        summed_value=np.sum(output_data**2)
        if summed_value<=1:
            energy=0
        else:
            energy=10*np.log10(summed_value)
        return output_data,energy
    
    def classifyFrameI(self,current,forgetfactor,onset_threshold,adjustment):
        #One threshold呢 当我开始阴阳怪气的说话的时候会很快被认为不说话了。
        #An Adaptive Endpointing Algorithm
        isSpeech = False
        self.level = ((self.level * forgetfactor) + current) / (forgetfactor+ 1)
        if (current < self.background):
            self.background = current
        else:
            self.background += (current - self.background) * adjustment
        if (self.level < self.background): 
            self.level = self.background
        if (self.level - self.background >onset_threshold): 
            isSpeech = True
        print("background is {}".format(self.background))
        print("difference is {}".format(self.level-self.background))
        print(isSpeech)
        self.isSpeech=isSpeech
        
    def classifyFrameII(self,current,forgetfactor,onset_threshold,dynamic_range,adjustment):
        offset_threshold=onset_threshold-dynamic_range
        #A More Complex Algorithm with Two-Threshold Formula: onset_threshold,offset_threshold
        if (current < self.background):
            self.background = current
        self.level = ((self.level * forgetfactor) + current) / (forgetfactor+ 1)
        #updata speech state
        if self.isSpeech:
            if (self.level-self.background < offset_threshold):
                self.isSpeech=0
        else:
             if (self.level-self.background > onset_threshold):
                    self.isSpeech=1
        #updata background
        if self.isSpeech:
            if self.level-self.background > onset_threshold:
                 self.background += (self.level-self.background)* adjustment
        print("background is {}".format(self.background))
        print("difference is {}".format(self.level-self.background))
    def _classify_chunk(self,current_energy):
        if self.VAD_setting["mode"]=="I":
            self.classifyFrameI(current_energy,self.VAD_setting["forgetfactor"],self.VAD_setting["onset_threshold"],self.VAD_setting["adjustment"])
        
        elif self.VAD_setting["mode"]=="II":
            self.classifyFrameII(current_energy,self.VAD_setting["forgetfactor"],self.VAD_setting["onset_threshold"],
                            self.VAD_setting["dynamic_range"],self.VAD_setting["adjustment"])
        
    def _get_callback(self):
        array_frames=np.array(self.frames)
        if self.format==pyaudio.paInt16:
            #16-bit resolution
            frames=np.frombuffer(array_frames, np.int16)
        return frames
    
    
    def start(self):
        self.start_recording()
        if self.recordmode=="Hit_to_talk":
            self.hit_to_talk()
        self.stop_recording()
        #store the record
        wf = wave.open(self.filename, 'wb')
        wf.setnchannels(self.channels)
        wf.setsampwidth(self._pa.get_sample_size(self.format))
        wf.setframerate(self.rate)
        wf.writeframes(b''.join(self.frames))
        wf.close()
        print("Audio data saved as {}".format(self.filename))
        
        
    def hit_to_talk(self):
        print("Hit to talk!")
        self.isSpeech=False
        self.record=False
        self.level=0
        self.background=0
        self.time_silence=0#track the salient time
        self.chunkenergy=[]
        self.frames=[] # to store the speeking data
        while True:
            #read the data from sound card
            data = self._stream.read(self.frames_per_buffer)
            decoded,energy=self._calc_energy(data)
            if not self.record:
                self.chunkenergy.append(energy)
                #print(self.chunkenergy)
                if keyboard.is_pressed("a"):
                    self.record=True
                    if len(self.chunkenergy)<=10:
                        self.level=self.chunkenergy[0]
                        self.background=np.average(np.array(self.chunkenergy))-5
                    else:
                        self.level=self.chunkenergy[-10]
                        self.background=np.average(np.array(self.chunkenergy[-10:]))-5
                    print("beginning level is {}".format(self.level))
                    print("beginning background is {}".format(self.background))
                    print("please talking!")
            else:
                self._classify_chunk(energy)# check whether this chunk is speeking
                num_silence_frame=0
                if not self.isSpeech:
                    #if it is silence record this silence time
                    self.time_silence+=self.frames_per_buffer/self.rate
                    num_silence_frame+=1
                else:
                    # set the silence time back to zero
                    self.time_silence=0
                    num_silence_frame=0
                #save the data
                self.frames.append(data)
                #when the time of silence exceed the tolerance, then stop recording
                if self.time_silence>=self.expected_time_silence:
                    print(num_silence_frame)
                    self.frames=self.frames[2:-num_silence_frame-1]
                    #self.frames=self.frames[2:]
                    break
        print("Too long time no one speaking, stop recording!")
        
        
    def start_recording(self):
        # Use a stream with no callback function in blocking mode
        self._stream = self._pa.open(format=pyaudio.paInt16,
                                        channels=self.channels,
                                        rate=self.rate,
                                        input=True,
                                        frames_per_buffer=self.frames_per_buffer)
        print("* recording with format {}, {} channel(s),a sampling rate of {} samples per second and {} frames per buffer"
          .format(self.format,self.channels  ,self.rate ,self.frames_per_buffer))
        print("click \"a\" to start recording!")    
        
    def stop_recording(self):
        self._stream.stop_stream()
    def terminate(self):
        self._stream.close()
        self._pa.terminate()
        
        


In [31]:
setting={}
setting["mode"]="II"
setting["forgetfactor"]=1
setting["onset_threshold"]=5
setting["dynamic_range"]=3
setting["adjustment"]=0.05

In [9]:
37274921

37274921

In [107]:
rec=Recorder(FORMAT=pyaudio.paInt16, CHANNEL=1, RATE=16000,CHUNK=1600,filename="../project6data/8642097531_0.wav",VAD_setting=setting,expected_time_silence=0.5)
rec.start()

* recording with format 8, 1 channel(s),a sampling rate of 16000 samples per second and 1600 frames per buffer
click "a" to start recording!
Hit to talk!
beginning level is 64.75291632150751
beginning background is 58.47018999254514
please talking!
background is 59.11222310717803
difference is 12.198629178025023
background is 59.63880214158226
difference is 10.005001653680353
background is 60.01677240491476
difference is 7.181435003317574
background is 60.285514639268015
difference is 5.106102452711767
background is 60.285514639268015
difference is 4.134628841157323
background is 60.285514639268015
difference is 3.8227024911766705
background is 60.285514639268015
difference is 3.1288839415905585
background is 60.285514639268015
difference is 2.712035766270432
background is 60.285514639268015
difference is 2.7353577239208846
background is 60.285514639268015
difference is 2.880043543655596
background is 60.285514639268015
difference is 2.6296776982161703
background is 61.48995589196706
d

background is 63.603146486697504
difference is 0.7257387972444533
1
Too long time no one speaking, stop recording!
Audio data saved as ../project6data/8642097531_0.wav


In [7]:
rec.terminate()