# Data Preprocessing

In [None]:
from pydub import AudioSegment
import pandas as pd  # imported here because this cell runs before the notebook's main import cell

# Column names are kept exactly as they were (including the leading spaces),
# because later cells may address the frame by these labels.
label_columns = ['Sample', ' original_spk', ' gender', ' original_time', ' type_voc', ' start_voc', 'end_voc']

# Collect the rows first and build the frame once; appending with df.loc[len(df)]
# re-allocates the frame on every line.
label_rows = []
with open("vocalizationcorpus/labels.txt", "r") as file1:  # closed automatically, even on error
    for line_number, aline in enumerate(file1, start=1):
        if not aline.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
        values = aline.split(',')
        values[-1] = values[-1].strip("\n")
        if len(values) < len(label_columns):
            raise ValueError(
                "labels.txt line %d has %d fields, expected at least %d"
                % (line_number, len(values), len(label_columns))
            )
        # Only the first seven fields are kept; anything after them on the line is ignored.
        label_rows.append(values[0:7])

df = pd.DataFrame(label_rows, columns=label_columns)

In [None]:
import os  # imported here because this cell runs before the later `import os` cell

# Vocalization type -> folder that receives the clipped segment.
export_dirs = {'filler': "Filler Words/", 'laughter': "Laughter/"}
for out_dir in export_dirs.values():
    os.makedirs(out_dir, exist_ok=True)  # export() fails if the target folder is missing

# Plain tuples are indexed by position, which avoids the deprecated
# integer-position lookup on a string-labelled Series (row[1][5]).
for row in df.itertuples(index=False, name=None):
    sample, voc_type, start_voc, end_voc = row[0], row[4], row[5], row[6]
    # The label values are multiplied by 1000 to get the millisecond offsets used for slicing.
    t1 = float(start_voc) * 1000
    t2 = float(end_voc) * 1000
    duration = t2 - t1
    out_dir = export_dirs.get(voc_type)
    # Only segments longer than 100 ms of a type we train on are exported;
    # the wav is decoded only when something will actually be written.
    if out_dir is not None and duration > 100:
        newAudio = AudioSegment.from_wav("vocalizationcorpus/data/" + sample + ".wav")
        newAudio[t1:t2].export(out_dir + sample + ".wav", format="wav")
    print(sample, voc_type, duration)

In [None]:
def partitionAudio(filename, name, out_dir='Laughter', label='laugh', chunk_ms=1000):
    """Split a wav file into consecutive fixed-length chunks and export each one.

    Parameters
    ----------
    filename : str
        Path of the wav file to read.
    name : str
        Prefix used for the exported chunk file names.
    out_dir : str, optional
        Folder that receives the chunks (default 'Laughter', as before).
    label : str, optional
        Tag placed between the prefix and the chunk end time (default 'laugh').
    chunk_ms : int, optional
        Chunk length in milliseconds (default 1000).

    Each chunk is written to ``<out_dir>/<name>-<label>-<end_ms>.wav``. Only
    full-length chunks are exported; a shorter remainder at the end is dropped.

    Raises
    ------
    ValueError
        If ``chunk_ms`` is not positive.
    """
    if chunk_ms <= 0:
        raise ValueError("chunk_ms must be positive")
    newAudio = AudioSegment.from_wav(filename)
    total_ms = len(newAudio)
    # The original loop used `t2 < len(newAudio)`, which skipped the last chunk
    # whenever the clip length was an exact multiple of the chunk size.
    for t1 in range(0, total_ms - chunk_ms + 1, chunk_ms):
        t2 = t1 + chunk_ms
        print(t2/1000)
        newAudio[t1:t2].export(out_dir + '/' + name + '-' + label + '-' + str(t2) + '.wav', format="wav")
#partitionAudio("Silence.wav")

import os

laughter_dir = "Laughter"

# partitionAudio writes its chunks back into this same folder, so re-running the
# cell used to partition the chunks again. Files that already carry the
# '-laugh-' tag are skipped, as is anything that is not a wav file.
for filename in sorted(os.listdir(laughter_dir)):
    if not filename.lower().endswith(".wav") or "-laugh-" in filename:
        continue
    print(filename)
    # The full file name (extension included) is still passed as the chunk
    # prefix, so output names match what the notebook produced before.
    partitionAudio(laughter_dir + "/" + filename, filename)

# DeepCall

Step 1: Feature Extraction - after collecting the audio data, extract the features  
Step 2: Speaker Clustering - identify who is speaker 1 and who is speaker 2  
Step 3: Training - train your model to classify the data into: Speech, Laughter, Filler Words  
Step 4: User Study

In [None]:
import pandas as pd
import numpy as np
from pydub import AudioSegment
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
from pyAudioAnalysis import audioSegmentation
from pyAudioAnalysis import audioTrainTest as aT
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Classifies audio into 2 speakers + plot functionality
def speakerDiarization(filename, plot = False):
    """Run two-speaker diarization on ``filename`` and return it as a DataFrame.

    The returned frame has a 'category' column holding the values produced by
    ``audioSegmentation.speakerDiarization`` and a 'seconds' column of evenly
    spaced values from 0 to (number of rows / 5).

    When ``plot`` equals True the frame is also passed to ``generatePlot``.
    """
    labels = audioSegmentation.speakerDiarization(filename, 2, plot_res=False)
    result = pd.DataFrame(labels, columns=['category'])
    n_rows = result.shape[0]
    # NOTE(review): linspace includes the end point, so the spacing is
    # (n_rows / 5) / (n_rows - 1) rather than exactly 0.2 - confirm this is intended.
    result["seconds"] = np.linspace(0, n_rows/5, n_rows)

    if plot == True:
        generatePlot(result)
    return result

# Plot functionality for speakerDiarization
def generatePlot(dataframe):
    """Draw the 'category' column against the 'seconds' column as a line plot.

    Expects ``dataframe`` to provide 'seconds' and 'category' columns, as the
    frame built by ``speakerDiarization`` does.
    """
    figure(num=None, figsize=(15, 3), dpi=80, facecolor='w', edgecolor='k')
    plt.style.use('ggplot')

    seconds = dataframe["seconds"]
    # One tick every 5 units along x, with the labels turned vertical.
    tick_positions = np.arange(min(seconds), max(seconds)+1, 5.0)
    plt.xticks(tick_positions, rotation=90)
    plt.yticks([0, 1])  # the two speaker ids

    plt.title("Speaker Diarization")
    plt.xlabel("Seconds")
    plt.ylabel("Speakers")

    plt.plot(seconds, dataframe["category"])
    plt.show()

In [None]:
# Read Sinclair.wav; the two returned values are kept as `fs` and `x`.
fs, x = audioBasicIO.readAudioFile("Sinclair.wav")

In [None]:
# Display the first value returned by readAudioFile (presumably the sampling rate - confirm).
fs

In [None]:
# Step-by-step version of what the speakerDiarization helper does for Sinclair.wav.
# `speakers` is kept as a notebook-level name because a later cell reads it.
speakers = audioSegmentation.speakerDiarization("Sinclair.wav", 2, plot_res=False)
dataframe = pd.DataFrame(speakers, columns=['category'])
# Evenly spaced values from 0 to (rows / 5), one per diarization label.
dataframe["seconds"] = np.linspace(0, len(dataframe)/5, num=len(dataframe))
dataframe.head()

# Speaker Diarization 

This algorithm allows us to know who is speaking in a conversation.

In [None]:
# Diarize Sinclair.wav with plotting enabled; as the cell's last expression,
# the returned DataFrame is displayed as well.
speakerDiarization("Sinclair.wav", plot = True)

# Machine Learning Classification!

We extract mid-term features and then average them over the long term, which yields one feature vector per audio file.

In [None]:
# Train a random-forest model on the three class folders; the model is saved under the name "rf".
# NOTE(review): the two 0.1 values are presumably the mid-term window and step - confirm against pyAudioAnalysis.
aT.featureAndTrain(
    ["Filler Words", "Laughter", "Speech"],
    0.1,
    0.1,
    aT.shortTermWindow,
    aT.shortTermStep,
    "randomforest",
    "rf",
    perTrain=0.60,
)

In [None]:
# Train a gradient-boosting model on the same three class folders; the model is saved under the name "gb".
# NOTE(review): the two 0.1 values are presumably the mid-term window and step - confirm against pyAudioAnalysis.
aT.featureAndTrain(
    ["Filler Words", "Laughter", "Speech"],
    0.1,
    0.1,
    aT.shortTermWindow,
    aT.shortTermStep,
    "gradientboosting",
    "gb",
    perTrain=0.60,
)

In [None]:
# Train an RBF SVM on four class folders (adds "Background"); the model is saved under the name "svm".
# This call uses 0.2 where the two calls above use 0.1.
# NOTE(review): the two 0.2 values are presumably the mid-term window and step - confirm against pyAudioAnalysis.
aT.featureAndTrain(
    ["Filler Words", "Laughter", "Speech", "Background"],
    0.2,
    0.2,
    aT.shortTermWindow,
    aT.shortTermStep,
    "svm_rbf",
    "svm",
    perTrain=0.60,
)

# Loading the Trained Model

In [None]:
import pickle

# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load model files that were produced by this notebook or another trusted source.
with open("svm", 'rb') as model_file:  # the handle is closed as soon as the model is read
    svm = pickle.load(model_file)
print(svm.get_params())

# Evaluating Speaker Classification

In [None]:
# Diarize Sinclair.wav (with the plot) and keep the result as `dataframe` for the evaluation cells below.
dataframe = speakerDiarization("Sinclair.wav", plot = True) 

In [None]:
# Start every row with ground-truth speaker 0; the annotation cell below switches selected time ranges to 1.
dataframe["ground_Truth"] = 0

In [None]:
# Try to annotate who is speaking by the second.
# Each entry is (start_second, end_second, speaker).
ground_truth_segments = [
    (0, 7, 1),       # Speaker 1 is Joe Rogan
    (7, 37, 0),      # Speaker 0 by default
    (37, 38, 1),
    (38, 67, 0),
    (67, 83, 1),
    (83, 88, 0),
    (88, 106, 1),
    (106, 113, 0),
    (113, 117, 1),
    (117, 118, 0),
    (118, 135, 1),
    (135, 185, 0),
    (185, 194, 1),
    (194, 209, 0),
]


def _segment_labels(frame, start, end, speaker, include_end=False):
    """Return a 'ground_Truth' Series set to ``speaker`` for rows with start <= seconds < end.

    With ``include_end=True`` the upper bound is inclusive (used for the last segment).
    """
    seconds = frame["seconds"]
    upper = (seconds <= end) if include_end else (seconds < end)
    in_range = ((seconds >= start) & upper).to_numpy()
    return pd.Series(speaker, index=frame.index[in_range], name="ground_Truth", dtype="int64")


# Segments are half-open so a row that falls exactly on a boundary belongs to
# one segment only. With `between` (inclusive on both ends) such a row landed in
# two segments, and assigning the concatenated labels back to the frame then
# fails on the duplicated index. The labels are also built from the segment
# table rather than from the current column values, so the cell can be re-run.
_last_segment = len(ground_truth_segments) - 1
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13, df14) = [
    _segment_labels(dataframe, start, end, speaker, include_end=(position == _last_segment))
    for position, (start, end, speaker) in enumerate(ground_truth_segments)
]

In [None]:
# Combine everything!
# Stack the per-segment label Series, in order, into one Series.
labels = pd.concat([
    df1, df2, df3, df4, df5, df6, df7,
    df8, df9, df10, df11, df12, df13, df14,
])

In [None]:
# Add your annotations as a column
# (pandas aligns `labels` to `dataframe` by index here, so any row not covered
# by an annotated range would end up as NaN)
dataframe["ground_Truth"] = labels
dataframe

In [None]:
# Sklearn provides a quick way to check accuracy between ground truth and category.

from sklearn.metrics import accuracy_score

# Fraction of rows where the diarization label matches the hand annotation.
accuracy_score(
    y_true=list(dataframe["ground_Truth"]),
    y_pred=list(dataframe["category"].astype(int)),
)

In [None]:
def phoneAnalyzer(filename, df, model_name="svm", windowSize=1):
    """Classify an audio file window by window and print one line per window.

    Parameters
    ----------
    filename : str
        Path of the audio file to analyze.
    df : pandas.DataFrame
        Diarization frame with a 'seconds' column; its first column is read as
        the speaker id (the frame built by ``speakerDiarization`` has this shape).
    model_name : str, optional
        Name of the trained model passed to ``aT.load_model`` (default "svm").
    windowSize : int or float, optional
        Window length; ``fs * windowSize`` samples are classified at a time
        (default 1; assumes ``fs`` is in samples per second - TODO confirm).

    Prints the elapsed time, speaker id, predicted class and highest class
    probability for each window. The speaker is printed as None once the
    timeline has run past the last row of ``df``. Returns nothing.

    Raises
    ------
    ValueError
        If ``fs * windowSize`` rounds to less than one sample.
    """
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)

    # Load the model once. It was previously reloaded on every loop iteration,
    # and the window/step values it returns replaced the locally assigned ones anyway.
    [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step, compute_beat] = aT.load_model(model_name)

    shift = round(fs * windowSize)
    if shift <= 0:
        raise ValueError("windowSize must cover at least one sample")
    startWin = 0
    endWin = shift
    timer = 0

    # `<=` so the final full window is analyzed when the signal length is an
    # exact multiple of the window length.
    while endWin <= len(x):
        windowSlice = x[startWin:endWin]
        [mt_term_feats, st_features, _] = audioFeatureExtraction.mtFeatureExtraction(windowSlice, fs, round(mt_win * fs), round(mt_step * fs), round(fs * st_win), round(fs * st_step))
        # Average the mid-term features over the window, then normalize with the
        # MEAN and STD values that came with the model.
        curFV = ((mt_term_feats.mean(axis=1) - MEAN) / STD).reshape(1, -1)
        R = classifier.predict(curFV)[0]
        P = classifier.predict_proba(curFV)[0]
        startWin += shift
        endWin += shift
        timer += windowSize
        # Speaker id of the first diarization row after the end of this window.
        later_rows = df[df["seconds"] > timer]
        speakerID = int(later_rows.iloc[0, 0]) if len(later_rows) else None
        print("Time (Seconds): ", timer, "Speaker: ", speakerID, "Classification: ", R, "Probability: ", round(max(P), 2))

# Diarize the same recording that is being analyzed. The `dataframe` variable
# holds the diarization of Sinclair.wav, so its speaker ids do not belong to
# toastmaster.wav's timeline.
phoneAnalyzer("toastmaster.wav", speakerDiarization("toastmaster.wav"))

# Recording Sound

In [None]:
import pyaudio
import wave
import keyboard

FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 5  # NOTE: not used below - recording runs until 'q' is pressed
WAVE_OUTPUT_FILENAME = "file.wav"

audio = pyaudio.PyAudio()
frames = []

try:
    # start Recording
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
    try:
        print("Recording...")
        while True:  # keep reading chunks until 'q' is pressed
            try:
                data = stream.read(CHUNK)
                frames.append(data)
                if keyboard.is_pressed('q'):  # if key 'q' is pressed
                    print('You Pressed A Key!')
                    break  # finishing the loop
            except KeyboardInterrupt:
                # Interrupting the kernel also ends the recording; what was captured is still saved.
                break
            except Exception as err:
                # Stop as before, but say why instead of hiding the error
                # (e.g. a device overflow or the keyboard hook being unavailable).
                print("Recording stopped early:", err)
                break
        print("Finished Recording")
    finally:
        # Release the input stream even if something above failed.
        stream.stop_stream()
        stream.close()
    # Read the sample width while the PyAudio instance is still alive.
    sample_width = audio.get_sample_size(FORMAT)
finally:
    audio.terminate()

# The context manager closes the file even if writing fails.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as waveFile:
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(sample_width)
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))