# Data Preprocessing

In [None]:
# Load the vocalization labels into a DataFrame of strings (one row per label line).
# pandas is imported here because this cell runs before the notebook's main import cell.
import pandas as pd

# Column names are kept exactly as originally written (including the leading spaces),
# because they are the labels any later column lookup has to use.
LABEL_COLUMNS = ['Sample', ' original_spk', ' gender', ' original_time', ' type_voc', ' start_voc', 'end_voc']

label_rows = []
# The with-block guarantees the file handle is closed, even if a line is malformed.
with open("vocalizationcorpus/labels.txt", "r") as file1:
    for line_number, aline in enumerate(file1, start=1):
        if not aline.strip():
            continue  # ignore blank lines (e.g. a trailing newline at the end of the file)
        values = aline.split(',')
        values[-1] = values[-1].strip("\n")
        if len(values) < len(LABEL_COLUMNS):
            raise ValueError(
                "labels.txt line %d has %d fields, expected at least %d"
                % (line_number, len(values), len(LABEL_COLUMNS))
            )
        # Only the first seven fields are kept; any further fields on the line are dropped.
        label_rows.append(values[0:len(LABEL_COLUMNS)])

# Build the frame once instead of growing it row by row (which re-allocates on every insert).
df = pd.DataFrame(label_rows, columns=LABEL_COLUMNS)

In [None]:
# Cut the labelled vocalization out of each sample and save fillers / laughter as separate clips.
# Imported here because this cell runs before the notebook's main import cell.
import os

from pydub import AudioSegment

# Vocalization type -> folder that receives the clipped audio.
EXPORT_DIRS = {'filler': "Filler Words/", 'laughter': "Laughter/"}
# Only clips strictly longer than this many milliseconds are exported.
MIN_DURATION_MS = 100

for _, label in df.iterrows():
    # .iloc makes the positional access explicit; plain integer keys on a
    # label-indexed Series rely on a deprecated positional fallback.
    sample = label.iloc[0]
    voc_type = label.iloc[4]
    # Start/end are scaled by 1000 and used as slice bounds on the AudioSegment
    # (presumably seconds -> milliseconds; confirm against the corpus documentation).
    t1 = float(label.iloc[5]) * 1000
    t2 = float(label.iloc[6]) * 1000
    duration = t2 - t1

    export_dir = EXPORT_DIRS.get(voc_type)
    if export_dir is not None and duration > MIN_DURATION_MS:
        # Create the target folder on demand so a fresh checkout does not fail on export.
        os.makedirs(export_dir, exist_ok=True)
        # The WAV is only decoded for rows that are actually exported.
        newAudio = AudioSegment.from_wav("vocalizationcorpus/data/" + sample + ".wav")
        newAudio[t1:t2].export(export_dir + sample + ".wav", format="wav")
    print(sample, voc_type, duration)

# DeepCall

Step 1: Feature Extraction - after collecting the audio data, extract the features  
Step 2: Speaker Clustering - identify who is speaker 1 and who is speaker 2  
Step 3: Training - train your model to classify the data into: Speech, Laughter, Filler Words  
Step 4: User Study

In [None]:
import pandas as pd
import numpy as np
from pydub import AudioSegment
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
from pyAudioAnalysis import audioSegmentation
from pyAudioAnalysis import audioTrainTest as aT
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Classifies audio into 2 speakers + plot functionality
def speakerDiarization(filename, plot = False):
    """Run two-speaker diarization on an audio file and tabulate the result.

    Parameters
    ----------
    filename : str
        Path of the audio file handed to ``audioSegmentation.speakerDiarization``.
    plot : bool, optional
        When truthy, the result is also drawn with ``generatePlot``.

    Returns
    -------
    pandas.DataFrame
        One row per label returned by the diarizer, with columns
        ``category`` (the label) and ``seconds`` (evenly spaced values from
        0 to ``n / 5`` for ``n`` labels).
    """
    speakers = audioSegmentation.speakerDiarization(filename, 2, plot_res=False)
    dataframe = pd.DataFrame(speakers, columns=['category'])
    # NOTE(review): the /5 presumably reflects five diarization labels per second — confirm
    # against the mid-term step used by the installed pyAudioAnalysis version.
    dataframe["seconds"] = np.linspace(0, dataframe.shape[0] / 5, dataframe.shape[0])

    if plot:
        generatePlot(dataframe)

    # Return the table so the computation is not thrown away when plot is False.
    return dataframe

# Plot functionality for speakerDiarization
def generatePlot(dataframe):
    """Draw the ``category`` column of ``dataframe`` against its ``seconds`` column.

    A wide 15x3 inch figure is opened, the 'ggplot' style is activated, and the
    line plot is shown with an x tick every 5 units and y ticks at 0 and 1.
    """
    figure(num=None, figsize=(15, 3), dpi=80, facecolor='w', edgecolor='k')
    plt.style.use('ggplot')

    seconds = dataframe["seconds"]
    tick_positions = np.arange(min(seconds), max(seconds) + 1, 5.0)

    # Place the x ticks and rotate their labels in a single call.
    plt.xticks(tick_positions, rotation=90)
    plt.yticks([0, 1])

    plt.title("Speaker Diarization")
    plt.xlabel("Seconds")
    plt.ylabel("Speakers")

    plt.plot(seconds, dataframe["category"])
    plt.show()

In [None]:
# Load the recording: readAudioFile returns the sampling rate (fs) and the samples (x).
fs, x = audioBasicIO.readAudioFile("Sinclair.wav")

In [None]:
# Display fs, the first value returned by readAudioFile (used as the sampling rate below).
fs

In [None]:
# Two-speaker diarization of the recording, tabulated as one row per returned label.
speakers = audioSegmentation.speakerDiarization("Sinclair.wav", 2, plot_res=False)
dataframe = pd.DataFrame(speakers, columns=["category"])

# Evenly spaced time axis from 0 to n/5 for the n labels.
dataframe["seconds"] = np.linspace(0, len(dataframe) / 5, num=len(dataframe))

# Preview the first rows only.
dataframe.head()

# Speaker Diarization 

This algorithm allows us to know who is speaking in a conversation.

In [None]:
# Diarize the recording and draw the result; the trailing ';' keeps the cell output to the plot.
speakerDiarization("Sinclair.wav", plot=True);

# Machine Learning Classification!

We extract mid-term features from each recording and then average them over the whole recording (long-term averaging), which yields one feature vector per audio file.

In [None]:
# Train a random-forest classifier on the three class folders under the model name "rf".
aT.featureAndTrain(
    ["Filler Words", "Laughter", "Speech"],  # one folder per class
    0.1,                                     # presumably the mid-term window — confirm
    0.1,                                     # presumably the mid-term step — confirm
    aT.shortTermWindow,
    aT.shortTermStep,
    "randomforest",
    "rf",
    perTrain=0.60,
)

In [None]:
# Train a gradient-boosting classifier on the three class folders under the model name "gb".
aT.featureAndTrain(
    ["Filler Words", "Laughter", "Speech"],  # one folder per class
    0.1,                                     # presumably the mid-term window — confirm
    0.1,                                     # presumably the mid-term step — confirm
    aT.shortTermWindow,
    aT.shortTermStep,
    "gradientboosting",
    "gb",
    perTrain=0.60,
)

In [None]:
# Train an RBF-kernel SVM on the three class folders under the model name "svm"
# (the same name is read back by the later cells of this notebook).
aT.featureAndTrain(
    ["Filler Words", "Laughter", "Speech"],  # one folder per class
    0.1,                                     # presumably the mid-term window — confirm
    0.1,                                     # presumably the mid-term step — confirm
    aT.shortTermWindow,
    aT.shortTermStep,
    "svm_rbf",
    "svm",
    perTrain=0.60,
)

# Loading the Trained Model

In [None]:
import pickle

# Inspect the hyper-parameters of the saved "svm" model file.
# SECURITY: pickle.load can execute arbitrary code while unpickling — only load
# model files that were produced locally by this notebook.
# The with-block closes the file handle as soon as the model has been read.
with open("svm", 'rb') as file:
    svm = pickle.load(file)
print(svm.get_params())

In [None]:
# Classify a single clip with the saved "svm" model (model type "svm_rbf").
# NOTE(review): Test/laugh.wav is written by the next cell; if it does not exist yet,
# run that cell first.
aT.fileClassification(
    "Test/laugh.wav",
    "svm",
    "svm_rbf",
)

In [None]:
from scipy.io.wavfile import write

# Take one extracted laughter clip, collapse it to a single channel,
# and store the result as the clip used for the classification test.
fs, x = audioBasicIO.readAudioFile("Laughter/S0005.wav")
x = audioBasicIO.stereo2mono(x)

write(filename='Test/laugh.wav', rate=fs, data=x)

In [None]:
def phoneAnalyzer(filename):
    """Print a speaker ID and a vocalization class for consecutive windows of a recording.

    The file is read and down-mixed with ``stereo2mono``, diarized into two
    speakers, and then scanned in non-overlapping windows of ``0.4 * fs``
    samples. Each window is turned into one feature vector (mean of its
    mid-term features, normalized with the model's MEAN/STD) and classified
    with the saved "svm" model. One line is printed per window; nothing is
    returned.

    Parameters
    ----------
    filename : str
        Path of the audio file to analyze.
    """
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)

    speakers = audioSegmentation.speakerDiarization(filename, 2, plot_res=False)
    dataframe = pd.DataFrame(speakers, columns=['category'])
    dataframe["seconds"] = np.linspace(0, dataframe.shape[0] / 5, dataframe.shape[0])

    # The model and its window parameters do not change between windows, so they are
    # loaded once here instead of being re-read from disk on every loop iteration.
    [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step, compute_beat] = aT.load_model("svm")
    mt_win_samples = round(mt_win * fs)
    mt_step_samples = round(mt_step * fs)
    st_win_samples = round(fs * st_win)
    st_step_samples = round(fs * st_step)

    windowSize = 0.4
    shift = round(fs * windowSize)
    startWin = 0
    endWin = shift
    timer = 0

    while endWin < len(x):
        windowSlice = x[startWin:endWin]
        [mt_term_feats, st_features, _] = audioFeatureExtraction.mtFeatureExtraction(
            windowSlice, fs, mt_win_samples, mt_step_samples, st_win_samples, st_step_samples
        )
        # Collapse the window to a single feature vector, then normalize it.
        mt_term_feats = mt_term_feats.mean(axis=1)
        curFV = ((mt_term_feats - MEAN) / STD).reshape(1, -1)
        R = classifier.predict(curFV)[0]
        P = classifier.predict_proba(curFV)[0]

        startWin += shift
        endWin += shift
        timer += windowSize

        # Speaker label of the first diarization row whose timestamp lies after the
        # end of this window. Near the end of the file there may be no such row;
        # fall back to the last label instead of raising IndexError.
        later_labels = dataframe.loc[dataframe["seconds"] > timer, "category"]
        if len(later_labels) > 0:
            speakerID = int(later_labels.iloc[0])
        else:
            speakerID = int(dataframe["category"].iloc[-1])
        print("Speaker: ", speakerID, "Classification: ", R, "Probability: ", round(max(P), 2))

# Run the window-by-window analysis on Sinclair.wav (prints one line per analyzed window).
phoneAnalyzer("Sinclair.wav")