# Raw Audio classification with WaveNet
Requires PyTorch 0.4.0.

In [1]:
import librosa
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
import network_modules as wn
%load_ext autoreload
%autoreload 1

# Root directory of the project data: the records CSV is read from here and
# the audio clips are looked up under WORKDIR + 'WordAudio/<Word>/<Filename>'.
# The trailing slash matters because paths are built by string concatenation.
WORKDIR = "/data/WorkData/ideology_from_audio/RESULTS/"

## 1. Auxiliary functions

In [12]:
def vstack_with_padding(a,b):
    """
    Stack two arrays vertically, right-padding the narrower one with zeros.

    Each argument may be 1-D (treated as a single row) or 2-D. The result
    has as many columns as the wider input; rows coming from the narrower
    input are filled with zeros on the right.
    """
    # Promote 1-D inputs to single-row matrices.
    rows = [arr.reshape(1, -1) if len(arr.shape) == 1 else arr
            for arr in (a, b)]
    width = max(block.shape[1] for block in rows)

    padded = []
    for block in rows:
        missing = width - block.shape[1]
        if missing > 0:
            # Only the narrower block is touched; equal widths pass through.
            block = np.hstack((block, np.zeros((block.shape[0], missing))))
        padded.append(block)
    return np.vstack(padded)

def get_minibatch(data_df, batch_size = 32, seed = 0):
    """ 
    Loads one minibatch of waveforms and labels from data_df.

    Rows seed .. seed + batch_size - 1 of data_df are used (clipped at the
    last row). For each row the audio file at
    WORKDIR + 'WordAudio/<Word>/<Filename>' is loaded with librosa.load.

    Parameters:
        data_df:    DataFrame with Word, Filename and Ideology columns.
        batch_size: maximum number of rows in the batch.
        seed:       positional index of the first row of the batch.

    Returns:
        X_batch: 2-D array with one waveform per row, right-padded with
                 zeros to the longest waveform in the batch. It is 2-D even
                 when the batch holds a single row, so X_batch.shape[0] is
                 always the number of examples. None if no row was read
                 (batch_size <= 0).
        y_batch: 1-D array with the Ideology value of each row.

    Raises:
        AssertionError: if seed is not a valid row position.
    """
    N = len(data_df.index)
    assert 0 <= seed < N, "seed out of bounds"
    X_batch = None
    y_batch = []
    for i in range(seed, min(seed + batch_size, N)):
        row = data_df.iloc[i]
        used_word = row.Word
        filename = row.Filename
        path = WORKDIR + 'WordAudio/' + used_word + "/" + filename
        waveform, _ = librosa.load(path)
        if X_batch is None:
            # Start from a single-row matrix: a bare 1-D waveform would make
            # a one-example batch look like len(waveform) examples.
            X_batch = waveform.reshape(1, -1)
        else:
            X_batch = vstack_with_padding(X_batch, waveform)
        y_batch.append(row.Ideology)
    return X_batch, np.array(y_batch)

def get_accuracy(data_df, classifier, threshold = 0.5, 
                 verbose = True, gpu = False):
    """
    Computes accuracy of classifier over examples in data_df.

    Each row's audio file (WORKDIR + 'WordAudio/<Word>/<Filename>') is loaded
    with librosa.load and fed to the classifier one example at a time as a
    (1, 1, n_samples) tensor. The prediction is 1 when the first entry of
    the classifier's first output is >= threshold, else 0, and is compared
    against the row's Ideology value.

    Parameters:
        data_df:    DataFrame with Word, Filename and Ideology columns.
        classifier: model called as classifier(inp); its first output is
                    used as the score.
        threshold:  decision threshold applied to the score.
        verbose:    if True, print progress every 10 examples.
        gpu:        if True and CUDA is available, run on the GPU.

    Returns:
        Fraction of correctly classified rows, as a float.

    Raises:
        ZeroDivisionError: if data_df has no rows.
    """
    use_cuda = gpu and torch.cuda.is_available()
    if use_cuda:
        classifier = classifier.cuda()
    
    # NOTE(review): torch.nn.Module exposes eval(), not test(); this relies
    # on the classifier class providing its own test() method -- confirm.
    classifier.test()
    
    correct = 0
    N = len(data_df.index)
    # Pure inference: disabling autograd avoids building a graph for every
    # example, which otherwise wastes memory on long waveforms.
    with torch.no_grad():
        for i in range(N):
            if verbose and i % 10 == 0:
                print(" {}/{}".format(i,N))
            row = data_df.iloc[i]
            used_word = row.Word
            filename = row.Filename
            path = WORKDIR + 'WordAudio/' + used_word + "/" + filename
            waveform, _ = librosa.load(path)
            inp = torch.Tensor(waveform).view(1,1,-1)
            if use_cuda:
                inp = inp.cuda()
            scores = classifier(inp)[0]
            if scores[0] >= threshold:
                y_pred = 1
            else:
                y_pred = 0
            y_gt = row.Ideology
            if y_pred == y_gt:
                correct += 1
    # Drop the local reference before asking CUDA to release cached blocks.
    classifier = None
    torch.cuda.empty_cache()
            
    return float(correct) / N

## 2. Loading the data and defining the test set

In [4]:
# Read the per-utterance records, then discard the index column that was
# saved into the CSV as 'Unnamed: 0'.
records = pd.read_csv(WORKDIR + "final_raw_wave_ideology.csv")
records = records.drop('Unnamed: 0', axis = 1)
records.head()

Unnamed: 0,Speaker,Word,Year,Filename,Ideology
0,roberts,FEDERAL,2008,2008_FEDERAL6502.wav,1
1,goldstein,JUSTICE,2012,2012_JUSTICE15772.wav,0
2,ginsburg,ISSUE,2008,2008_ISSUE3553.wav,0
3,olson,GOVERNMENT,2007,2007_GOVERNMENT3580.wav,1
4,breyer,ARGUMENT,2011,2011_ARGUMENT6159.wav,0


In [5]:
# Words of interest, upper-cased to match the Word column of `records`.
used_words = [
    word.upper()
    for word in ('Justice', 'Honor', 'Federal', 'Congress', 'Government',
                 'Evidence', 'Argument', 'Issue', 'Science', 'Taxation')
]

# One DataFrame per word, each re-indexed from 0.
records_by_word = {}
for used_word in used_words:
    matching = records[records.Word == used_word]
    records_by_word[used_word] = matching.reset_index(drop = True)

We do not train on SCIENCE and TAXATION; these two words are held out entirely:

In [6]:
# Every word except the last two in `used_words` is available for training.
train_used_words = used_words[:len(used_words) - 2]
train_used_words

['JUSTICE',
 'HONOR',
 'FEDERAL',
 'CONGRESS',
 'GOVERNMENT',
 'EVIDENCE',
 'ARGUMENT',
 'ISSUE']

We split the data word by word, keeping the last `valid_ratio` fraction of each training word's records for testing:

In [7]:
# Fraction of each training word's records that is set aside for testing.
valid_ratio = 0.1

records_by_word_train = {}
records_by_word_test = {}

for used_word in used_words:
    recs = records_by_word[used_word]

    # Held-out words contribute all of their records to the test side.
    if used_word not in train_used_words:
        records_by_word_test[used_word] = recs
        continue

    # Training words: the leading (1 - valid_ratio) share goes to train,
    # the remainder to test; both parts are re-indexed from 0.
    num_occur = len(recs.index)
    split_index = int(num_occur * (1-valid_ratio))
    head_part = recs.iloc[:split_index]
    tail_part = recs.iloc[split_index:]
    records_by_word_train[used_word] = head_part.reset_index(drop = True)
    records_by_word_test[used_word] = tail_part.reset_index(drop = True)

We also build a global, shuffled training set and a global test set:

In [8]:
# Global training set: all per-word training splits, shuffled.
records_train = pd.concat(records_by_word_train.values())
# The permutation must index records_train itself. Indexing the full
# `records` frame here would pull in rows that belong to the test splits
# and to the held-out words.
records_train = records_train.iloc[np.random.permutation(len(records_train))]
records_train = records_train.reset_index().drop('index', axis = 1)
# Global test set: all per-word test splits plus the held-out words.
records_test = pd.concat(records_by_word_test.values())
records_test = records_test.reset_index().drop('index', axis = 1)
records_train.head()

Unnamed: 0,Speaker,Word,Year,Filename,Ideology
0,waxman,FEDERAL,2004,2004_FEDERAL3311.wav,0
1,waxman,CONGRESS,2010,2010_CONGRESS7720.wav,0
2,long,JUSTICE,2009,2009_JUSTICE10047.wav,0
3,kennedy,ARGUMENT,2012,2012_ARGUMENT6968.wav,1
4,katsas,JUSTICE,2008,2008_JUSTICE8547.wav,1


## 3. Training a custom classifier over the full training set

In [9]:
# Instantiate the WaveNet classifier from network_modules with the
# hyperparameters used for this experiment.
classifier = wn.WaveNet(
    layer_size = 8,
    stack_size = 2,
    in_channels = 1,
    res_channels = 4,
    skip_channels = 16,
    pooling_kernel = 2,
    classes = 2,
)

In [10]:
def trainWaveNet(classifier, data, valid_data, num_epochs = 10, 
          batch_size = 32, verbose = False, gpu = False):
    """
    Trains classifier on data with Adam and reports validation accuracy.

    Each epoch walks over data in consecutive minibatches obtained from
    get_minibatch, minimising wn.WaveNetLoss, and then evaluates the
    classifier on valid_data with get_accuracy.

    Parameters:
        classifier: model returning [class_probs, pred_probs] when called on
                    a (batch, 1, n_samples) tensor.
        data:       training DataFrame (Word, Filename, Ideology columns).
        valid_data: DataFrame used for the accuracy reports.
        num_epochs: number of passes over data.
        batch_size: maximum number of examples per optimisation step.
        verbose:    currently unused; kept for interface compatibility.
        gpu:        if True and CUDA is available, train and evaluate on GPU.

    Returns:
        The trained classifier, moved to the CPU.
    """
    ##########################################
    ############### INITIALIZING #############
    ##########################################
    N = len(data.index)
    # Ceiling division: int(N / batch_size) + 1 would schedule one batch too
    # many whenever N is a multiple of batch_size, and that batch's start
    # offset (== N) is rejected by get_minibatch.
    batches_per_epoch = (N + batch_size - 1) // batch_size

    use_cuda = gpu and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
        classifier = classifier.cuda()
    else:
        classifier = classifier.cpu()

    print(" Computing initial accuracy on validation set...")
    acc = get_accuracy(valid_data, classifier, verbose = False, gpu = gpu)
    print(" Initial accuracy: {}".format(acc))

    # Built after the device move so the optimizer tracks the parameters
    # that are actually trained.
    optimizer = torch.optim.Adam(
                            classifier.parameters(),
                            lr = 0.1,
                            betas = (0.9, 0.999),
                            eps = 1e-08,
                            weight_decay = 0.01)
    
    ##########################################
    ########### TRAINING ITERATION ###########
    ##########################################
    
    for epoch in range(num_epochs):
        classifier.train()
        print("Epoch: {}/{}".format(epoch + 1, num_epochs))
        for batch in range(batches_per_epoch):
            
            ##############################
            ##### PREPARE DATA BATCH #####
            ##############################
            # Advance through the data set: batch k starts at row
            # k * batch_size, so every example is visited once per epoch.
            seed = batch * batch_size
            X_batch, y_batch = get_minibatch(data, 
                                             batch_size = batch_size, 
                                             seed = seed)
            # A one-example batch may come back as a bare 1-D waveform;
            # promote it so shape[0] is the number of examples.
            X_batch = np.atleast_2d(X_batch)
            current_batch_size = X_batch.shape[0]

            X_batch = torch.tensor(X_batch, requires_grad = False, 
                                   dtype = torch.float)
            y_batch = torch.tensor(y_batch, requires_grad = False, 
                                   dtype = torch.long)
            
            if use_cuda:
                X_batch = X_batch.cuda()
                y_batch = y_batch.cuda()
                torch.cuda.empty_cache()
            #################################
            ####### OPTIMIZATION STEP #######
            #################################
            optimizer.zero_grad()
            [class_probs, pred_probs] = classifier(X_batch.view(current_batch_size, 1, -1))
            loss = wn.WaveNetLoss(
                audios = X_batch,
                labels = y_batch,
                class_probs = class_probs,
                pred_probs = pred_probs,
                pred_channels = 16
                )
            loss.backward()
            optimizer.step()
            
            if batch % 10 == 0:
                print("  Batch: {}/{}, Loss: {}".format(batch+1, batches_per_epoch, loss))
        
        # End of epoch stuff
        print(" Computing accuracy on test set...")
        # Evaluate on the training device. Calling classifier.cpu() here
        # would move the model in place and break the next epoch whenever
        # the batches live on the GPU.
        acc = get_accuracy(valid_data, classifier, verbose = False, gpu = gpu)
        print(" Accuracy: {}".format(acc))
    return classifier.cpu()

In [None]:
# Validate on the two held-out words only, then train on the global
# training set (CPU, 10 epochs, minibatches of 16).
valid_data = pd.concat(
    [records_by_word_test[word] for word in ("SCIENCE", "TAXATION")]
)
classifier = trainWaveNet(
    classifier,
    records_train,
    valid_data,
    num_epochs = 10,
    batch_size = 16,
    gpu = False,
)