# Custom-made LSTM model
Requires PyTorch 0.4.0

In [None]:
import librosa
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os

In [None]:
WORKDIR = "/data/WorkData/ideology_from_audio/RESULTS/"

## 1. Adding ideologies to the raw waveform database

In [None]:
def get_last_name(name):
    """
    Returns the lower-cased last name found in a space-separated full name.

    A trailing generational suffix ('jr', 'iii' or 'ii', compared
    case-insensitively and without punctuation) is skipped, so the token
    before it is returned instead. If the suffix is the only token,
    "noname" is returned.
    """
    suffixes = ('jr', 'iii', 'ii')
    tokens = name.split(" ")
    final_token = tokens[-1].lower()

    # Common case: the last token is the surname itself.
    if final_token not in suffixes:
        return final_token

    # The name consists of nothing but a suffix.
    if len(tokens) == 1:
        return "noname"

    return tokens[-2].lower()

def get_ideology(last_name):
    """
    Returns the mean of the 'ideology' column over all rows of the global
    `ideologies` dataframe whose 'last' column equals last_name.
    """
    same_last_name = ideologies['last'] == last_name
    return ideologies[same_last_name].ideology.mean()

In [None]:
def _to_binary_label(score):
    # 1 when the numeric score is strictly above 0.5, otherwise 0.
    return int(float(score) > 0.5)

# Load the ideology table and discard the columns that are not used below.
ideologies = pd.read_csv(WORKDIR + "ideologyfinal.txt").drop(['Unnamed: 0', 'first'], axis = 1)
# Remove the rows whose ideology is the literal string 'undefined',
# then renumber the remaining rows from 0.
has_score = ideologies.ideology != 'undefined'
ideologies = ideologies[has_score].reset_index().drop('index', axis = 1)
# Turn the remaining scores into 0/1 labels.
ideologies['ideology'] = ideologies['ideology'].apply(_to_binary_label)
ideologies.head()

In [None]:
# The code that produced this file was lost: it matched each ideology to
# a name and shuffled the whole dataframe.
raw_records = pd.read_csv(WORKDIR + "final_raw_wave_ideology.csv")
# 'Unnamed: 0' is presumably the row index stored by an earlier to_csv call.
records = raw_records.drop('Unnamed: 0', axis = 1)
records.head()

In [None]:
# Persist the records; to_csv also writes the row index by default.
records_csv_path = WORKDIR + 'final_raw_wave_ideology.csv'
records.to_csv(records_csv_path)

Grouping the records by word:

In [None]:
# Words are compared against records.Word in upper case.
used_words = [word.upper() for word in ('Justice', 'Honor', 'Federal', 'Congress',
                                        'Government', 'Evidence', 'Argument', 'Issue',
                                        'Science', 'Taxation')]
records_by_word = {}

for used_word in used_words:
    # Keep only the rows for this word and renumber them from 0.
    word_rows = records[records.Word == used_word]
    records_by_word[used_word] = word_rows.reset_index().drop('index', axis = 1)

## 2. Defining training and test sets

Not training on SCIENCE and TAXATION:

In [None]:
# The held-out words are the last entries of used_words, so the training
# vocabulary is everything before them.
num_held_out_words = 2
train_used_words = used_words[:-num_held_out_words]
train_used_words

We split the records into training and test sets word by word:

In [None]:
# Fraction of each training word's records that is kept aside for testing.
valid_ratio = 0.1

records_by_word_train = {}
records_by_word_test = {}

for used_word in used_words:
    word_records = records_by_word[used_word]

    if used_word not in train_used_words:
        # Words excluded from training go to the test side in full.
        records_by_word_test[used_word] = word_records
        continue

    # The first (1 - valid_ratio) share of the rows is used for training,
    # the remainder for testing; both parts are renumbered from 0.
    cut = int(len(word_records.index) * (1-valid_ratio))
    train_part = word_records.iloc[:cut]
    test_part = word_records.iloc[cut:]
    records_by_word_train[used_word] = train_part.reset_index().drop('index', axis = 1)
    records_by_word_test[used_word] = test_part.reset_index().drop('index', axis = 1)

We also get a global training set.

In [None]:
# Global training set: every per-word training frame, rows in random order.
records_train = pd.concat(records_by_word_train.values())
shuffled_positions = np.random.permutation(len(records_train))
records_train = records_train.iloc[shuffled_positions].reset_index().drop('index', axis = 1)

# Global test set: the per-word test frames concatenated, not shuffled.
records_test = pd.concat(records_by_word_test.values()).reset_index().drop('index', axis = 1)

In [None]:
# Show the first five rows of the shuffled training set.
records_train.head(5)

## 3. Dataloading scripts and auxiliary methods

In [None]:
def vstack_with_padding(a,b):
    """
    Stacks a on top of b, zero-padding the narrower one on the right so
    that both have the same number of columns.

    One-dimensional inputs are treated as a single row.
    """
    top = a.reshape(1,-1) if a.ndim == 1 else a
    bottom = b.reshape(1,-1) if b.ndim == 1 else b

    # Positive gap: the bottom block is narrower; negative: the top one is.
    width_gap = top.shape[1] - bottom.shape[1]
    if width_gap > 0:
        filler = np.zeros((bottom.shape[0], width_gap))
        bottom = np.hstack((bottom, filler))
    elif width_gap < 0:
        filler = np.zeros((top.shape[0], -width_gap))
        top = np.hstack((top, filler))

    return np.vstack((top, bottom))

def get_minibatch(data_df, batch_size = 32, seed = 0):
    """ 
    Loads up to batch_size consecutive rows of data_df, starting at row
    position seed, and returns (X_batch, y_batch).

    Each row's audio file is read with librosa from
    WORKDIR/WordAudio/<Word>/<Filename>. The waveforms are stacked one per
    row and zero-padded on the right to the longest one. When the batch
    holds a single row, X_batch is that waveform as a 1-D array.
    y_batch is a numpy array with the rows' Ideology values.
    """
    N = len(data_df.index)
    assert seed < N, "seed out of bounds"

    # The final batch of a dataframe may be shorter than batch_size.
    stop = min(seed + batch_size, N)
    X_batch = None
    labels = []
    for position in range(seed, stop):
        record = data_df.iloc[position]
        audio_path = WORKDIR + "/WordAudio/" + record.Word + "/" + record.Filename
        waveform, _ = librosa.load(audio_path)
        if X_batch is not None:
            X_batch = vstack_with_padding(X_batch, waveform)
        else:
            X_batch = waveform
        labels.append(record.Ideology)
    return X_batch, np.array(labels)

def get_accuracy(data_df, classifier, threshold = 0.5, verbose = True, gpu = False):
    """
    Computes accuracy of classifier over examples in data_df

    Arguments:
        data_df: dataframe with Word, Filename and Ideology columns; each
            row's audio is read from WORKDIR/WordAudio/<Word>/<Filename>.
        classifier: model called on a (1, 1, num_samples) float tensor. Its
            first output row is read as one score per class, in the same
            column order that F.cross_entropy uses during training.
        threshold: minimum softmax probability of class 1 for predicting 1.
        verbose: print progress every 10 examples.
        gpu: run on CUDA when it is available.

    Returns:
        Fraction of rows whose prediction equals row.Ideology, as a float;
        0.0 when data_df is empty.
    """
    N = len(data_df.index)
    if N == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    use_cuda = gpu and torch.cuda.is_available()
    if use_cuda:
        classifier = classifier.cuda()

    # Evaluate in eval mode and put the previous mode back afterwards, so
    # calling this in the middle of training does not change the model.
    was_training = classifier.training
    classifier.eval()
    correct = 0
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for i in range(N):
                if verbose and i % 10 == 0:
                    print(" {}/{}".format(i,N))
                row = data_df.iloc[i]
                used_word = row.Word
                filename = row.Filename
                path = WORKDIR + "/WordAudio/" + used_word + "/" + filename
                waveform, sample_rate = librosa.load(path)
                inp = torch.Tensor(waveform).view(1,1,-1)
                if use_cuda:
                    inp = inp.cuda()
                scores = classifier(inp)[0]
                # Column 1 holds the score of class 1; softmax turns the
                # scores into probabilities comparable with threshold.
                prob_class_1 = F.softmax(scores, dim = -1)[1].item()
                y_pred = 1 if prob_class_1 >= threshold else 0
                if y_pred == row.Ideology:
                    correct += 1
    finally:
        classifier.train(was_training)

    return float(correct) / N

## 4. Training a custom classifier over the full training set

In [None]:
from conv_lstm_classifier2 import ConvLSTM
%load_ext autoreload
%autoreload 1

In [None]:
# Training on the full training set
# Hyper-parameters passed to the ConvLSTM constructor as keyword arguments.
classifier_config = dict(
    conv_kernel_size = 5,
    conv_stride = 3,
    num_features = 32,
    pooling_kernel = 1,
    hidden_size = 512,
    num_layers = 2,
    num_of_classes = 2,
    bias = False,
)
classifier = ConvLSTM(**classifier_config)

In [None]:
def train(classifier, data, valid_data, num_epochs = 10, 
          batch_size = 32, verbose = False, gpu = False, print_accuracy = False):
    """
    Trains classifier on data with Adam and a cross-entropy loss.

    Arguments:
        classifier: model to optimize. It is called on a float tensor
            viewed as (batch, 1, num_samples) and its output is passed to
            F.cross_entropy together with the integer labels.
        data: training dataframe, read batch by batch with get_minibatch.
        valid_data: dataframe passed to get_accuracy when print_accuracy
            is set.
        num_epochs: number of passes over data.
        batch_size: number of rows per optimization step.
        verbose: accepted for backward compatibility; currently unused.
        gpu: train on CUDA when it is available.
        print_accuracy: evaluate on valid_data before training and after
            every epoch, and keep the parameters of the best epoch.

    Returns:
        The classifier, moved to the CPU. With print_accuracy set it holds
        the parameters of the epoch with the highest accuracy on
        valid_data; otherwise it holds the parameters after the last epoch.
    """
    use_cuda = gpu and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
        classifier = classifier.cuda()
    else:
        classifier = classifier.cpu()

    if print_accuracy:
        print(" Computing initial accuracy on validation set...")
        acc = get_accuracy(valid_data, classifier, verbose = False, gpu = gpu)
        print(" Initial accuracy: {}".format(acc))

    optimizer = torch.optim.Adam(
                            classifier.parameters(),
                            betas = (0.9, 0.999),
                            eps = 1e-08,
                            weight_decay = 0.01)

    N = len(data.index)
    # Start row of every batch. Using a range gives exactly
    # ceil(N / batch_size) batches, so no batch starts past the end of
    # data when N is a multiple of batch_size.
    batch_starts = range(0, N, batch_size)
    batches_per_epoch = len(batch_starts)
    best_acc = float("-inf")
    best_params = None
    for epoch in range(num_epochs):
        print("Epoch: {}/{}".format(epoch + 1, num_epochs))
        # seed advances with every batch so each epoch covers all of data.
        for batch, seed in enumerate(batch_starts):
            if batch % 50 == 0:
                print("  Batch: {}/{}".format(batch + 1, batches_per_epoch))
            # Prepare the data
            X_batch, y_batch = get_minibatch(data, batch_size = batch_size, 
                                             seed = seed)
            # A one-row batch comes back as a 1-D waveform, so the number
            # of examples is taken from the labels, not from X_batch.shape.
            current_batch_size = len(y_batch)
            X_batch = torch.tensor(X_batch, requires_grad = False,  dtype = torch.float)
            # cross_entropy requires integer (long) class indices.
            y_batch = torch.tensor(y_batch, requires_grad = False, dtype = torch.long)

            if use_cuda:
                X_batch = X_batch.cuda()
                y_batch = y_batch.cuda()
                torch.cuda.empty_cache()

            # Making an optimization step
            optimizer.zero_grad()
            output = classifier(X_batch.view(current_batch_size, 1, -1))
            loss = F.cross_entropy(output, y_batch)
            loss.backward()
            optimizer.step()

        # End of epoch stuff
        if print_accuracy:
            print(" Computing accuracy on given test set...")
            acc = get_accuracy(valid_data, classifier, verbose = False, gpu = gpu)
            if acc > best_acc:
                best_acc = acc
                # state_dict() returns references to the live parameters,
                # so they are copied to obtain a snapshot that later
                # optimizer steps cannot overwrite.
                best_params = {name: tensor.detach().cpu().clone()
                               for name, tensor in classifier.state_dict().items()}
            print(" Accuracy: {}".format(acc))

    # Without validation there is no snapshot; keep the final parameters.
    if best_params is not None:
        classifier.load_state_dict(best_params)
    return classifier.cpu()

In [None]:
#torch.cuda.set_device(2)
# The words excluded from training are used for the per-epoch accuracy check.
held_out_frames = [records_by_word_test[word] for word in ("SCIENCE", "TAXATION")]
valid_data = pd.concat(held_out_frames)
classifier = train(
    classifier,
    records_train,
    valid_data,
    num_epochs = 10,
    batch_size = 16,
    gpu = True,
    print_accuracy = True,
)