# Profiling
### Key results
* v2 slower than v1 due to `torch.cat` operation. 
* Local CPU vs brahe CPU: on the local CPU the v2_script version is slower than v2. This difference disappears on brahe (although brahe's max memory usage is lower)
* GPU: 

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # set this before importing torch
import torch 
import time

In [None]:
from myrtlespeech.model.hard_lstm import HardLSTM as HardLSTM_ver2
from myrtlespeech.model.hard_lstm2 import HardLSTM as HardLSTM_ver3

from hard import HardLSTM as HardLSTM_ver1

from deepspeech_int import HardLSTM as HardLSTM_dsi
from matplotlib import pyplot as plt
from typing import List
from copy import copy

In [None]:
def gen_args(in_size, hidden, seq_len, num_layers, bidirectional, batch, gpu=False):
    """Build a random input and an all-zero initial state for one forward call.

    Args:
        in_size: Size of the last dimension of the input tensor.
        hidden: Size of the last dimension of the state tensors.
        seq_len: Size of the first dimension of the input tensor.
        num_layers: Multiplied by the number of directions to give the first
            dimension of the state tensors.
        bidirectional: If true, two directions are assumed, otherwise one.
        batch: Size of the second dimension of the input and state tensors.
        gpu: If true, the tensors are moved to the GPU with ``.cuda()``.

    Returns:
        ``(x, (state, state))`` where ``x`` has shape
        ``(seq_len, batch, in_size)`` and ``state`` has shape
        ``(num_layers * num_directions, batch, hidden)``. The same zero
        tensor object is used for both entries of the state tuple.
    """
    inputs = torch.randn(seq_len, batch, in_size)
    state_rows = num_layers * (2 if bidirectional else 1)
    state = torch.zeros(state_rows, batch, hidden, dtype=inputs.dtype)
    if gpu:
        inputs, state = inputs.cuda(), state.cuda()
    return (inputs, (state, state))

### Check everything runs

In [None]:
# Small configuration used only to check that every variant runs end to end.
input_size, hidden, seq_len = 100, 128, 35
num_layers, bidirectional, batch = 1, True, 3

args = gen_args(input_size, hidden, seq_len, num_layers, bidirectional, batch)

# Every model below is built with the same keyword arguments.
lstm_kwargs = dict(
    input_size=input_size,
    hidden_size=hidden,
    batch_first=False,
    bidirectional=bidirectional,
)

lstm_v1 = HardLSTM_ver1(**lstm_kwargs)
lstm_v2 = HardLSTM_ver2(**lstm_kwargs)
lstm_v2_script = torch.jit.script(HardLSTM_ver2(**lstm_kwargs))
lstm_v3 = HardLSTM_ver3(**lstm_kwargs)
lstm_v3_script = torch.jit.script(HardLSTM_ver3(**lstm_kwargs))

lstm = torch.nn.LSTM(**lstm_kwargs)

# One forward pass per model on the same arguments.
outputs_v1 = lstm_v1(*args)
outputs_v2 = lstm_v2(*args)
outputs_v2_script = lstm_v2_script(*args)
outputs_v3 = lstm_v3(*args)
outputs_v3_script = lstm_v3_script(*args)
outputs_n = lstm(*args)

In [None]:
# Print output / hidden / cell shapes: torch.nn.LSTM first, then scripted v3.
for model_outputs in (outputs_n, outputs_v3_script):
    y, (h, c) = model_outputs
    print(y.shape, h.shape, c.shape)


In [None]:
# Axis label for each position in ``dims``.
idx_to_name = {
    0: "Input Size",
    1: "Hidden Size",
    2: "Sequence Length",
    3: "Number Layers",
    4: "bidirectional",
    5: "Batch Size",
}


def _build_lstm(lstm_constr, dims_in, batch_first, gpu):
    """Construct one model for ``dims_in`` and put it on the requested device."""
    lstm = lstm_constr(dims_in[0], dims_in[1], batch_first=batch_first, bidirectional=dims_in[4])
    # A constructor may already have moved the model to the GPU, so move it
    # explicitly in both directions to keep it on the same device as the inputs.
    return lstm.cuda() if gpu else lstm.cpu()


def _timed_forward(lstm, args, gpu):
    """Return the wall-clock seconds taken by one forward pass."""
    if gpu:
        # Drain previously queued kernels so they are not billed to this run.
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    lstm(*args)
    if gpu:
        # CUDA kernels launch asynchronously; wait for them to finish,
        # otherwise only the launch overhead would be measured.
        torch.cuda.synchronize()
    return time.perf_counter() - t0


def profile_and_plot(models, dims, construct_each_time = False, batch_first=False, gpu=False):
    """Time one forward pass per model while sweeping a single dimension.

    One and only one entry of ``dims`` is a list; all others are constants.

    dims = (in_size, hidden, seq_len, num_layers, bidirectional, batch)

    Args:
        models: Mapping from a display name to a constructor called as
            ``constructor(in_size, hidden, batch_first=..., bidirectional=...)``.
        dims: Sequence (list or tuple) of the six values above. ``num_layers``
            is only forwarded to ``gen_args``; it is not passed to the
            constructors.
        construct_each_time: If true, a fresh model is built (and warmed up)
            for every swept value. If false, each model is built once from the
            first swept value and reused, which only makes sense when the
            swept dimension does not change the model itself.
        batch_first: Forwarded to the constructors only; the inputs come from
            ``gen_args`` unchanged.
        gpu: If true, models and inputs are placed on the GPU and the timing
            is bracketed by ``torch.cuda.synchronize()``.

    Returns:
        Dict mapping each model name to a list of ``(value, seconds)`` pairs.

    Raises:
        AssertionError: If ``dims`` holds no list or more than one list.
        ValueError: If the swept list is empty.
    """
    list_positions = [idx for idx, dim in enumerate(dims) if isinstance(dim, list)]
    assert len(list_positions) <= 1, "Only one List can be present"
    assert list_positions, "There must be a List present"
    list_idx = list_positions[0]

    values = dims[list_idx]
    if not values:
        raise ValueError("The swept dimension must contain at least one value")
    results = {name: [] for name in models}

    def dims_for(value):
        # list(...) rather than copy(...) so that a tuple ``dims`` also works.
        dims_in = list(dims)
        dims_in[list_idx] = value
        return dims_in

    if not construct_each_time:
        lstms = {}
        first_dims = dims_for(values[0])
        for name, lstm_constr in models.items():
            lstm = _build_lstm(lstm_constr, first_dims, batch_first, gpu)
            # warmup
            lstm(*gen_args(*first_dims, gpu=gpu))
            lstms[name] = lstm

    for value in values:
        dims_in = dims_for(value)
        args = gen_args(*dims_in, gpu=gpu)

        for name, lstm_constr in models.items():
            if construct_each_time:
                lstm = _build_lstm(lstm_constr, dims_in, batch_first, gpu)
                # warmup
                lstm(*args)
            else:
                lstm = lstms[name]

            results[name].append((value, _timed_forward(lstm, args, gpu)))
            if construct_each_time:
                # Drop the reference before the next model is built.
                del lstm

    # plot
    for name, res in results.items():
        xs, ys = zip(*res)
        plt.plot(xs, ys, label=name)
    plt.xlabel(idx_to_name[list_idx])
    plt.ylabel("Time /s")
    plt.legend()
    plt.show()
    return results

In [None]:
def get_script_constructor(constructor):
    """Wrap ``constructor`` so that it returns a TorchScript-compiled model.

    The returned callable forwards all positional and keyword arguments to
    ``constructor``, moves the resulting model to the GPU with ``.cuda()``,
    and returns ``torch.jit.script`` applied to it.
    """

    def build_scripted(*args, **kwargs):
        return torch.jit.script(constructor(*args, **kwargs).cuda())

    return build_scripted

In [None]:
# Display name -> constructor for every model passed to profile_and_plot.
# Commented-out entries are excluded; only the scripted v3 model and
# torch.nn.LSTM are active.
models =  {#"Version1": HardLSTM_ver1, 
           #"Version2": HardLSTM_ver2, 
           #"Version2_scripted": get_script_constructor(HardLSTM_ver2),
           #"Version3": HardLSTM_ver3, 
           "Version3_scripted": get_script_constructor(HardLSTM_ver3),
           "PyTorch": torch.nn.LSTM}

# Seq length

In [None]:
# Sweep sequence length for a large bidirectional model (in/hidden 1024, batch 256).
in_size = 1024
hidden = 1024
seq_len = [1] + list(range(32, 1024, 32))  # the swept dimension
num_layers = 1
bidirectional = True
batch = 256

dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]


# CPU variant, currently disabled:
#results = profile_and_plot(models, dims, construct_each_time=False)
results = profile_and_plot(models, dims, construct_each_time=False, gpu=True)



In [None]:
# much smaller
in_size = 128
hidden = 128
seq_len = [1] + list(range(32, 1024, 32))
num_layers = 1
bidirectional = False
batch = 32

dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]


#results = profile_and_plot(models, dims, construct_each_time=False)
results = profile_and_plot(models, dims, construct_each_time=False, gpu=True)



In [None]:
# Bidirectional sequence-length sweep. in_size, hidden, num_layers and batch
# are not set here; they come from the notebook's current globals
# (presumably the previous cell - run that first).
seq_len = list(range(2, 750, 30))
bidirectional = True

dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]

results = profile_and_plot(models, dims, construct_each_time=False, gpu=True)

## Variation with batch

In [None]:
# Sweep batch size for a unidirectional model (in 100, hidden 256, seq_len 100).
in_size = 100
hidden = 256
seq_len = 100
num_layers = 1
bidirectional = False
batch = list(range(2, 512, 8))  # the swept dimension


dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]

# CPU run first, then GPU.
# NOTE(review): `results` from the CPU run is overwritten by the GPU run
# below; only its plot survives.
results = profile_and_plot(models, dims, construct_each_time=False)
results = profile_and_plot(models, dims, construct_each_time=False, gpu=True)


In [None]:
# Bidirectional batch-size sweep. in_size, hidden, seq_len and num_layers are
# not set here; they come from the notebook's current globals
# (presumably the previous cell - run that first).
bidirectional = True
batch = list(range(2, 252, 8))

dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]

# CPU run first, then GPU.
# NOTE(review): `results` from the CPU run is overwritten by the GPU run
# below; only its plot survives.
results = profile_and_plot(models, dims, construct_each_time=False)
results = profile_and_plot(models, dims, construct_each_time=False, gpu=True)

# In size

In [None]:
# Sweep input size for a unidirectional model (hidden 1024, seq_len 256, batch 128).
in_size = list(range(1, 2048, 32))  # the swept dimension
hidden = 1024
seq_len = 256
num_layers = 1
bidirectional = False
batch = 128

dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]

# The input size is a constructor argument, so a model is rebuilt per value.
results = profile_and_plot(models, dims, construct_each_time=True, gpu=True)


In [None]:
# Bidirectional input-size sweep. hidden, seq_len, num_layers and batch are
# not set here; they come from the notebook's current globals
# (presumably the previous cell - run that first).
bidirectional = True
in_size = list(range(1, 1024, 32))


dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]

# The input size is a constructor argument, so a model is rebuilt per value.
results = profile_and_plot(models, dims, construct_each_time=True, gpu=True)

# Hidden

In [None]:
# Sweep hidden size for a unidirectional model (in 1024, seq_len 256, batch 128).
in_size = 1024
hidden = list(range(2, 2048, 32))  # the swept dimension
num_layers = 1
bidirectional = False
batch = 128
seq_len = 256

dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]

# The hidden size is a constructor argument, so a model is rebuilt per value.
results = profile_and_plot(models, dims, construct_each_time=True, gpu=True)


In [None]:
# Bidirectional hidden-size sweep. in_size, seq_len, num_layers and batch are
# not set here; they come from the notebook's current globals
# (presumably the previous cell - run that first).
bidirectional = True
hidden = list(range(2, 1024, 32))

dims = [in_size, hidden, seq_len, num_layers, bidirectional, batch]


# CPU run first, then GPU.
# NOTE(review): `results` from the CPU run is overwritten by the GPU run
# below; only its plot survives.
results = profile_and_plot(models, dims, construct_each_time=True)
results = profile_and_plot(models, dims, construct_each_time=True, gpu=True)

### timeit

In [None]:
# Shared inputs for the %timeit cells below: a normally distributed input and
# a (h, c) state pair, all on the CUDA device.
device = torch.device("cuda")
hidden_size = 1024
batch = 128
seq_len = 256
state_shape = (1, batch, hidden_size)
x = torch.empty((seq_len, batch, hidden_size)).normal_().to(device)
state = (
    torch.empty(state_shape).normal_().to(device),
    torch.empty(state_shape).normal_().to(device),
)


In [None]:
lstm_v3 = models['Version3_scripted'](1024, 1024, bidirectional=False).to(device)
x = torch.empty((seq_len, batch, hidden_size)).normal_().to(device)
state = (torch.empty((1, batch, hidden_size)).normal_().to(device), torch.empty((1, batch, hidden_size)).normal_().to(device))
lstm_v3(x, state)
%timeit -n 100 lstm_v3(x, state)

In [None]:
lstm_v2 = models['Version1'](1024, 1024, bidirectional=False).to(device)

%timeit -n 100 lstm_v2(x, state)

In [None]:
lstm = models['PyTorch'](1024, 1024, bidirectional=False).to(device)

%timeit -n 100 lstm(x, state)

In [None]:
lstm_v3 = models['Version3_scripted'](1024, 1024, bidirectional=False).to(device)

%timeit -n 2 lstm_v3(x, state)

In [None]:
lstm = models['PyTorch'](1024, 1024, bidirectional=False).to(device)

%timeit -n 5 lstm(x, state)

In [None]:
# Build the torch.nn.LSTM baseline and display it as the cell output.
lstm = models['PyTorch'](1024, 1024, bidirectional=False).to(device)
lstm

# Profile version1 and version2 diff

In [None]:
import cProfile
from myrtlespeech.model.hard_lstm import HardLSTM as HardLSTM_ver2
from hard import HardLSTM as HardLSTM_ver1
import os
# set this before importing torch
# NOTE(review): the HardLSTM imports above presumably import torch already,
# so this assignment may run after torch is loaded - confirm the ordering.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch 
import time
def gen_args(in_size, hidden, seq_len, num_layers, bidirectional, batch, gpu=False):
    """Build a random input and an all-zero initial state for one forward call.

    Args:
        in_size: Size of the last dimension of the input tensor.
        hidden: Size of the last dimension of the state tensors.
        seq_len: Size of the first dimension of the input tensor.
        num_layers: Multiplied by the number of directions to give the first
            dimension of the state tensors.
        bidirectional: If true, two directions are assumed, otherwise one.
        batch: Size of the second dimension of the input and state tensors.
        gpu: If true, the tensors are moved to the GPU with ``.cuda()``.

    Returns:
        ``(x, (state, state))`` where ``x`` has shape
        ``(seq_len, batch, in_size)`` and ``state`` has shape
        ``(num_layers * num_directions, batch, hidden)``. The same zero
        tensor object is used for both entries of the state tuple.
    """
    inputs = torch.randn(seq_len, batch, in_size)
    state_rows = num_layers * (2 if bidirectional else 1)
    state = torch.zeros(state_rows, batch, hidden, dtype=inputs.dtype)
    if gpu:
        inputs, state = inputs.cuda(), state.cuda()
    return (inputs, (state, state))

In [None]:
# Configuration and models shared by the cProfile cells below (CPU only).
in_size, hidden, seq_len = 100, 512, 100
num_layers, bidirectional, batch = 1, False, 300
dims = (in_size, hidden, seq_len, num_layers, bidirectional, batch)
args = gen_args(*dims)

# Every model is built from the same sizes and flags.
model_kwargs = dict(batch_first=False, bidirectional=bidirectional)

lstm_v1 = HardLSTM_ver1(in_size, hidden, **model_kwargs)
lstm_v2 = HardLSTM_ver2(in_size, hidden, **model_kwargs)
lstm_v2_script = torch.jit.script(HardLSTM_ver2(in_size, hidden, **model_kwargs))
lstm = torch.nn.LSTM(in_size, hidden, **model_kwargs)

In [None]:
# Display the v2 model as the cell output.
lstm_v2

In [None]:
# Run v1 once unprofiled, then profile a second forward pass.
lstm_v1(*args)
cProfile.run('lstm_v1(*args)')

In [None]:
# Run v2 once unprofiled, then profile a second forward pass.
lstm_v2(*args)
cProfile.run('lstm_v2(*args)')

In [None]:
# Run scripted v2 once unprofiled, then profile a second forward pass.
lstm_v2_script(*args)
cProfile.run('lstm_v2_script(*args)')

In [None]:
# Run torch.nn.LSTM once unprofiled, then profile a second forward pass.
lstm(*args)
cProfile.run('lstm(*args)')