In [1]:
import torchaudio
import torch
import torch.nn.functional as F
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
import sys
sys.path.append('..')

from models.lit_wav2vec2 import LitWav2Vec2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = LitWav2Vec2('large_lv60k')

In [3]:
path = '/media/geoff/datasets/soapies_balanced_corpora/cs_engzul_balanced/lang_targs_mult/cs_engzul_trn.pkl'
df = pd.read_pickle(path)
df.head()

Unnamed: 0,audio_fpath,tgts
0,/media/geoff/datasets/soapies_balanced_corpora...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,/media/geoff/datasets/soapies_balanced_corpora...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,/media/geoff/datasets/soapies_balanced_corpora...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,/media/geoff/datasets/soapies_balanced_corpora...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,/media/geoff/datasets/soapies_balanced_corpora...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [4]:
audio_path = df.iloc[8000]['audio_fpath']
tgts1 = torch.tensor(df.iloc[8000]['tgts'])
waveform1, sample_rate = torchaudio.load(audio_path)

audio_path = df.iloc[8001]['audio_fpath']
tgts2 = torch.tensor(df.iloc[8001]['tgts'])
waveform2, sample_rate = torchaudio.load(audio_path)

waveforms = pad_sequence([waveform1.reshape(-1, 1), waveform2.reshape(-1, 1)], batch_first=True)
ll = torch.tensor([waveform1.size(1), waveform2.size(1)])
targets = pad_sequence([tgts1.reshape(-1, 1), tgts2.reshape(-1, 1,)], padding_value=0, batch_first=True).squeeze()

In [5]:
y_hat, lengths = model.forward(waveforms.squeeze(), ll)

### Cross Entropy

In [11]:
def interp_targets(targets, max_length):
    targets_ = targets.reshape(targets.size(0), 1, 1, targets.size(1)).to(torch.float32)
    interp_legnths = (1, max_length)
    ds_targets = F.interpolate(targets_, interp_legnths).squeeze()
    return ds_targets.to(torch.long)

In [12]:
targets = interp_targets(targets, torch.max(lengths))
F.cross_entropy(y_hat.view(-1, 3), targets.view(-1))

tensor(1.0835, grad_fn=<NllLossBackward0>)

### CTC

In [54]:
ds_factor = 335

In [58]:
# Hacky way to use torch's vision interpolation stuff

alpha = 0.49 # Shrink factor to get ctc to work

tgts = torch.tensor(tgts1, dtype=torch.float32).reshape(1, 1, 1, -1)
ds_size1 = int(tgts.size(-1)/ds_factor*alpha)
interp_legnths1 = (1,  ds_size1)
interp_tgts1 = F.interpolate(tgts, interp_legnths1).squeeze()

tgts = torch.tensor(tgts2, dtype=torch.float32).reshape(1, 1, 1, -1)
ds_size2 = int(tgts.size(-1)/ds_factor*alpha)
interp_legnths2 = (1,  ds_size2)
interp_tgts2 = F.interpolate(tgts, interp_legnths2).squeeze()

target = pad_sequence([interp_tgts1.reshape(-1, 1), interp_tgts2.reshape(-1, 1)], batch_first=True, padding_value=1).squeeze()
target_lengths = torch.stack([torch.tensor(ds_size1), torch.tensor(ds_size2)])

In [65]:
target = target.to(torch.long)
input = F.log_softmax(y_hat, dim=-1)
N, T, C = input.shape
input = input.view(((T, N, C)))

In [60]:
#loss = ctc_loss(input, target, input_lengths, target_lengths)
ctc_loss = torch.nn.CTCLoss(blank=0, zero_infinity=True)
ctc_loss(input, target, lengths, target_lengths)

tensor(1.7792, grad_fn=<MeanBackward0>)

In [10]:
T = 48
C = 3
N = 2

S = 30
S_min = 29

input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long)

ctc_loss = torch.nn.CTCLoss(blank=0, zero_infinity=False)

print(loss)

### Train

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from utils.datasets import CSDataset, collator
from torch.utils.data import DataLoader
from models.lit_cs_detector import LitCSDetector

path = '/media/geoff/datasets/soapies_balanced_corpora/cs_engzul_balanced/lang_targs_mult/cs_engzul_trn.pkl'
df_trn = pd.read_pickle(path)
path = '/media/geoff/datasets/soapies_balanced_corpora/cs_engzul_balanced/lang_targs_mult/cs_engzul_dev.pkl'
df_dev = pd.read_pickle(path)
path = '/media/geoff/datasets/soapies_balanced_corpora/cs_engzul_balanced/lang_targs_mult/cs_engzul_tst.pkl'
df_tst = pd.read_pickle(path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BS_SIZE = 3
df_trn.tgts = df_trn.tgts-1
df_dev.tgts = df_dev.tgts-1
dataset_trn = CSDataset(df_trn)
dataset_dev = CSDataset(df_dev)

train_dataloader = DataLoader(dataset_trn, batch_size=BS_SIZE, shuffle=True, collate_fn=collator, num_workers=12)
dev_dataloader = DataLoader(dataset_dev, batch_size=BS_SIZE, collate_fn=collator, num_workers=12)

In [3]:
model = LitCSDetector("base")

In [4]:
# Callbacks
learning_rate_callback = LearningRateMonitor(logging_interval='step') 
checkpoint_callback = ModelCheckpoint(monitor='val/val_acc',
                                        filename='{epoch}-{val/val_loss:.2f}-{val/val_auc:.2f}',
                                        save_on_train_epoch_end=False,
                                        auto_insert_metric_name=False,
                                        save_last=True,
                                        mode='max'
                                        )
callbacks = [learning_rate_callback, checkpoint_callback]

# Logger                               
tb_logger = pl_loggers.TensorBoardLogger(save_dir="../logs/")
trainer = pl.Trainer(logger=tb_logger, callbacks=callbacks, max_epochs=32, gpus=1, gradient_clip_val=0.5, accumulate_grad_batches=8, log_every_n_steps=100, precision=16)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
lr_finder = trainer.tuner.lr_find(model, train_dataloader, dev_dataloader)
fig = lr_finder.plot(suggest=True)
fig.show()

In [5]:
model.hparams.learning_rate = 5e-5

In [6]:
trainer.fit(model, train_dataloader, dev_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type          | Params
----------------------------------------------------
0 | feature_extractor | Wav2Vec2Model | 94.4 M
1 | head              | Sequential    | 3.2 M 
----------------------------------------------------
97.5 M    Trainable params
0         Non-trainable params
97.5 M    Total params
195.058   Total estimated model params size (MB)


Epoch 0:   1%|          | 30/2869 [00:05<09:02,  5.23it/s, loss=0.738, v_num=1] 