## 1.1 Check GPU

In [None]:
# Query the attached GPU's name, driver version, and total memory, printed as CSV.
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

name, driver_version, memory.total [MiB]
Tesla V100-SXM2-16GB, 460.32.03, 16160 MiB


## 1.2 Connect to Google Drive

In [4]:
# Mount Google Drive at /content/gdrive.
# Colab-only: the google.colab package is not importable outside that environment.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## 1.3 Import libraries and define global variables

In [1]:
import sys
import numpy as np
import pandas as pd
import os
import time
import torch
import torchvision 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast

!pip install num2words
from num2words import num2words

!pip install python-levenshtein
import Levenshtein

!pip install torch-summary
import torchsummary

!pip install torchaudio

# Select the compute device: CUDA when torch reports it as available, otherwise CPU.
# `device` is a notebook-wide global read by the functions defined in later cells.
cuda = torch.cuda.is_available()
print(cuda, sys.version)  # log CUDA availability and the Python interpreter version
device = torch.device("cuda" if cuda else "cpu")
device  # bare last expression so the cell displays the selected device

True 3.7.10 (default, May  3 2021, 02:48:31) 
[GCC 7.5.0]


device(type='cuda')

## 2.1 Load and Preprocess Data

In [23]:
# Import code
from constant import LETTER_LIST, LABEL_LIST
from utils import *
from preprocess import preprocess, get_letter_vocab
from datasets import KnnwSpeechDataset, KnnwSpeakerDataset, KnnwDataset
from speech_model import *
from speaker_model import *
from training import LASSession, SpeakerRecSession
from training import thred_sched, LRSched_0arg, PlateauSched

/content/gdrive/MyDrive/IDL/Project
/content/gdrive/.shortcut-targets-by-id/1nFHCtUbxTfWD0vW9CiANn58b6Rg7JhIt/11785Project/sessions


In [6]:
# Load data
# DATA = path_to_data_files
# NOTE(review): DATA is not assigned in any visible cell (the line above is only a
# placeholder comment). It must be defined before this cell runs — confirm where it is set.
sub_data_path = os.path.join(DATA, "knnw_en_sub_labeled.csv")
# The subtitle table is semicolon-separated and its first row is the header.
sub_df = pd.read_table(sub_data_path, sep = ";", header=0)
audio_path = os.path.join(DATA, "log_spectrogram.npy")
# The stored array is transposed on load; the recorded output shows the
# resulting shape as (1370493, 129).
audio_data = np.load(audio_path).transpose()

# Sanity-check the shapes of both inputs.
print(sub_df.shape)
print(audio_data.shape)

(1393, 5)
(1370493, 129)


## 2.2 Dataset & Dataloader

In [None]:
# Run the project's preprocessing over the subtitle table.
# remove_music=True presumably drops music-only entries — confirm in preprocess.py.
processed_df = preprocess(sub_df, remove_music=True)

# Print the letter vocabulary computed from the "Processed Text" column.
print("Letter vocab:", get_letter_vocab(processed_df["Processed Text"]))

In [17]:
def get_loaders(dataset, audio_data, transcript_df, batch_size=32, split=0.15, seed=None):
    """Build the training and held-out DataLoaders for one dataset class.

    Args:
        dataset: Dataset class. It is instantiated as
            ``dataset(audio_data, df, total_frames=...)`` and must expose a
            ``collate`` attribute, which is used as each loader's ``collate_fn``.
        audio_data: Audio feature array shared by both splits; its length is
            passed to the dataset as ``total_frames``.
        transcript_df: Transcript table handed to ``random_split``.
        batch_size: Batch size used by both loaders.
        split: Forwarded to ``random_split`` (presumably the held-out
            fraction — confirm in utils).
        seed: Forwarded to ``random_split``.

    Returns:
        ``(train_loader, test_loader)``. Only the training loader shuffles.
    """
    n_frames = len(audio_data)
    train_df, test_df = random_split(transcript_df, split, seed)

    # Both datasets are built (train first) before any loader is created.
    train_set, test_set = (
        dataset(audio_data, part, total_frames=n_frames)
        for part in (train_df, test_df)
    )

    collate = dataset.collate
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate)
    return train_loader, test_loader

## Speech Recognition Training

In [19]:
def transfer_encoder_lstm(model):
    """Swap ``model.encoder.lstm`` for a freshly initialised LSTM and move the model to ``device``.

    The replacement is a 2-layer, bidirectional, batch-first ``nn.LSTM`` with
    ``input_size=129``, ``hidden_size=256`` and ``dropout=0.3``. The previous
    encoder LSTM object is dropped; no other attribute of ``model`` is touched.

    Args:
        model: Object exposing ``encoder.lstm`` and ``.to(device)``.

    Returns:
        The result of ``model.to(device)``.
    """
    lstm_config = dict(
        input_size=129,
        hidden_size=256,
        num_layers=2,
        dropout=0.3,
        bidirectional=True,
        batch_first=True,
    )
    fresh_lstm = nn.LSTM(**lstm_config)
    model.encoder.lstm = fresh_lstm.to(device)
    return model.to(device)

In [31]:
# Build train/validation loaders for the speech-recognition dataset.
# No seed is passed, so get_loaders forwards its default seed=None to the split.
train_loader, test_loader = get_loaders(KnnwSpeechDataset, audio_data, processed_df, batch_size=32, split=0.15)
# Configure the LAS training session stored under 'sessions/speech_session'.
speech_session = LASSession('sessions/speech_session',
                            # Model factory: take the model from the "best" checkpoint of the
                            # 'hw4p2' session and replace its encoder LSTM with a fresh one.
                            lambda: transfer_encoder_lstm(LASSession('hw4p2').load_checkpoint("best").model), # Transfer from existing HW4p2 model   
                            # Optimizer factory: AdamW over all model parameters.
                            lambda m: torch.optim.AdamW(m.parameters(), lr=1e-4),
                            # reduction='none' returns unreduced per-element losses
                            # (presumably masked/averaged inside the session — confirm).
                            nn.CrossEntropyLoss(reduction='none'),
                            train_data=train_loader,
                            val_data=test_loader,
                            use_amp=False,
                            # MultiStepLR multiplies the LR by 0.5 at each milestone in
                            # [250, 350, 450, 550] (scheduler steps; presumably epochs — confirm).
                            sched_factory=LRSched_0arg(
                                lambda op:
                                optim.lr_scheduler.MultiStepLR(op, [250, 350, 450, 550], 0.5, verbose=True)),
                            # tf_sched / af_sched map an epoch number to a value via thred_sched.
                            # NOTE(review): the argument semantics live in training.py
                            # ("tf" presumably means teacher forcing) — confirm.
                            tf_sched=lambda e: thred_sched(e, 300, 0.001, init=0.9, minval=0.7),
                            af_sched=lambda e: thred_sched(e, 80, 0.05, init=1, minval=0)
)

Loaded checkpoint hw4p2/last
Restored to epoch 78
Loaded checkpoint hw4p2/best
Adjusting learning rate of group 0 to 1.0000e-04.
Checkpoint sessions/speech_session/last doesn't exist.


In [None]:
# Train for 600 epochs
# Long-running cell: runs the training loop of the session configured in the previous cell.
speech_session.train(600)

## Speaker Identification Training

In [24]:
# Build train/validation loaders for the speaker-identification dataset.
# This rebinds train_loader/test_loader; a session constructed earlier keeps the
# loader objects it was given.
train_loader, test_loader = get_loaders(KnnwSpeakerDataset, audio_data, processed_df, batch_size=32, split=0.15)
# Configure the speaker-recognition session stored under 'sessions/speaker_session'.
speaker_session = SpeakerRecSession('sessions/speaker_session',
                    # Model factory: SpeakerNet1d with one output class per entry in LABEL_LIST.
                    lambda: SpeakerNet1d([256, 256], lstm_hidden=256, lstm_layers=3, dropout=0.5, 
                                         num_classes=len(LABEL_LIST)).to(device),
                    # Optimizer factory: AdamW over all model parameters.
                    lambda m: torch.optim.AdamW(m.parameters(), lr=5e-4),
                    nn.CrossEntropyLoss(),
                    train_data=train_loader,
                    val_data=test_loader,
                    use_amp=False,
                    # NOTE(review): PlateauSched is a project wrapper. 'loss' presumably names
                    # the monitored metric and the keyword options look like
                    # ReduceLROnPlateau settings — confirm in training.py.
                    sched_factory=lambda op: 
                        PlateauSched(op, 'loss', factor=0.5, patience=2, cooldown=1, min_lr=1e-6, verbose=True)
                    )

In [None]:
# Train for 100 epochs
# Long-running cell: runs the training loop of the speaker session configured in the previous cell.
speaker_session.train(100)

## End-to-End Inference

In [45]:
def batch_infer(speech_model, speaker_model, x, xlens):
    """Transcribe one batch and prefix each line with its predicted speaker label.

    Args:
        speech_model: Callable invoked as ``speech_model(x, xlens)``; its output is
            passed to ``batch_decode`` to obtain one decoded string per utterance.
        speaker_model: Callable invoked as ``speaker_model(x, xlens)``; the argmax
            over dimension 1 of its output is used as the speaker class index.
        x: Batch input tensor; moved to ``device`` before both forward passes.
        xlens: Lengths passed unchanged to both models.

    Returns:
        list[str]: One line per utterance, ``"[<label>] <text>"``, or just
        ``"<text>"`` when the predicted label is the string ``'None'``.
    """
    x = x.to(device)

    # Inference only: disable autograd so no graph is built or kept alive
    # for the forward passes.
    with torch.no_grad():
        speech_predictions = speech_model(x, xlens)
        decoded = batch_decode(speech_predictions)
        speaker_logits = speaker_model(x, xlens)
        # One device-to-host copy for the whole batch instead of .item() per row.
        speaker_ids = torch.argmax(speaker_logits, 1).tolist()

    results = []
    for i, speaker_id in enumerate(speaker_ids):
        lab = index2label[speaker_id]
        # The 'None' label means no speaker tag is shown for this line.
        prefix = f'[{lab}] ' if lab != 'None' else ''
        results.append(prefix + decoded[i])
    return results

def end2end(speech_model, speaker_model, data):
    """Run end-to-end inference over a loader and measure the time spent inferring.

    Args:
        speech_model: Forwarded to ``batch_infer``.
        speaker_model: Forwarded to ``batch_infer``.
        data: Iterable of 5-tuples ``(x, y, xl, yl, labs)`` that also exposes a
            sized ``dataset`` attribute (e.g. a DataLoader). Only ``x`` and ``xl``
            are used.

    Returns:
        tuple[list[str], float]: All output lines in iteration order, and the total
        time spent inside ``batch_infer`` (seconds) divided by ``len(data.dataset)``.
        The average is ``0.0`` when the dataset is empty.
    """
    result = []
    total_time = 0.0
    for x, _y, xl, _yl, _labs in data:
        # perf_counter is monotonic and high-resolution; wall-clock time.time()
        # can jump if the system clock is adjusted. Only the batch_infer call is
        # timed, so batch loading is excluded from the measurement.
        st = time.perf_counter()
        result.extend(batch_infer(speech_model, speaker_model, x, xl))
        total_time += time.perf_counter() - st

    n_samples = len(data.dataset)
    # Guard the per-sample average against an empty dataset.
    avg_time = total_time / n_samples if n_samples else 0.0
    return result, avg_time

In [46]:
# Rebuild loaders on the combined dataset with get_loaders' defaults; keep only the held-out loader.
# NOTE(review): no seed is passed here or in the training cells, so unless random_split
# is deterministic for seed=None, this held-out set can include utterances the models
# were trained on — confirm, and pass one shared seed to every get_loaders call if not.
_, test_loader = get_loaders(KnnwDataset, audio_data, processed_df)

# Switch both trained models to evaluation mode (nn.Module.eval returns the module itself).
speech_model = speech_session.model.eval()
speaker_model = speaker_session.model.eval()

# results: output lines in loader order; avg_time: inference seconds per dataset item.
results, avg_time = end2end(speech_model, speaker_model, test_loader)

In [None]:
# Display every predicted line; a "[label] " prefix is present unless the predicted label is 'None'.
results

['taki a look of the broadcast evacuation went up',
 '[Okudera] ah',
 "how'd terry us stared norrn stuff not sense",
 "[Teshi] i'll go no lonker whole town less",
 "[Yotsuha] i cest that's way it",
 "[Mitsuha] the eyetochooo ya said she's eatin'",
 '[Taki] ha mitsuha',
 '[Mitsuha] oucu make us',
 '[Taki] i stokped today threazy of sounds up broadcast about a wreazy',
 '[Taki] taki huh yea the election person',
 '[Taki+Okudera] no i thousand deftine bayazane was',
 'i wanna grant a ready what do touract his residents it was totally too',
 '[Radio] yeah of your relic',
 "[Taki] do you wait't thos pansay",
 '[Mitsuha] would i bottle great i maiden',
 "[Mitsuha] gid it's do it",
 '[Taki] huh',
 "[Mitsuha+Teshi] seek you're found out something",
 '[Okudera] you were gond stone i your uncredty going to like',
 '[Yotsuha] what is she like its',
 "[Mitsuha] and he couldn't it",
 '[Mitsuha] who are you',
 "today tou can to cactore charge of the stack here worth the spittin'",
 "[Mitsuha] what t