In [1]:
import argparse
import os
import pickle
import sys
import time
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
from dataloaders import *
from models.setup import *
from torchsummary import summary
from models.setup import *
from models.ImageModels import *
from models.AudioModels import *
from dataloaders import *
from itertools import chain
import apex
from apex import amp
from tqdm import tqdm
from pathlib import Path

import numpy as trainable_parameters
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import scipy
import scipy.signal
from scipy.spatial import distance
import librosa
import matplotlib.lines as lines

import itertools
import seaborn as sns

  warn(f"Failed to load image Python extension: {e}")


In [2]:
def tablePrinting(headings, row_headings, values):
    """Print ``values`` as a tab-indented text table with fixed-width columns.

    Args:
        headings: column titles; the first one sits above the row-heading
            column, so there is one more title than there are value columns.
        row_headings: one label per row of ``values``.
        values: array indexed as ``values[row, column]``; every cell is
            rendered through ``floatFormat`` and right-aligned.
    """
    assert values.shape[-1] == len(headings) - 1
    assert values.shape[0] == len(row_headings)

    column_width = 10

    # Left-aligned titles separated by " | "; three spaces pad the last column.
    header_line = ' | '.join(f'{title:<{column_width}}' for title in headings) + '   '

    print("\t" + header_line, flush=True)
    print("\t" + "-" * len(header_line), flush=True)

    for row_index in range(len(values)):
        cells = ''.join(
            f' | {floatFormat(values[row_index, col_index]):>{column_width}}'
            for col_index in range(values.shape[-1])
        )
        print(f'\t{row_headings[row_index]:<{column_width}}' + cells, flush=True)
        
def floatFormat(number):
    """Return ``number`` as a fixed-point string with six decimal places."""
    return format(number, '.6f')

def timeFormat(start_time, end_time):
    """Break the span between two timestamps into days, hours, minutes and seconds.

    The arithmetic treats the inputs as seconds (a day is 24 * 60 * 60).
    Each returned component is truncated to ``int``.
    """
    elapsed = end_time - start_time

    days, remainder = divmod(elapsed, 24 * 60 * 60)
    hours, remainder = divmod(remainder, 60 * 60)
    minutes, seconds = divmod(remainder, 60)

    return int(days), int(hours), int(minutes), int(seconds)

In [3]:
def modelSetup(parser, test=False):
    """Assemble the argument dictionary for a training or evaluation run.

    The JSON config selected by ``parser["config_file"]`` (looked up in
    ``config_library``) provides the defaults; the remaining ``parser`` entries
    override them. The experiment directory is derived from the training-data
    file name, the model names, the argument hash and the config file name.

    * ``test`` or ``resume`` true: the arguments pickled in that directory are
      loaded and then overridden by ``parser`` again.
    * otherwise: the directory is created and the arguments are pickled to
      ``args.pkl`` inside it.

    Args:
        parser: dict of run options. "config_file", "image_base" and "device"
            are required; "restore_epoch", "resume", "feat", "dataset_path" and
            "base_path" are optional. The dict itself is left unmodified.
        test: when true, recover the stored arguments even if "resume" is unset.

    Returns:
        Tuple ``(args, image_base)``.

    Raises:
        KeyError: if a required ``parser`` entry is missing.
        AssertionError: if stored arguments are requested but ``args.pkl`` is
            missing, or if the experiment path already exists as a file.
    """
    # Work on a shallow copy: the consumed keys are popped below, and mutating
    # the caller's dict makes a second call (e.g. re-running a notebook cell)
    # fail with KeyError on "config_file".
    parser = dict(parser)

    config_file = parser.pop("config_file")
    print(f'configs/{config_library[config_file]}')
    with open(f'configs/{config_library[config_file]}') as file:
        args = json.load(file)

    # Optional entries are popped with a default so that none of these locals
    # can be left unbound. restore_epoch previously had no fallback and raised
    # UnboundLocalError further down when the key was absent; -1 means
    # "keep the value stored with the model".
    restore_epoch = parser.pop("restore_epoch", -1)
    resume = parser.pop("resume", False)
    feat = parser.pop("feat", None)
    dataset_path = parser.pop("dataset_path", None)
    base_path = parser.pop("base_path", None)
    image_base = parser.pop("image_base")
    device = parser.pop("device")

    # Whatever is left in parser overrides the config-file defaults.
    for key in parser:
        args[key] = parser[key]

    args["data_train"] = Path(args["data_train"])
    args["data_val"] = Path(args["data_val"])
    args["data_test"] = Path(args["data_test"])

    # Sets args["model_name"], which is part of the directory name below.
    modelHash(args)

    base_dir = Path("model_metadata")
    data = "_".join(str(Path(os.path.basename(args["data_train"])).stem).split("_")[0:4])
    model_particulars = f'AudioModel-{args["audio_model"]["name"]}_ImageModel-{args["image_model"]}_ArgumentsHash-{args["model_name"]}_ConfigFile-{Path(config_library[config_file]).stem}'
    args["exp_dir"] = base_dir / data / model_particulars

    if test or resume:

        print(f'\nRecovering model arguments from')
        printDirectory(args["exp_dir"] / "args.pkl")

        print((args["exp_dir"] / "args.pkl").absolute())
        assert(os.path.isfile((args["exp_dir"] / "args.pkl").absolute()))
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only point this at experiment directories you created yourself.
        with open(args["exp_dir"] / "args.pkl", "rb") as f:
            args = pickle.load(f)

        # The stored arguments replace args wholesale, so re-apply the overrides.
        for key in parser:
            args[key] = parser[key]

        if restore_epoch != -1: args["restore_epoch"] = restore_epoch
        args["resume"] = resume
        if dataset_path is not None: args["dataset_path"] = dataset_path
        if base_path is not None: args["base_path"] = base_path

    else:
        assert(os.path.isfile(args["exp_dir"]) is False)
        print(f'\nMaking model directory:')
        printDirectory(args["exp_dir"])
        print(f'Saving model arguments at:')
        printDirectory(args["exp_dir"] / "args.pkl")

        # Raises FileExistsError if the directory is already there.
        os.makedirs(args["exp_dir"])
        with open(args["exp_dir"] / "args.pkl", "wb") as f:
            pickle.dump(args, f)
        args["resume"] = False

    # Set after pickling, so "device" and "feat" are never persisted in args.pkl.
    args["device"] = device
    if feat is not None:
        args['feat'] = feat

    print(f'Model arguments:')
    printArguments(args)

    getDevice(args)

    return args, image_base


def modelHash(args):
    """Store a short deterministic identifier of ``args`` in ``args["model_name"]``.

    The identifier is the first 10 hex characters of the MD5 digest of the
    ``repr`` of the key-sorted items of ``args``. The "resume" entry is left out
    so that resuming a run maps to the same identifier as starting it.

    Args:
        args: argument dictionary; modified in place (gains "model_name").
    """
    # Keys that must not influence the identifier.
    exclude_keys = ("resume",)

    hashed_items = sorted(
        (key, value) for key, value in args.items() if key not in exclude_keys
    )

    # utf-8 yields the same bytes as ascii for ASCII-only text, so existing
    # identifiers are unchanged, while non-ASCII values (e.g. in paths) no
    # longer raise UnicodeEncodeError.
    digest = hashlib.md5(repr(hashed_items).encode("utf-8")).hexdigest()
    args["model_name"] = digest[:10]
# Registries mapping a short key to a model class; the classes themselves are
# presumably provided by the star imports at the top of the notebook.
image_model_dict = dict(
    VGG16=VGG16,
    Resnet50=Resnet50,
    Resnet101=Resnet101,
)
# NOTE(review): the "Transformer" key maps to BidrectionalAudioLSTM - confirm
# that this pairing is intended.
audio_model_dict = dict(
    Davenet=AudioCNN,
    ResDavenet=ResDavenet,
    Transformer=BidrectionalAudioLSTM,
)

def imageModel(args):
    """Return the image-model class selected by ``args["image_model"]``.

    Args:
        args: argument dictionary; ``args["image_model"]`` is the model name
            ("VGG16", "Resnet50" or "Resnet101").

    Returns:
        The matching entry of ``image_model_dict``.

    Raises:
        ValueError: if the name is not one of the supported models.
    """
    name = args["image_model"]
    supported = ("VGG16", "Resnet50", "Resnet101")

    if name not in supported:
        # args["image_model"] is the name itself, not a nested dict: indexing
        # it with ["name"] raised TypeError here instead of the intended
        # ValueError.
        raise ValueError(f'Unknown image model: {name}')

    return image_model_dict[name]

def audioModel(args):
    """Return the audio-model class selected by ``args["audio_model"]["name"]``.

    For "DAVEnet" and "ResDAVEnet" the architecture JSON under ``models/`` is
    read first and the listed entries are copied into ``args["audio_model"]``;
    "Transformer" needs no extra file.

    Raises:
        ValueError: if the name is none of the three known models.
    """
    requested = args["audio_model"]["name"]

    # (model name, architecture file, entries copied into args, registry key)
    recipes = (
        ("DAVEnet", 'models/DAVEnet.json', ("conv_layers", "max_pool"), "Davenet"),
        ("ResDAVEnet", 'models/ResDAVEnet.json', ("conv_layers",), "ResDavenet"),
        ("Transformer", None, (), "Transformer"),
    )

    for model_name, params_file, copied_keys, registry_key in recipes:
        if requested != model_name:
            continue
        if params_file is not None:
            with open(params_file) as file:
                model_params = json.load(file)
            for entry in copied_keys:
                args["audio_model"][entry] = model_params[entry]
        return audio_model_dict[registry_key]

    raise ValueError(f'Unknown audio model: {requested}')

def acousticModel(args):
    """Read ``models/AcousticEncoder.json`` and store its content in ``args["acoustic_model"]``."""
    with open('models/AcousticEncoder.json') as config_handle:
        encoder_params = json.load(config_handle)

    args["acoustic_model"] = encoder_params

def loadPretrainedWeights(acoustic_model, args):
    """Wrap ``acoustic_model`` in ``nn.DataParallel`` and load pretrained weights into it.

    The checkpoint ``pretrained_cpc/<args['cpc']['pretrained_weights']>.pt`` is
    read, and every entry of its "acoustic_model" state whose key also exists
    in the wrapped model replaces the model's value; other entries are ignored.

    Returns:
        The ``nn.DataParallel``-wrapped model.
    """
    target = args["device"] if torch.cuda.is_available() else "cpu"
    device = torch.device(target)

    wrapped_model = nn.DataParallel(acoustic_model)
    merged_state = wrapped_model.state_dict()

    weights_name = args['cpc']['pretrained_weights']
    checkpoint = torch.load(Path(f'pretrained_cpc/{weights_name}.pt'), map_location=device)

    pretrained_state = checkpoint["acoustic_model"]
    merged_state.update(
        {key: pretrained_state[key] for key in pretrained_state if key in merged_state}
    )
    wrapped_model.load_state_dict(merged_state)

    return wrapped_model

def getParameters(models, to_freeze, args):
    """Build the list of optimizer parameter groups.

    Every model in ``models`` contributes one group holding all its parameters.
    From the models in ``to_freeze`` only parameters whose name starts with
    'embedder' are added, each as its own group named
    ``<model name>_<parameter name>``. All groups use the initial learning
    rate from ``args["learning_rate_scheduler"]``.
    """
    param_groups = [
        {
            'params': module.parameters(),
            'lr': args["learning_rate_scheduler"]["initial_learning_rate"],
            'name': model_name,
        }
        for model_name, module in models.items()
    ]

    for model_name, module in to_freeze.items():
        param_groups.extend(
            {
                'params': tensor,
                'lr': args["learning_rate_scheduler"]["initial_learning_rate"],
                'name': model_name + "_" + param_name,
            }
            for param_name, tensor in module.named_parameters()
            if param_name.startswith('embedder')
        )

    return param_groups

def loadModelAttriburesAndTrainingAMP(
    exp_dir, acoustic_model, english_audio_model, hindi_audio_model, image_model, 
    optimizer, amp, device, last_not_best=True
    ):
    """Restore models, optimizer and amp state from a checkpoint in ``exp_dir``.

    Reads ``training_metadata.json`` from ``exp_dir`` and then
    ``models/last_ckpt.pt`` (``last_not_best`` true) or ``models/best_ckpt.pt``.
    The state dicts stored in the checkpoint are loaded into the given objects
    in place.

    Returns:
        Tuple ``(info, epoch, global_step, best_epoch, best_acc)`` where
        ``info`` is the parsed metadata file and the rest comes from the
        checkpoint.
    """
    with open(exp_dir / "training_metadata.json", "r") as metadata_file:
        info = json.load(metadata_file)

    checkpoint_name = "last_ckpt.pt" if last_not_best else "best_ckpt.pt"
    checkpoint_fn = exp_dir / "models" / checkpoint_name

    checkpoint = torch.load(checkpoint_fn, map_location=device)

    # (checkpoint key, object receiving that state), restored in this order.
    restore_plan = (
        ("acoustic_model", acoustic_model),
        ("english_audio_model", english_audio_model),
        ("hindi_audio_model", hindi_audio_model),
        ("image_model", image_model),
        ("optimizer", optimizer),
        ("amp", amp),
    )
    for state_key, receiver in restore_plan:
        receiver.load_state_dict(checkpoint[state_key])

    epoch, global_step, best_epoch, best_acc = [
        checkpoint[field] for field in ("epoch", "global_step", "best_epoch", "best_acc")
    ]
    print(f'\nLoading model parameters from:\n\t\t{checkpoint_fn}')

    return info, epoch, global_step, best_epoch, best_acc


def NFrames(audio_input, audio_output, nframes, with_torch=True):
    """Rescale per-clip frame counts from the input to the output time resolution.

    The pooling ratio is the rounded quotient of the last-dimension sizes of
    ``audio_input`` and ``audio_output``. Each count is divided by it,
    truncated, and raised to 1 if it would otherwise be 0.

    Args:
        audio_input: tensor whose last dimension is the input time axis.
        audio_output: tensor whose last dimension is the output time axis.
        nframes: tensor of frame counts at input resolution; not modified.
        with_torch: if true the ratio is wrapped in an int32 tensor before the
            division (the result is the same either way).

    Returns:
        A new int32 tensor with the rescaled counts.
    """
    pooling_ratio = round(audio_input.size(-1) / audio_output.size(-1))
    if with_torch: pooling_ratio = torch.tensor(pooling_ratio, dtype=torch.int32)

    # Divide out of place: .float() returns the very same tensor when nframes
    # is already float32, so the former in-place div_ silently overwrote the
    # caller's tensor in that case.
    pooled = (nframes.float() / pooling_ratio).int()

    # Counts that truncate to zero are raised to one frame.
    pooled[pooled == 0] = 1

    return pooled

In [4]:
# Notebook stand-in for the command-line options that modelSetup consumes.
command_line_args = dict(
    resume=True,
    config_file='multilingual+matchmap',
    device="0",
    restore_epoch=-1,
    image_base="..",
)

In [5]:
# Keep the requested epoch at hand; -1 makes modelSetup keep the stored value.
restore_epoch = command_line_args['restore_epoch']
# modelSetup pops the options it consumes from the dict it receives. Pass a
# copy so command_line_args stays intact and this cell can be re-run without a
# KeyError on "config_file".
args, image_base = modelSetup(dict(command_line_args))

configs/English_Hindi_matchmap_DAVEnet_config.json

Recovering model arguments from
     model_metadata
       ↪ PlacesAudio_400k_distro+PlacesHindi100k+imagesPlaces205_resize
        ↪ AudioModel-Transformer_ImageModel-Resnet50_ArgumentsHash-57f998f2bf_ConfigFile-English_Hindi_matchmap_DAVEnet_config
         ↪ args.pkl


/home/leannenortje/SemanticAcousticModel/model_metadata/PlacesAudio_400k_distro+PlacesHindi100k+imagesPlaces205_resize/AudioModel-Transformer_ImageModel-Resnet50_ArgumentsHash-57f998f2bf_ConfigFile-English_Hindi_matchmap_DAVEnet_config/args.pkl
Model arguments:
	alphas: [1.5, 1.2, 1.5, 1.2, 2.5, 2.5]
	audio_config:
		audio_type: melspectrogram
		fmin: 20
		num_mel_bins: 40
		padval: 0
		preemph_coef: 0.97
		sample_rate: 16000
		target_length: 1024
		use_raw_length: False
		window_size: 0.025
		window_stride: 0.01
		window_type: hamming


	audio_model:
		c_dim: 512
		embedding_dim: 2048
		name: Transformer
		num_heads: 8
		z_dim: 64


	batch_size: 32
	cpc:
		hop_lengt

In [6]:
# Build the four networks on the CPU first, then move them to the target device.
# acousticModel(args) must run before AcousticEncoder(args): it stores the
# encoder configuration in args["acoustic_model"].
acousticModel(args)
acoustic_model = AcousticEncoder(args).to("cpu")
# summary(acoustic_model, (40, 2048), device="cpu")#, depth=5)

# audioModel returns a class (and may add architecture entries to args); the
# English and Hindi branches are two separate instances of that same class.
audio_model_name = audioModel(args) 
english_audio_model = audio_model_name(args).to("cpu")

hindi_audio_model = audio_model_name(args).to("cpu")

# imageModel likewise returns a class, instantiated here.
image_model_name = imageModel(args)
image_model = image_model_name(args, pretrained=args["pretrained_image_model"]).to("cpu")

# Falls back to the CPU when CUDA is not available.
device = torch.device(args["device"] if torch.cuda.is_available() else "cpu")

acoustic_model = acoustic_model.to(device)
english_audio_model = english_audio_model.to(device)
hindi_audio_model = hindi_audio_model.to(device)
image_model = image_model.to(device)

# Models whose parameters are all handed to the optimizer.
model_with_params_to_update = {
    "acoustic_model": acoustic_model, 
    "english_audio_model": english_audio_model, 
    "hindi_audio_model": hindi_audio_model
    }
# Models from which getParameters only picks parameters named 'embedder*'.
model_to_freeze = {
    "image_model": image_model
    }
# NOTE(review): this rebinds `trainable_parameters`, which the import cell also
# uses as the alias of numpy (`import numpy as trainable_parameters`).
trainable_parameters = getParameters(model_with_params_to_update, model_to_freeze, args)

# Optimizer choice comes from the configuration; only 'sgd' and 'adam' exist.
if args["optimizer"] == 'sgd':
    optimizer = torch.optim.SGD(
        trainable_parameters, args["learning_rate_scheduler"]["initial_learning_rate"],
        momentum=args["momentum"], weight_decay=args["weight_decay"]
        )
elif args["optimizer"] == 'adam':
    optimizer = torch.optim.Adam(
        trainable_parameters, args["learning_rate_scheduler"]["initial_learning_rate"],
        weight_decay=args["weight_decay"]
        )
else:
    raise ValueError('Optimizer %s is not supported' % args["optimizer"])

# Mixed-precision setup (apex amp, opt level O1) for all models and the
# optimizer. This runs before the DataParallel wrapping below.
[acoustic_model, english_audio_model, hindi_audio_model, image_model], optimizer = amp.initialize(
        [acoustic_model, english_audio_model, hindi_audio_model, image_model], optimizer, opt_level='O1'
        )

# The acoustic model is always wrapped in DataParallel; the other three are
# wrapped only when args["device"] is exactly 'cuda' and they are not wrapped yet.
# NOTE(review): wrapping changes the state_dict keys (a "module." prefix), so the
# checkpoint loaded afterwards presumably was saved with this same asymmetric
# wrapping - confirm.
acoustic_model = nn.DataParallel(acoustic_model) #if not isinstance(acoustic_model, torch.nn.DataParallel) and args["device"] == 'cuda' else acoustic_model
english_audio_model = nn.DataParallel(english_audio_model) if not isinstance(english_audio_model, torch.nn.DataParallel) and args["device"] == 'cuda' else english_audio_model
hindi_audio_model = nn.DataParallel(hindi_audio_model) if not isinstance(hindi_audio_model, torch.nn.DataParallel) and args["device"] == 'cuda' else hindi_audio_model
image_model = nn.DataParallel(image_model) if not isinstance(image_model, torch.nn.DataParallel) and args["device"] == 'cuda' else image_model

# Restore models, optimizer and amp state from the best checkpoint
# (last_not_best=False selects models/best_ckpt.pt).
info, epoch, global_step, best_epoch, best_acc = loadModelAttriburesAndTrainingAMP(
    args["exp_dir"], acoustic_model, english_audio_model, hindi_audio_model, image_model, optimizer, amp, 
    device, False
)
print(f'{"epoch": <10} = {epoch}')
print(f'{"global_step": <10} = {global_step}')
print(f'{"best_epoch": <10} = {best_epoch}')
print(f'{"best_acc": <10} = {best_acc}\n')

# assert (args["restore_epoch"] <= args["n_epochs"])

# NOTE(review): the weights above were read from best_ckpt.pt; the per-epoch
# .pth paths printed below are informational only - confirm they still exist.
print(f'\nRecall scores at epoch {epoch}:')
print(f'\tRestoring audio model from {args["exp_dir"]}/models/audio_model.{best_epoch}.pth')
# This line reports the image model; it used to say "audio model" (copy-paste slip).
print(f'\tRestoring image model from {args["exp_dir"]}/models/image_model.{best_epoch}.pth')

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic

Loading model parameters from:
		model_metadata/PlacesAudio_400k_distro+PlacesHindi100k+imagesPlaces205_resize/AudioModel-Transformer_ImageModel-Resnet50_ArgumentsHash-57f998f2bf_ConfigFile-English_Hindi_matchmap_DAVEnet_config/models/best_ckpt.pt
epoch      = 1
global_step = 3034
best_epoch = 1
best_acc   = 0.

In [15]:
def compute_matchmap_similarity_matrix_IA(im, im_mask, audio, frames, simtype='MISA'):
    """Score every image against every audio clip through their matchmaps.

    For image k and clip i the matchmap holds the dot product between each
    spatial position of the image and each of the first ``frames[i]`` audio
    frames. It is reduced to one score according to ``simtype``:

    * 'SISA': mean over both spatial dimensions and over time.
    * 'MISA': max over both spatial dimensions, then mean over time.
    * 'SIMA': max over time, then mean over both spatial dimensions.

    Args:
        im: 4-D image features, (n, channels, dim2, dim3).
        im_mask: unused; kept so existing callers keep working.
        audio: audio features, 3-D after squeezing dimension 2, i.e.
            (n, channels, time) with the same channel size as ``im``.
        frames: per-clip number of valid frames, indexable by clip.
        simtype: one of 'SISA', 'MISA', 'SIMA'.

    Returns:
        Tensor ``S`` of shape (n, n) where ``S[k, i]`` is the score of image k
        against audio clip i.

    Raises:
        ValueError: if ``simtype`` is not one of the three supported values.
    """
    # Reject an unknown reduction before any work is done.
    if simtype not in ('SISA', 'MISA', 'SIMA'):
        raise ValueError(f'Unknown simtype: {simtype}')

    rows = im.size(2)
    cols = im.size(3)
    # (n, rows*cols, channels); made contiguous once so the product in the
    # loop does not have to re-copy it for every clip.
    im = im.view(im.size(0), im.size(1), -1).transpose(1, 2).contiguous()
    audio = audio.squeeze(2)

    assert(im.dim() == 3)
    assert(audio.dim() == 3)

    n = im.size(0)

    columns = []
    for i in range(n):
        # (channels, frames[i]): the valid part of clip i, broadcast against
        # all images instead of materialising n copies of it.
        clip = audio[i, :, 0:frames[i]]

        M = torch.matmul(im, clip)
        # M = torch.tanh(M)
        M = M.view(n, rows, cols, -1)

        assert(M.dim() == 4)
        if simtype == 'SISA':
            M = M.mean((3))
            M = M.mean((2))
            M = M.mean((1))
        elif simtype == 'MISA':
            M, _ = M.max(1)
            M, _ = M.max(1)
            M = M.mean((1))
        else:  # 'SIMA'
            M, _ = M.max(3)
            M = M.mean((2))
            M = M.mean((1))

        columns.append(M)

    # One stack at the end instead of growing S with torch.cat on every iteration.
    return torch.stack(columns, dim=1)

In [7]:
# Enable the "center_crop" image option for evaluation (this mutates the shared args dict).
args["image_config"]["center_crop"] = True
# shuffle=False keeps the dataset order, so item i of every modality stays
# aligned; the recall computation further down relies on matching indices.
test_loader = torch.utils.data.DataLoader(
    ImageAudioData(
        image_base, args["data_test"], args),
    batch_size=args["batch_size"], shuffle=False, num_workers=8, pin_memory=True)

Read in data paths from:
     data
       ↪ PlacesAudio_400k_distro+PlacesHindi100k+imagesPlaces205_resize_test.json



Read in 914 data points


In [13]:
device = torch.device(args["device"] if torch.cuda.is_available() else "cpu")

# Put every network used below into inference mode. The Hindi branch is run in
# the loop as well and was previously left in training mode.
image_model.eval()
acoustic_model.eval()
english_audio_model.eval()
hindi_audio_model.eval()

N_examples = len(test_loader.dataset)
# Per-batch outputs, concatenated after the loop.
I_embeddings = [] 
E_embeddings = [] 
H_embeddings = [] 
E_frame_counts = []
H_frame_counts = []

with torch.no_grad():
    for i, (image_input, english_input, english_nframes, hindi_input, hindi_nframes) in tqdm(enumerate(test_loader), desc='\tValidating...', leave=False):

        image_output = image_model(image_input.to(device))#.to('cpu').detach()
        I_embeddings.append(image_output)

        # The shared acoustic model feeds both language-specific audio models.
        english_z, english_c = acoustic_model(english_input.to(device))
        english_output = english_audio_model(english_c.to(device))#.to('cpu').detach()
        E_embeddings.append(english_output)
        # Frame counts rescaled to the time resolution of the audio embedding.
        E_frame_counts.append(NFrames(english_input, english_output, english_nframes))#.cpu())

        hindi_z, hindi_c = acoustic_model(hindi_input.to(device))
        hindi_output = hindi_audio_model(hindi_c.to(device))#.to('cpu').detach()
        H_embeddings.append(hindi_output)
        H_frame_counts.append(NFrames(hindi_input, hindi_output, hindi_nframes))#.cpu())


    # Accumulators for the recall table built in a later cell.
    heading = [" "]
    r_at_10 = []
    r_at_5 = []
    r_at_1 = []
    acc = 0
    divide = 0

    # From here on image_output / english_output hold the whole test set,
    # not just the last batch.
    image_output = (torch.cat(I_embeddings))

    english_output = (torch.cat(E_embeddings))
    english_frames = (torch.cat(E_frame_counts))

                                       

In [16]:
# Image-vs-English-audio similarity matrix: S[k, i] scores image k against
# clip i. The mask argument is unused by the function, hence None.
S = compute_matchmap_similarity_matrix_IA(image_output, None, english_output, english_frames, simtype=args['simtype'])

In [18]:
# def calc_recalls_IA(A, B, B_mask, attention, simtype='MISA'):
#     # function adapted from https://github.com/dharwath

#     S = compute_matchmap_similarity_matrix_IA(A, B, B_mask, attention, simtype=simtype)
#     n = S.size(0)
#     I2A_scores, I2A_ind = S.topk(10, 1)
#     A2I_scores, A2I_ind = S.topk(10, 0)

#     I2A_scores = I2A_scores.detach().cpu().numpy()
#     I2A_ind = I2A_ind.detach().cpu().numpy()
#     A2I_scores = A2I_scores.detach().cpu().numpy()
#     A2I_ind = A2I_ind.detach().cpu().numpy()

#     A_foundind = -np.ones(n)
#     I_foundind = -np.ones(n)
#     for i in tqdm(range(n), desc="Calculating recalls", leave=False):
#         ind = np.where(I2A_ind[i, :] == i)[0]
#         if len(ind) != 0: A_foundind[i] = ind[0]
#         ind = np.where(A2I_ind[:, i] == i)[0]
#         if len(ind) != 0: I_foundind[i] = ind[0]
 
#     r1_I2A = len(np.where(A_foundind == 0)[0])/len(A_foundind)
#     r5_I2A = len(np.where(np.logical_and(A_foundind >= 0, A_foundind < 5))[0])/len(A_foundind)
#     r10_I2A = len(np.where(np.logical_and(A_foundind >= 0, A_foundind < 10))[0])/len(A_foundind)

#     r1_A2I = len(np.where(I_foundind == 0)[0])/len(I_foundind)
#     r5_A2I = len(np.where(np.logical_and(I_foundind >= 0, I_foundind < 5))[0])/len(I_foundind)
#     r10_A2I = len(np.where(np.logical_and(I_foundind >= 0, I_foundind < 10))[0])/len(I_foundind)

#     return {
#         'r1_I2A':r1_I2A, 
#         'r5_I2A':r5_I2A, 
#         'r10_I2A':r10_I2A,
#         'r1_A2I':r1_A2I, 
#         'r5_A2I':r5_A2I, 
#         'r10_A2I':r10_A2I
#         }

# Recall@1/5/10 in both retrieval directions. Rows of S are images ("A"),
# columns are English audio clips ("B"); the correct match of item i is item i.
n = S.size(0)
S = S.to(device)
# Never ask for more candidates than were scored: topk(10, ...) raises when
# n < 10.
A2B_scores, A2B_ind = S.topk(min(10, n), 1)
B2A_scores, B2A_ind = S.topk(min(10, n), 0)

A2B_scores = A2B_scores.detach().cpu().numpy()
A2B_ind = A2B_ind.detach().cpu().numpy()
B2A_scores = B2A_scores.detach().cpu().numpy()
B2A_ind = B2A_ind.detach().cpu().numpy()

# Rank at which the correct match was found; -1 means "not in the top list".
A_foundind = -np.ones(n)
B_foundind = -np.ones(n)
for i in tqdm(range(n), desc="Calculating recalls", leave=False):
    ind = np.where(A2B_ind[i, :] == i)[0]
    if len(ind) != 0: B_foundind[i] = ind[0]
    ind = np.where(B2A_ind[:, i] == i)[0]
    if len(ind) != 0: A_foundind[i] = ind[0]

r1_A_to_B = len(np.where(B_foundind == 0)[0])/len(B_foundind)
r5_A_to_B = len(np.where(np.logical_and(B_foundind >= 0, B_foundind < 5))[0])/len(B_foundind)
r10_A_to_B = len(np.where(np.logical_and(B_foundind >= 0, B_foundind < 10))[0])/len(B_foundind)

r1_B_to_A = len(np.where(A_foundind == 0)[0])/len(A_foundind)
r5_B_to_A = len(np.where(np.logical_and(A_foundind >= 0, A_foundind < 5))[0])/len(A_foundind)
r10_B_to_A = len(np.where(np.logical_and(A_foundind >= 0, A_foundind < 10))[0])/len(A_foundind)


recalls = {
        'A_to_B_r1':r1_A_to_B, 
        'A_to_B_r5':r5_A_to_B, 
        'A_to_B_r10':r10_A_to_B,
        'B_to_A_r1':r1_B_to_A, 
        'B_to_A_r5':r5_B_to_A, 
        'B_to_A_r10':r10_B_to_A
        }

# Reset the table accumulators so that re-running this cell does not append
# duplicate columns.
heading = [" "]
r_at_10 = []
r_at_5 = []
r_at_1 = []
acc = 0
divide = 0

# B -> A is English audio -> image, A -> B is image -> English audio.
heading.extend(["E -> I", "I -> E"])
r_at_10.extend([recalls["B_to_A_r10"], recalls["A_to_B_r10"]])
r_at_5.extend([recalls["B_to_A_r5"], recalls["A_to_B_r5"]])
r_at_1.extend([recalls["B_to_A_r1"], recalls["A_to_B_r1"]])
acc += recalls["B_to_A_r10"] + recalls["A_to_B_r10"]
divide += 2

tablePrinting(
    heading, ["R@10", "R@5", "R@1"],
    np.asarray([r_at_10, r_at_5, r_at_1])
    )

# The reported accuracy is the mean of the two R@10 values.
print(f'Retrieval accuracy: {100 * acc / divide}%')

                                                            

	           | E -> I     | I -> E       
	---------------------------------------
	R@10       |   0.105033 |   0.115974
	R@5        |   0.056893 |   0.066740
	R@1        |   0.014223 |   0.015317
Retrieval accuracy: 11.050328227571116%




In [None]:
# Top-ranked audio clips for image 10: scores first, then clip indices.
# I2A_scores / I2A_ind only exist inside the commented-out calc_recalls_IA;
# the live equivalents (also from S.topk(..., 1)) are A2B_scores / A2B_ind.
print(A2B_scores[10, :])
print(A2B_ind[10, :])

In [None]:
# Score of the matching pair: image 10 against its own audio clip 10.
S[10, 10]