In [None]:
"""
This notebook demonstrates how to replicate converting tensorflow
weights from tensorflow's vggish to torchvggish
""" 

# Download the audioset directory using subversion
# NOTE(review): this fetches a single sub-directory of the tensorflow/models
# repository through GitHub's Subversion interface -- confirm that interface
# is still available before relying on this cell.
# !apt-get -qq install subversion   # uncomment if on linux
!svn checkout https://github.com/tensorflow/models/trunk/research/audioset

# Download audioset requirements
# NOTE(review): versions are not pinned. A later cell calls tf.Session and
# tf.trainable_variables at the top level of the tensorflow package --
# confirm the tensorflow release installed here still exposes them.
!pip install numpy scipy
!pip install resampy tensorflow six soundfile

# grab the VGGish model checkpoints & PCA params
# (curl -O saves each file into the current working directory under its
# remote file name; a later cell loads "vggish_model.ckpt" from there)
!curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt
!curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz

In [2]:
# Test install
!mv audioset/* .
from vggish_smoke_test import *


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.


Testing your install of VGGish

Log Mel Spectrogram example:  [[-4.47297436 -4.29457354 -4.14940631 ... -3.9747003  -3.94774997
  -3.78687669]
 [-4.48589533 -4.28825497 -4.139964   ... -3.98368686 -3.94976505
  -3.7951698 ]
 [-4.46158065 -4.29329706 -4.14905953 ... -3.96442484 -3.94895483
  -3.78619839]
 ...
 [-4.46152626 -4.29365061 -4.14848608 ... -3.96638113 -3.95057575
  -3.78538167]
 [-4.46152595 -4.2936572  -4.14848104 ... -3.96640507 -3.95059567
  -3.78537143]
 [-4.46152565 -4.29366386 -4.14847603 ... -3.96642906 -3.95061564
  -3.78536116]]
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use standard file APIs to check for files with th

In [4]:
import tensorflow as tf
import vggish_slim

# Maps each TensorFlow variable name to the numpy array holding its value.
vggish_dict = {}

# Build the VGGish graph in a fresh Graph/Session pair and restore the
# checkpoint into it.
with tf.Graph().as_default():
    with tf.Session() as sess:
        # NOTE(review): training=True is presumably what makes the variables
        # show up in tf.trainable_variables() -- confirm in vggish_slim.
        vggish_slim.define_vggish_slim(training=True)
        vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")

        # Evaluate every trainable variable in a single run call.
        tvars = tf.trainable_variables()
        tvars_vals = sess.run(tvars)

        # Record each value under its variable name and report name + shape.
        for var, val in zip(tvars, tvars_vals):
            vggish_dict[var.name] = val
            print(var.name)
            print("\t", var.shape, sep="")
        print("values written to vggish_dict")

INFO:tensorflow:Restoring parameters from vggish_model.ckpt
vggish/conv1/weights:0
	(3, 3, 1, 64)
vggish/conv1/biases:0
	(64,)
vggish/conv2/weights:0
	(3, 3, 64, 128)
vggish/conv2/biases:0
	(128,)
vggish/conv3/conv3_1/weights:0
	(3, 3, 128, 256)
vggish/conv3/conv3_1/biases:0
	(256,)
vggish/conv3/conv3_2/weights:0
	(3, 3, 256, 256)
vggish/conv3/conv3_2/biases:0
	(256,)
vggish/conv4/conv4_1/weights:0
	(3, 3, 256, 512)
vggish/conv4/conv4_1/biases:0
	(512,)
vggish/conv4/conv4_2/weights:0
	(3, 3, 512, 512)
vggish/conv4/conv4_2/biases:0
	(512,)
vggish/fc1/fc1_1/weights:0
	(12288, 4096)
vggish/fc1/fc1_1/biases:0
	(4096,)
vggish/fc1/fc1_2/weights:0
	(4096, 4096)
vggish/fc1/fc1_2/biases:0
	(4096,)
vggish/fc2/weights:0
	(4096, 128)
vggish/fc2/biases:0
	(128,)
values written to vggish_dict


In [14]:
# Define torch model for vggish

import torch
import torch.nn as nn
import numpy as np

# From vggish_slim:
# The VGG stack of alternating convolutions and max-pools.
#     net = slim.conv2d(net, 64, scope='conv1')
#     net = slim.max_pool2d(net, scope='pool1')
#     net = slim.conv2d(net, 128, scope='conv2')
#     net = slim.max_pool2d(net, scope='pool2')
#     net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
#     net = slim.max_pool2d(net, scope='pool3')
#     net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
#     net = slim.max_pool2d(net, scope='pool4')
#     # Flatten before entering fully-connected layers
#     net = slim.flatten(net)
#     net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
#     # The embedding layer.
#     net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')

# Queue of TensorFlow arrays in the order they were inserted into
# `vggish_dict`; `param_generator` consumes it from the front.
vggish_list = [vggish_dict[name] for name in vggish_dict]
def param_generator():
    """Yield the next TensorFlow array as a ``torch.nn.Parameter``.

    Pops the first array off the module-level ``vggish_list`` and reorders
    its axes for PyTorch:

    * 4-D convolution kernels go from TensorFlow's
      ``(height, width, in, out)`` to PyTorch's ``(out, in, height, width)``.
    * Every other array has its axes reversed: 2-D fully-connected weights
      go from ``(in, out)`` to ``(out, in)``; 1-D biases are unchanged.

    Each call builds a generator that yields exactly one parameter, so
    callers use ``next(param_generator())``. Popping from an empty
    ``vggish_list`` raises ``IndexError``.
    """
    param = vggish_list.pop(0)
    if param.ndim == 4:
        # Reversing all axes here would give (out, in, width, height), i.e.
        # every kernel spatially transposed. The 3x3 shape would still match,
        # so the error would be silent.
        axes = (3, 2, 0, 1)
    else:
        axes = None  # np.transpose reverses the axes when axes is None
    # Make the result contiguous so the parameter owns plainly laid-out memory.
    transposed = np.ascontiguousarray(np.transpose(param, axes))
    to_torch = torch.from_numpy(transposed)
    result = torch.nn.Parameter(to_torch)
    yield result

class VGGish(nn.Module):
    """PyTorch counterpart of the VGGish network defined in ``vggish_slim``.

    ``features`` is the stack of 3x3 convolutions and 2x2 max-pools and
    ``embeddings`` is the three fully-connected layers, each followed by a
    ReLU. Constructing an instance pulls the weight and bias of every
    ``Conv2d``/``Linear`` layer, in layer order, from the module-level
    ``param_generator``; that consumes the underlying parameter queue.
    """

    def __init__(self):
        super(VGGish, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(256, 512, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2))
        self.embeddings = nn.Sequential(
            nn.Linear(512*24, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 128),
            nn.ReLU(inplace=True))

        # extract weights from `vggish_list`: each parameterised layer takes
        # the next two queued arrays (weight first, then bias)
        for seq in (self.features, self.embeddings):
            for layer in seq:
                if isinstance(layer, (nn.Conv2d, nn.Linear)):
                    weight = next(param_generator())
                    bias = next(param_generator())
                    # Assigning a Parameter does not validate its shape, so
                    # check explicitly to catch a mis-ordered queue early.
                    if (weight.shape != layer.weight.shape
                            or bias.shape != layer.bias.shape):
                        raise ValueError(
                            "ported parameter shapes %s / %s do not match "
                            "layer %r" % (tuple(weight.shape),
                                          tuple(bias.shape), layer))
                    layer.weight = weight
                    layer.bias = bias

    def forward(self, x):
        """Compute embeddings for a batch of inputs.

        Args:
            x: 4-D tensor in (batch, channel, height, width) layout with one
                channel. NOTE(review): presumably (batch, 1, 96, 64) log-mel
                patches, since fc1 expects 512*24 features after four 2x2
                pools -- confirm against the caller.

        Returns:
            Tensor of shape (batch, 128).
        """
        x = self.features(x)
        # The fully-connected weights come from a TensorFlow graph that
        # flattens channels-last feature maps, i.e. in (height, width,
        # channel) order. Move channels last before flattening so that every
        # flattened feature lines up with the fc1 weight column trained on it.
        x = x.permute(0, 2, 3, 1).contiguous()
        x = x.view(x.size(0),-1)
        x = self.embeddings(x)
        return x

# Instantiate the network (its constructor pulls the converted weights out
# of `vggish_list`) and put it in evaluation mode; eval() returns the module.
net = VGGish().eval()

# Save weights to disk
torch.save(net.state_dict(), "./vggish.pth")