In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
# import librosa
import matplotlib.pyplot as plt

import argparse
from pathlib import Path
import pickle 

import torch
# import librosa
sys.path.append(".")

In [2]:
import torch
import torch.nn as nn
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torch.optim import *

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
local_config = {
	'batch_size': 1,
	'eps': 1e-5,
	'sample_rate': 22050,
	'load_size': 22050 * 20,
	'name_scope': 'SoundNet_TF',
	'phase': 'extracttt',
}

In [19]:
def load_audio(audio_path, sample_rate=22050, mono=True):
    # By default, librosa will resample the signal to 22050Hz(sr=None). And range in (-1., 1.)
    sound_sample, sr = librosa.load(audio_path, sr=sample_rate, mono=mono)
    
    assert sample_rate == sr

    return sound_sample, sr

def gen_audio_from_dir(dir, file_list, file_ext='.mp3', config=local_config):
    '''Audio loader from dir generator'''
    txt_list = []
    
    audio_path_list = Path(dir).glob(f'*{file_ext}')

    for audio_path in tqdm(audio_path_list):
        audio_path_str = str(audio_path).split('/')[-1][:-4]
        if audio_path_str in file_list:
            sound_sample, _ = load_audio(audio_path)
            yield preprocess(sound_sample, config), audio_path 

def preprocess(raw_audio, config=local_config):
    # Select first channel (mono)
    if len(raw_audio.shape) > 1:
        raw_audio = raw_audio[0]

    # Make range [-256, 256]
    raw_audio *= 256.0

    # Make minimum length available
    length = config['load_size']
    if length > raw_audio.shape[0]:
        raw_audio = np.tile(raw_audio, int(length/raw_audio.shape[0] + 1))

    # Make equal training length
    if config['phase'] != 'extract':
        raw_audio = raw_audio[:length]

    assert len(raw_audio.shape) == 1, "Audio is not mono"
    assert np.max(raw_audio) <= 256, "Audio max value beyond 256"
    assert np.min(raw_audio) >= -256, "Audio min value beyond -256"

    # Shape for network is 1 x DIM x 1 x 1
    raw_audio = np.reshape(raw_audio, [1, 1, -1, 1])

    return raw_audio.copy()


In [20]:
df_videos_label = {}
for line in open('../labels/train.csv').readlines()[1:]:
    video_id, category = line.strip().split(",")
    df_videos_label[video_id] = category
train_file_list = list(df_videos_label.keys())

## Torchaudio 사용

In [25]:
mp3_file_list = os.listdir('../mp3')
sample_audio, sr = torchaudio.load('../mp3/' + mp3_file_list[0])
sample_audio.shape

torch.Size([1, 215406])

In [26]:
preprocess(sample_audio).shape

(1, 1, 441000, 1)

In [27]:
train_df = pd.read_csv('../labels/train.csv')
torch.Tensor(train_df.Category)

tensor([ 0.,  0.,  0.,  ..., 14., 14., 14.])

## Make Dataset from mp3 files

In [28]:
class mp3_dataset(Dataset):
    def __init__(self, feat_dir, label_dir):

        file_df = pd.read_csv(label_dir)
        mp3_file_list = list(file_df.Id)
        category_list = list(file_df.Category)
        features = []
        labels = []
        error_count = 0

        for file, categ in tqdm(zip(mp3_file_list, category_list)):
            try:
                sample_audio, sr = torchaudio.load(feat_dir + str(file) + '.mp3')
                new_audio = preprocess(sample_audio)
                features.append(new_audio[0, 0, :, 0])
                labels.append(categ)
            except:                
                error_count += 1
                pass
#         features = torch.stack(features)
        labels = torch.Tensor(labels)

        if error_count > 0:
            print(f'Could not process {error_count} audio files correctly.')
        
        self.length = len(features)
        self.feats = features
        self.labels = labels


    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        
        data = self.feats[ind]
        label = self.labels[ind]

        return data, label

In [29]:
train_data = mp3_dataset('../mp3/', '../labels/train.csv')

2416it [00:12, 188.36it/s]formats: can't open input file `../mp3/NTkxNzA4MjE4OTM1ODg4NTYxOA==.mp3': No such file or directory
5381it [00:28, 181.09it/s]formats: can't open input file `../mp3/LTgxOTM5Mzg2MTMwNzM4NjQzNzg=.mp3': No such file or directory
5740it [00:30, 188.13it/s]

Could not process 2 audio files correctly.





In [30]:
val_data = mp3_dataset('../mp3/', '../labels/val.csv')
test_data = mp3_dataset('../mp3/', '../labels/test_for_students.csv')

241it [00:01, 191.38it/s]formats: can't open input file `../mp3/LTQ5ODI3NjU5MTQ3OTQ4NTAwOQ==.mp3': No such file or directory
1760it [00:09, 190.01it/s]


Could not process 1 audio files correctly.


749it [00:03, 187.83it/s]


In [74]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [32]:
for feat, l in train_loader:
    print(feat.shape)
    print(l)
    break

torch.Size([64, 441000])
tensor([ 7.,  1.,  4.,  1.,  1.,  7.,  8.,  6., 12.,  7.,  9.,  4.,  9.,  4.,
         0.,  3.,  0., 13.,  9.,  0.,  9.,  2.,  2.,  0.,  5.,  0.,  5.,  6.,
         9.,  5., 14.,  0.,  4.,  0.,  8., 12.,  4.,  5., 13.,  2.,  4., 12.,
         1.,  6.,  9.,  2., 13.,  1.,  1.,  6., 12.,  3.,  5.,  2.,  9.,  7.,
         6.,  4., 14.,  4.,  2., 11.,  0.,  4.])


## Pretrained model loading

In [75]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=1)

In [80]:
from hear21passt.base import load_model, get_scene_embeddings, get_timestamp_embeddings

model = load_model().cuda()
seconds = 20
passt_feat_all = []
labels_all = []

for feat, label in tqdm(train_loader):
    audio = feat.to(device)
    embed, time_stamps = get_timestamp_embeddings(audio, model)
#     print(embed.shape)
    embed = get_scene_embeddings(audio, model)
#     print(embed.shape)
    passt_feat_all.append(embed)
    labels_all.append(label)
    
    



 Loading PASST TRAINED ON AUDISET 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwi

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [03:38<00:00, 18.22s/it]


In [81]:
torch.vstack(passt_feat_all).shape

torch.Size([749, 1295])

In [82]:
passt_feat_all = torch.vstack(passt_feat_all)
labels_all = torch.cat(labels_all)

with open('passt_feat_train.pkl', 'wb') as f:
    pickle.dump(passt_feat_all, f)
with open('passt_label_train.pkl', 'wb') as f:
    pickle.dump(labels_all, f)

In [None]:
passt_feat_all = []
labels_all = []

for feat, label in tqdm(val_loader):
    audio = feat.to(device)
    embed, time_stamps = get_timestamp_embeddings(audio, model)
#     print(embed.shape)
    embed = get_scene_embeddings(audio, model)
#     print(embed.shape)
    passt_feat_all.append(embed)
    labels_all.append(label)
    
    
passt_feat_all = torch.vstack(passt_feat_all)
labels_all = torch.cat(labels_all)

with open('passt_feat_val.pkl', 'wb') as f:
    pickle.dump(passt_feat_all, f)
with open('passt_label_val.pkl', 'wb') as f:
    pickle.dump(labels_all, f)

In [None]:
passt_feat_all = []
labels_all = []

for feat, label in tqdm(test_loader):
    audio = feat.to(device)
    embed, time_stamps = get_timestamp_embeddings(audio, model)
#     print(embed.shape)
    embed = get_scene_embeddings(audio, model)
#     print(embed.shape)
    passt_feat_all.append(embed)
    labels_all.append(label)
    
    
passt_feat_all = torch.vstack(passt_feat_all)
labels_all = torch.cat(labels_all)

with open('passt_feat_test.pkl', 'wb') as f:
    pickle.dump(passt_feat_all, f)
with open('passt_label_test.pkl', 'wb') as f:
    pickle.dump(labels_all, f)