In [1]:
!pip install -q pysndfx SoundFile audiomentations pretrainedmodels efficientnet_pytorch resnest

In [2]:
import numpy as np
import librosa as lb
import librosa.display as lbd
import soundfile as sf
from  soundfile import SoundFile
import pandas as pd
from  IPython.display import Audio
from pathlib import Path

import torch
from torch import nn, optim
from  torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from matplotlib import pyplot as plt

import os, random, gc
import re, time, json
from  ast import literal_eval


from IPython.display import Audio
from sklearn.metrics import label_ranking_average_precision_score

from tqdm.notebook import tqdm
import joblib

from resnest.torch import resnest50
from efficientnet_pytorch import EfficientNet
import pretrainedmodels
import resnest.torch as resnest_torch

In [3]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
DATA_ROOT = Path("../input/birdclef-2021")
TRAIN_AUDIO_ROOT = Path("../input/birdclef-2021/train_short_audio")
TRAIN_AUDIO_IMAGES_ROOT = Path("../input/kkiller-birdclef-2021/audio_images")

NUM_CLASSES = 397
SR = 32_000
DURATION = 7

MAX_READ_SAMPLES = 5 # Each record will have 10 melspecs at most, you can increase this on Colab with High Memory Enabled

###
MEL_PATHS = sorted(Path("../input").glob("kkiller-birdclef-mels-computer-d7-part?/rich_train_metadata.csv"))
TRAIN_LABEL_PATHS = sorted(Path("../input").glob("kkiller-birdclef-mels-computer-d7-part?/LABEL_IDS.json"))
MODEL_ROOT = Path(".")

TRAIN_BATCH_SIZE = 50
TRAIN_NUM_WORKERS = 2

VAL_BATCH_SIZE = 50
VAL_NUM_WORKERS = 2

In [5]:
##df 불러오기

def get_df(mel_paths=MEL_PATHS, train_label_paths=TRAIN_LABEL_PATHS):
    df = None
    LABEL_IDS = {}
    
    # mel_paths = rich_train_metadata
    for file_path in mel_paths:
        temp = pd.read_csv(str(file_path), index_col=0)
        temp["impath"] = temp.apply(lambda row: file_path.parent/"audio_images/{}/{}.npy".format(row.primary_label, row.filename), axis=1) 
        df = temp if df is None else df.append(temp)
    
    df["secondary_labels"] = df["secondary_labels"].apply(literal_eval)

    for file_path in train_label_paths:
        with open(str(file_path)) as f:
            LABEL_IDS.update(json.load(f))
        
    return LABEL_IDS, df

In [6]:
LABEL_IDS, df = get_df()

In [7]:
##LABEL_IDS

In [8]:
##df.head()

In [9]:
## df.loc[df['rating'] < 2.5]

In [10]:
##이름순으로 총 387종류 있고, primary label에 최대 500회까지 있고 없는애는 8번밖에 없음
## print(df["primary_label"].value_counts())
## print(df["label_id"].min(), df["label_id"].max())

In [11]:
## filename 이랑 impath랑 dic으로 매칭시
def load_data(df):
    def load_row(row):
        # impath = TRAIN_IMAGES_ROOT/f"{row.primary_label}/{row.filename}.npy"
        # filename == @@@.ogg,     impath == .npy
        return row.filename, np.load(str(row.impath))[:MAX_READ_SAMPLES]
    
    pool = joblib.Parallel(4) ## 병렬처리로 속도개선 하기 위함
    mapper = joblib.delayed(load_row) ##얘랑 parallel이랑 이유는 모르겠는데 같이 씀 나중에 공부해볼것
    
    tasks = [mapper(row) for row in df.itertuples(False)]
    res = pool(tqdm(tasks))
    res = dict(res)
    
    return res

In [12]:
# We cache the train set to reduce training time

audio_image_store = load_data(df)

  0%|          | 0/62874 [00:00<?, ?it/s]

In [13]:
##dic으로 np array랑 filename이랑 매치돼 있는 녀석, 전체 파일이라 62874개있음
len(audio_image_store)

62874

In [14]:
## Training을 위한 dataset으로 만들기
class BirdClefDataset(Dataset):

    def __init__(self, audio_image_store, meta, sr=SR, is_train=True, num_classes=NUM_CLASSES, duration=DURATION):
        
        self.audio_image_store = audio_image_store
        self.meta = meta.copy().reset_index(drop=True)
        self.sr = sr
        self.is_train = is_train
        self.num_classes = num_classes
        self.duration = duration
        self.audio_length = self.duration*self.sr
    
    @staticmethod
    def normalize(image):
        image = image.astype("float32", copy=False) / 255.0
        image = np.stack([image, image, image])
        return image

    def __len__(self):
        return len(self.meta)
    
    def __getitem__(self, idx):
        row = self.meta.iloc[idx]
        image = self.audio_image_store[row.filename]

        image = image[np.random.choice(len(image))]
        image = self.normalize(image)
        
        
        t = np.zeros(self.num_classes, dtype=np.float32) + 0.0025 # Label smoothing
        t[row.label_id] = 0.995
        
        return image, t

In [15]:
# ds 는 image랑 label smoothing 된

ds = BirdClefDataset(audio_image_store, meta=df, sr=SR, duration=DURATION, is_train=True)
len(ds)

62874

In [16]:
#ds[?] 로 랜덤으로 ?받아서 확인해봅시다.
# x, y = ds[np.random.choice(len(ds))]

# x는 RGB (128,281) numpy,   y는 label_Smooth 된 label
# x.shape, y.shape, np.where(y >= 0.5)

In [17]:
##lbd.specshow(x[0])

# Model 만들것!! ----------------------210429

In [18]:
def Convlayer(in_ch, out_ch, kernel_size=3, stride=2, use_leaky = True, use_inst_norm=True, use_pad=True):
    
    if use_pad:
        conv = nn.Conv2d(in_ch, out_ch, kernel_size, stride, 1, bias=True)
    else:
        conv = nn.Conv2d(in_ch, out_ch, kernel_size, stride, 0, bias=True)
     
    
    if use_inst_norm:
        norm = nn.InstanceNorm2d(out_ch)
    else:
        norm = nn.BatchNorm2d(out_ch)
        
        
    if use_leaky:
        actv = nn.LeakyReLU(negative_slope=0.2, inplace=True)
    else:
        actv = nn.GELU()
    
    return nn.Sequential(conv, norm, actv)

In [19]:
class Resblock(nn.Module):
    def __init__(self, in_features, use_dropout=True, dropout_ratio=0.5):
        super().__init__()
        models = list()
        
        models.append(Convlayer(in_features, in_features, 1, 1, use_leaky=True, use_inst_norm=False, use_pad=False))
        models.append(nn.ReflectionPad2d(1))
        models.append(Convlayer(in_features, in_features, 3, 1, use_leaky=True, use_inst_norm=False, use_pad=False))
        models.append(nn.ReflectionPad2d(1))
        models.append(Convlayer(in_features, in_features, 3, 1, use_leaky=True, use_inst_norm=False, use_pad=False))
        
        self.res = nn.Sequential(*models)
        
    
    def forward(self, x):
        x = x + self.res(x)
        x = F.relu(x)
        return x

In [20]:
def get_model(name, num_classes = NUM_CLASSES):
    
    '''
    model = getattr(resnest_torch, name)(pretrained=True)
    nb_ft = model.fc.in_features
    print(nb_ft)
    
    model.fc = nn.Linear(nb_ft, num_classes)
    '''
    #xb = 100,3,128,281
    #o = torch.Size([100, 397, 42, 93])
    #yb = 100,397
    
    if 'resnet' in name: ##resnet 쓰고싶으면
        model = torch.hub.load('pytorch/vision:v0.6.0', name, pretrained=True) ##python ver에 따라서 v0.6.0이나 0.9.0 등 고를것
        model.fc = nn.Linear(model.fc.in_features, num_classes)

    else: ##내가 만든 전용 모델 쓰고싶으면
        model = list()
        model.append(Resblock(3))
        model.append(Convlayer(3, 1, 3, 3, use_leaky=False, use_inst_norm=False, use_pad=False))
        model.append(nn.Flatten())
        model.append(nn.Linear(3906,397)) ##3906
        model = nn.Sequential(*model)
    
    
    return model

In [21]:
def one_step(xb, yb, net, criterion, optimizer, scheduler = None):
    torch.cuda.empty_cache()
    
    xb, yb = xb.to(DEVICE), yb.to(DEVICE)
    
    
    ## xb는 numpy, y는 label, net은 resnet등의 seqential
    optimizer.zero_grad()
    o = net(xb) #o는 자료 넣은 예측값
    
    
    loss = criterion(o, yb)
    loss.backward()
    optimizer.step()
    
    with torch.no_grad():
        l = loss.item()
        
        o = o.sigmoid() ##@@ 다른 활성화 쓸것
        
        yb = (yb > 0.5) * 1.0 ## threshold인듯
        
        ##sklearn에서 label smooth 쉽게하는 놈
        lrap = label_ranking_average_precision_score(yb.cpu().numpy(), o.cpu().numpy())
        
        o = (o > 0.5) * 1.0
        
        prec = (o*yb).sum()/(1e-6 + o.sum())
        rec = (o*yb).sum()/(1e-6 + yb.sum())
        f1 = 2*prec*rec/(1e-6+prec+rec)
        
    if  scheduler is not None:
        scheduler.step()
    
    
    return l, lrap, f1.item(), rec.item(), prec.item()

In [22]:
@torch.no_grad()  ##함수 데코레이터 : evaluate를 torch.no_Grad인 상태로 실해하겠다.
def evaluate(net, criterion, val_loader):
    
    
    net.eval()
    
    os, y = [], []
    val_loader = tqdm(val_loader, leave=False, total = len(val_loader))
    
    for icount, (xb, yb) in enumerate(val_loader):
        val_loader.set_description("in val_loader ")
        torch.cuda.empty_cache()
        
        y.append(yb.to(DEVICE))
        
        xb = xb.to(DEVICE)
        o = net(xb)
        
        os.append(o)
    
    # y, os를 list에서 torch로 변경
    y = torch.cat(y)
    o = torch.cat(os)
    
    l = criterion(o, y).item()
    
    o = o.sigmoid()
    y = (y > 0.5)*1.0

    lrap = label_ranking_average_precision_score(y.cpu().numpy(), o.cpu().numpy())

    o = (o > 0.5)*1.0

    prec = ((o*y).sum()/(1e-6 + o.sum())).item()
    rec = ((o*y).sum()/(1e-6 + y.sum())).item()
    f1 = 2*prec*rec/(1e-6+prec+rec)

    return l, lrap, f1, rec, prec, 

In [23]:
def one_epoch(net, criterion, optimizer, scheduler, train_loader, val_loader):
    net.train()
    l, lrap, prec, rec, f1, icount = 0., 0., 0., 0., 0., 0 
    train_loader = tqdm(train_loader, leave = False)
    epoch_bar = train_loader
    

    for (xb, yb) in epoch_bar:
        epoch_bar.set_description("for one step ")
        
        ##epoch 마다 step 한번씩 거치게 설계했음
        _l, _lrap, _f1, _rec, _prec = one_step(xb, yb, net, criterion, optimizer)
        
        l += _l
        lrap += _lrap
        f1 += _f1
        rec += _rec
        prec += _prec
        
        icount += 1
        
        
        if hasattr(epoch_bar, "set_postfix") and not icount%10:
            epoch_bar.set_postfix(
                loss="{:.6f}".format(l/icount),
                lrap="{:.3f}".format(lrap/icount),
                prec="{:.3f}".format(prec/icount),
                rec="{:.3f}".format(rec/icount),
                f1="{:.3f}".format(f1/icount),
            )
    
    scheduler.step()
    
    ##평균치 내서
    l /= icount
    lrap /= icount
    f1 /= icount
    rec /= icount
    prec /= icount
    
    #eval에 넣어서 나온 값
    l_val, lrap_val, f1_val, rec_val, prec_val = evaluate(net, criterion, val_loader)
    
    return (l, l_val), (lrap, lrap_val), (f1, f1_val), (rec, rec_val), (prec, prec_val)

In [24]:
##auto save 뺐음 나중에 제출할때 넣을것
class AutoSave:
  def __init__(self, top_k=2, metric="f1", mode="min", root=None, name="ckpt"):
    self.top_k = top_k
    self.logs = []
    self.metric = metric
    self.mode = mode
    self.root = Path(root or MODEL_ROOT)
    assert self.root.exists()
    self.name = name

    self.top_models = []
    self.top_metrics = []

  def log(self, model, metrics):
    metric = metrics[self.metric]
    rank = self.rank(metric)

    self.top_metrics.insert(rank+1, metric)
    if len(self.top_metrics) > self.top_k:
      self.top_metrics.pop(0)

    self.logs.append(metrics)
    self.save(model, metric, rank, metrics["epoch"])


  def save(self, model, metric, rank, epoch):
    t = time.strftime("%Y%m%d%H%M%S")
    name = "{}_epoch_{:02d}_{}_{:.04f}_{}".format(self.name, epoch, self.metric, metric, t)
    name = re.sub(r"[^\w_-]", "", name) + ".pth"
    path = self.root.joinpath(name)

    old_model = None
    self.top_models.insert(rank+1, name)
    if len(self.top_models) > self.top_k:
      old_model = self.root.joinpath(self.top_models[0])
      self.top_models.pop(0)      

    torch.save(model.state_dict(), path.as_posix())

    if old_model is not None:
      old_model.unlink()

    self.to_json()


  def rank(self, val):
    r = -1
    for top_val in self.top_metrics:
      if val <= top_val:
        return r
      r += 1

    return r
  
  def to_json(self):
    # t = time.strftime("%Y%m%d%H%M%S")
    name = "{}_logs".format(self.name)
    name = re.sub(r"[^\w_-]", "", name) + ".json"
    path = self.root.joinpath(name)

    with path.open("w") as f:
      json.dump(self.logs, f, indent=2)

In [25]:
##한 fold는 나눠진 dataset당 epoch 다 돌리는거의미

def one_fold(model_name, fold, train_set, val_set, epochs=20, save=True, save_root=None):
    
    save_root = Path(save_root) or MODEL_ROOT
    saver = AutoSave(root=save_root, name=f"birdclef_{model_name}_fold{fold}", metric="f1_val")
    
    
    net = get_model(model_name).to(DEVICE)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), lr=8e-4)
    
    #lr 스케쥴러 -> Set the learning rate of each parameter group using a cosine annealing schedule
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=1e-5, T_max=epochs)
    
    
    
    ###여기서부터 data 불러오는 과정
    #Train Mel_img를 Dataset으로 불러옴
    train_data = BirdClefDataset(audio_image_store, meta=df.iloc[train_set].reset_index(drop=True),
                           sr=SR, duration=DURATION, is_train=True)
    
    #Train dataloader 예시 (batchsize, 3, 256, 256) 처럼 불러오게 됨
    train_loader = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, num_workers=TRAIN_NUM_WORKERS, shuffle=True, pin_memory=True)
    
    
    ##Val
    val_data = BirdClefDataset(audio_image_store, meta=df.iloc[val_set].reset_index(drop=True),  sr=SR, duration=DURATION, is_train=False)
    val_loader = DataLoader(val_data, batch_size=VAL_BATCH_SIZE, num_workers=VAL_NUM_WORKERS, shuffle=False, pin_memory=True)
    
    
    ####진행####
    epochs_bar = tqdm(list(range(epochs)), leave=False)
    for epoch in epochs_bar:
        epochs_bar.set_description(f"--> [EPOCH {epoch:02d}]")
        net.train()
        
        
        ##한 epoch 돌려서 return합시다
        (l, l_val), (lrap, lrap_val), (f1, f1_val), (rec, rec_val), (prec, prec_val) \
        = one_epoch(net=net,
                    criterion=criterion,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    train_loader=train_loader,
                    val_loader=val_loader,
                   )
        
        
        epochs_bar.set_postfix(
            loss="({:.6f}, {:.6f})".format(l, l_val),
            prec="({:.3f}, {:.3f})".format(prec, prec_val),
            rec="({:.3f}, {:.3f})".format(rec, rec_val),
            f1="({:.3f}, {:.3f})".format(f1, f1_val),
            lrap="({:.3f}, {:.3f})".format(lrap, lrap_val),
        )
        
        
        print(
            "[{epoch:02d}] loss: {loss} lrap: {lrap} f1: {f1} rec: {rec} prec: {prec}".format(
                epoch=epoch,
                loss="({:.6f}, {:.6f})".format(l, l_val),
                prec="({:.3f}, {:.3f})".format(prec, prec_val),
                rec="({:.3f}, {:.3f})".format(rec, rec_val),
                f1="({:.3f}, {:.3f})".format(f1, f1_val),
                lrap="({:.3f}, {:.3f})".format(lrap, lrap_val)
            )
        )
        if save:
            metrics = {
                "loss": l, "lrap": lrap, "f1": f1, "rec": rec, "prec": prec,
                "loss_val": l_val, "lrap_val": lrap_val, "f1_val": f1_val, "rec_val": rec_val, "prec_val": prec_val,
                "epoch": epoch,
            }
            
            saver.log(net, metrics)

In [26]:
def train(model_name, epochs = 10, save= True, n_splits=5, seed=177, save_root=None, suffix="", folds=None):
    gc.collect()
    torch.cuda.empty_cache()
    
    ##Save 관련...
    save_root = save_root or MODEL_ROOT/f"{model_name}{suffix}"
    save_root.mkdir(exist_ok=True, parents=True)
    
    fold_bar = tqdm(df.reset_index().groupby("fold").index.apply(list).items(), total=df.fold.max()+1)
    
    for fold, val_set in fold_bar:
        if folds and not fold in folds:
            continue
            
        print(f"\n############################### [FOLD {fold}]")
        fold_bar.set_description(f"[FOLD {fold}]")
        train_set = np.setdiff1d(df.index, val_set)
        
        one_fold(model_name, fold=fold, train_set=train_set , val_set=val_set , epochs=epochs, save=save, save_root=save_root)
        
        gc.collect()
        torch.cuda.empty_cache()

In [28]:
train("resnet152", epochs=4, suffix=f"_sr{SR}_d{DURATION}_v1_v1", folds=[0])

  0%|          | 0/5 [00:00<?, ?it/s]


############################### [FOLD 0]


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.6.0
Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /root/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth


  0%|          | 0.00/230M [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1006 [00:00<?, ?it/s]

  0%|          | 0/252 [00:00<?, ?it/s]

[00] loss: (0.031898, 0.029382) lrap: (0.103, 0.201) f1: (0.003, 0.029) rec: (0.002, 0.015) prec: (0.062, 0.551)


  0%|          | 0/1006 [00:00<?, ?it/s]

  0%|          | 0/252 [00:00<?, ?it/s]

[01] loss: (0.028110, 0.027806) lrap: (0.323, 0.360) f1: (0.074, 0.140) rec: (0.040, 0.079) prec: (0.620, 0.632)


  0%|          | 0/1006 [00:00<?, ?it/s]

  0%|          | 0/252 [00:00<?, ?it/s]

[02] loss: (0.026075, 0.025941) lrap: (0.505, 0.515) f1: (0.251, 0.304) rec: (0.151, 0.189) prec: (0.824, 0.779)


  0%|          | 0/1006 [00:00<?, ?it/s]

  0%|          | 0/252 [00:00<?, ?it/s]

[03] loss: (0.024536, 0.024447) lrap: (0.628, 0.633) f1: (0.415, 0.473) rec: (0.274, 0.326) prec: (0.888, 0.860)


In [29]:
TEST_AUDIO_ROOT = Path("../input/birdclef-2021/test_soundscapes")
SAMPLE_SUB_PATH = "../input/birdclef-2021/sample_submission.csv"
TARGET_PATH = None
    
if not len(list(TEST_AUDIO_ROOT.glob("*.ogg"))):
    TEST_AUDIO_ROOT = Path("../input/birdclef-2021/train_soundscapes")
    SAMPLE_SUB_PATH = None
    # SAMPLE_SUB_PATH = "../input/birdclef-2021/sample_submission.csv"
    TARGET_PATH = Path("../input/birdclef-2021/train_soundscape_labels.csv")

In [30]:
data = pd.DataFrame(
    [(path.stem, *path.stem.split("_"), path) for path in Path(TEST_AUDIO_ROOT).glob("*.ogg")],
columns = ['filename', 'id', 'site', 'date', 'filepath'])

print(data.shape)

(20, 5)


In [None]:
##
'''
Thanks for the decent notebook, it really helped me a lot! :)
Here is my question.
As far as I understand, 6 images were generated when one audio file is about 30 seconds long.
And in BirdClefDataset, you chose only one random image from these 6 images of one audio file and used it to train model. Is that correct?
If it is correct, then is there any reasons for using only one random image? not all images?

I'm a beginner and I might not fully understand your code.
Please point me out if I was wrong :)
'''

In [None]:
##일단 체크포인트 ##@@로 해놨음

In [None]:
회자 식별 관련 연구같은거 보면 좋음

1. https://academy.allaboutbirds.org/the-language-of-birds/
-> 주파수 별로 쪼개서(고주파, 저주파, 전) learning 시킬것
-> 진폭과 주파수를 다 고려할것
https://www.youtube.com/watch?v=2FMeJdYh5Sc
-> a) spectrogram -> CNN /2차원
   b) without feature extraction frame level 로 -> 1d cnn
   a,b 를 score level ensemble
    
    
2. 사람의 음성이 아니기때문에 mel 차원으로 할 필요 없을듯

2. 아니면 차원수를 늘려서 RGB처럼 n차원으로(고주파, 저주파, 전체) 분석하기
https://www.youtube.com/watch?v=g0pvHkq-BIA
    
    
3.음성신호의 noise 제거하기
 ->VAD(voice activity detection) 으로 무음구간의 프레임 음향특성제거
 ->고민을 좀 해봐야 할듯

4.분류 모델이기 때문에 다양한 손실함수 사용하고 비교해볼것
 ->labelsmoothing
    
5.Concrete Drop out 등 drop out technique 사용해보기
6.Sound Mixup tech 사용해보기 -> 메모장에 있음
7.TS learning 활용
https://www.youtube.com/watch?v=2FMeJdYh5Sc
    
8.활성화 함수(GELU, Leaky중에서)
9. model에 residual block 사용하기
10. pooling 은 overfitting 에 유리하다, padding은 정보압축해서 속도면에서 유리하다
11. optimizer 은 SGD, AMSGrad 등등 고려

In [None]:
'''data loader 내부 확인
dataiter = iter(데이터로더 이름)
images, labels = dataiter.next()
print(images.shape, labels.shape)