In [1]:
# import soundfile as sf
# import librosa
# data, sr = sf.read('1.wav')
# data = librosa.resample(data[:,0],sr,16000)

In [2]:
# data.shape[0]

In [3]:
config = {
    "batch_size":24,
    "beam_width" : 2,
    "lr" : 1e-5,
    "weight_decay": 0,
    "epochs" : 100
    } # Feel free to add more items here

In [4]:
# from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
# import torch
import torch
import torch.nn as nn
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
from torch.nn.utils.rnn import pad_sequence
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)
torch.manual_seed(0)
# model_name = "facebook/wav2vec2-large-xlsr-53"
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
# model_audio = Wav2Vec2Model.from_pretrained(model_name)

# i= feature_extractor(data, return_tensors="pt", sampling_rate=16000)
# #previous are in dataloader

# with torch.no_grad():
#   o= model_audio(i.input_values)
# print(o.keys())
# print(o.last_hidden_state.shape)
# print(o.extract_features.shape)
# print(i.input_values.numpy().shape)

  from .autonotebook import tqdm as notebook_tqdm


Device:  cuda


<torch._C.Generator at 0x7f99644b9870>

In [5]:
from sentence_transformers import SentenceTransformer
model_text = SentenceTransformer('paraphrase-MiniLM-L6-v2')

#Sentences we want to encode. Example:
# sentence = ['This framework generates embeddings for each input sentence.']


#Sentences are encoded by calling model.encode()
# text_fea = model_text.encode(sentence)

In [6]:
import json

class RASDataset(torch.utils.data.Dataset):

    def __init__(self, root, file_pth, partition = 'train', subset= None): 
        # Load the directory and all files in them
        f = open(file_pth)
        self.data_json = json.load(f)
        f.close()

        self.length = len(self.data_json)       
        self.base_path = os.path.join(root, partition)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        cur_dict = self.data_json[idx]
        name = str(cur_dict['id'])

        audio_feat = np.load(os.path.join(self.base_path, name+'_feat.npy'))
        audio_feat = np.squeeze(audio_feat)

        text = [cur_dict['description']]
        text_feat = model_text.encode(text)
        text_feat = np.squeeze(text_feat)

        target = np.load(os.path.join(self.base_path, name+'_target.npy'))
        target = np.squeeze(target)
        
        audio_feat = torch.FloatTensor(audio_feat)
        text_feat = torch.FloatTensor(text_feat)
        target = torch.LongTensor(target)

        sample = {
                  "audio_feat": audio_feat,
                  "text_feat": text_feat,
                  "target": target
                }

        return sample


    def collate_fn(self,batch):

        batch_audio = [i["audio_feat"] for i in batch]
        batch_text = [i["text_feat"] for i in batch]
        batch_target = [i["target"] for i in batch]

        batch_audio_pad = pad_sequence(batch_audio, batch_first=True)
        lengths_audio = [i.shape[0] for i in batch_audio]

        batch_target_pad = pad_sequence(batch_target, batch_first=True)
        lengths_target = [i.shape[0] for i in batch_target]

        batch_audio_pad = torch.FloatTensor(batch_audio_pad)
        batch_text = torch.stack(batch_text)
        batch_target_pad = torch.LongTensor(batch_target_pad)

        return batch_audio_pad, batch_text, batch_target_pad, torch.tensor(lengths_audio), torch.tensor(lengths_target)

       

In [7]:
root = './'
train_data = RASDataset(root, 'train_combined.json', partition= "train")
val_data = RASDataset(root, 'val_combined.json', partition= "val")

train_loader = torch.utils.data.DataLoader(
    dataset     = train_data, 
    num_workers = 0,
    batch_size  = config['batch_size'], 
    collate_fn = train_data.collate_fn,
    pin_memory  = True,
    shuffle     = True
)

val_loader = torch.utils.data.DataLoader(
    dataset     = val_data, 
    num_workers = 0,
    batch_size  = config['batch_size'],
    collate_fn = val_data.collate_fn,
    pin_memory  = True,
    shuffle     = False
)

print("Batch size: ", config['batch_size'])
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))

Batch size:  24
Train dataset samples = 7500, batches = 313
Val dataset samples = 1500, batches = 63


In [8]:
for data in train_loader:
    audio, text, target, laudio, ltarget = data
    print(audio.shape, text.shape, target.shape, laudio.shape, ltarget.shape)
    break 

torch.Size([24, 1497, 512]) torch.Size([24, 384]) torch.Size([24, 1497]) torch.Size([24]) torch.Size([24])


In [9]:
class Attention(nn.Module):
    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.Tanh(),
        )
        self.linear = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        outputs = self.linear(self.fc(x))
        # print(outputs.size())
        alpha = torch.softmax(outputs, dim=2)
        x = (x * alpha)
        return x

class RASModel(torch.nn.Module):

    def __init__(self, embed_dim, num_heads):
        super().__init__()

        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.audio_linear = nn.Linear(512, self.embed_dim)
        self.text_linear = nn.Linear(384, 384)
        # self.attention = nn.MultiheadAttention(self.embed_dim+384, self.num_heads, batch_first=True)
        self.attention = Attention(self.embed_dim+384,self.embed_dim+384) #nn.MultiheadAttention(self.embed_dim+384, self.num_heads, batch_first=True)
        self.fc1 = nn.Linear(self.embed_dim+384, 2)
        self.fc2 = nn.Linear(512,124)
        self.fc3 = nn.Linear(124,2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.rnn = nn.GRU(self.embed_dim+384,self.embed_dim+384,2,batch_first=True)

        
    
    def forward(self, audio_fea, text_fea):

        # audio_fea = self.audio_linear(audio_fea)
        # text_fea = self.text_linear(text_fea)

        B, T, D = audio_fea.size()

        text_fea_rep = text_fea.repeat(T, 1) #B,512 -> T,B,512
        text_fea_rep = text_fea_rep.reshape(B,T,-1)

        x = torch.cat((audio_fea, text_fea_rep), 2)

        # x, _ = self.attention(x, x, x)
        # x, _ = self.rnn(x)
        # print(rnn_output.size())
        x = self.attention(x)
        # print(att_output.size())
        linear_attn = self.fc1(x)
        # print(linear_attn.size())
        return linear_attn

In [10]:
model = RASModel(512, 8).to(device)
summary(model, audio.to(device), text.to(device))

                          Kernel Shape     Output Shape    Params Mult-Adds
Layer                                                                      
0_attention.fc.Linear_0     [896, 896]  [24, 1497, 896]  803.712k  802.816k
1_attention.fc.Tanh_1                -  [24, 1497, 896]         -         -
2_attention.Linear_linear     [896, 1]    [24, 1497, 1]     896.0     896.0
3_fc1                         [896, 2]    [24, 1497, 2]    1.794k    1.792k
----------------------------------------------------------------------------
                        Totals
Total params          806.402k
Trainable params      806.402k
Non-trainable params       0.0
Mult-Adds             805.504k


  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_attention.fc.Linear_0,"[896, 896]","[24, 1497, 896]",803712.0,802816.0
1_attention.fc.Tanh_1,-,"[24, 1497, 896]",,
2_attention.Linear_linear,"[896, 1]","[24, 1497, 1]",896.0,896.0
3_fc1,"[896, 2]","[24, 1497, 2]",1794.0,1792.0


In [11]:
class weighted_log_loss(nn.Module):    
    def __init__(self):
        super(weighted_log_loss,self).__init__()
        self.LOSS_BIAS = 0.2

    def forward(self, yt, yp):   
        pos_loss = -(0 + yt) * torch.log(0 + yp + 1e-7)
        neg_loss = -(1 - yt) * torch.log(1 - yp + 1e-7)

        return self.LOSS_BIAS * torch.mean(neg_loss) + (1. - self.LOSS_BIAS) * torch.mean(pos_loss)


In [12]:
import numpy as np
criterion = nn.CrossEntropyLoss((torch.FloatTensor([0.45, 1]).to(device)))
# criterion = nn.L1Loss()

optimizer =  torch.optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay']) # What goes in here?
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8)


In [13]:
from sklearn.metrics import f1_score

def train(model, dataloader, optimizer, criterion):

    model.train()
    tloss, tacc = 0, 0 # Monitoring loss and accuracy
    
    prob_all = []
    label_all = []

    batch_bar   = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')
    
    for i, (audio_fea, text_fea, target, audio_len, target_len) in enumerate(dataloader):
        
        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        audio_fea = audio_fea.to(device)
        text_fea  = text_fea.to(device)
        target = target.to(device)

        ### Forward Propagation
        logits  = model(audio_fea, text_fea).permute(0,2,1)

        ### Loss Calculation
        # for j in range(logits.shape[2]):
        #     if j==0:
        #         loss  = criterion(logits[:,:,j], target[:,j])
        #     else:
        #         loss  += criterion(logits[:,:,j], target[:,j])
        loss = criterion(logits,target)
        ### Backward Propagation
        loss.backward() 
        
        ### Gradient Descent
        optimizer.step()       

        tloss   += loss.item()
        for j in range(logits.shape[0]):
            # print(torch.argmax(logits[j,:,:audio_len[j]], dim= 0).shape)
            # print(target[j,:target_len[j]].shape)
            tacc += torch.mean(torch.argmax(logits[j,:,:audio_len[j]], dim= 0) == target[j,:target_len[j]], dtype=torch.float32).item()
            prob_all.extend(np.argmax(logits.detach().cpu().numpy()[j,:,:audio_len[j]], axis= 0)) #求每一行的最大值索引
            label_all.extend(target.detach().cpu().numpy()[j,:target_len[j]])
            # print(f1_score(label_all,prob_all))
        # if i%100==0:
        #     print(np.sum(torch.argmax(logits, dim= 1).cpu().numpy()))
        #     print(np.sum(target.cpu().numpy()))

        batch_bar.set_postfix(loss="{:.04f}".format(float(tloss / (i + 1))), 
                              acc="{:.04f}%".format(float(tacc*100 / (i + 1) / logits.shape[0])),
                              f1="{:.04f}%".format(float(f1_score(label_all,prob_all)*100)))
        batch_bar.update()

        ### Release memory
        del audio_fea, text_fea, target
        torch.cuda.empty_cache()
  
    batch_bar.close()
    tloss   /= len(train_loader)
    tacc    /= len(train_loader)*config['batch_size']
    tf1  = f1_score(label_all,prob_all)
    return tloss, tacc, tf1


def eval(model, dataloader):

    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy

    prob_all = []
    label_all = []

    batch_bar   = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (audio_fea, text_fea, target, audio_len, target_len) in enumerate(dataloader):
        ### Move Data to Device (Ideally GPU)
        audio_fea = audio_fea.to(device)
        text_fea  = text_fea.to(device)
        target = target.to(device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode(): 
            ### Forward Propagation
            logits  = model(audio_fea, text_fea).permute(0,2,1)
            ### Loss Calculation
            # for j in range(logits.shape[2]):
            #     if j==0:
            #         loss  = criterion(logits[:,:,j], target[:,j])
            #     else:
            #         loss  += criterion(logits[:,:,j], target[:,j])
            loss    = criterion(logits, target)
            # loss = criterion(torch.argmax(logits, dim=1),target)


        vloss   += loss.item()
        for j in range(logits.shape[0]):
            vacc += torch.mean(torch.argmax(logits[j,:,:audio_len[j]], dim= 0) == target[j,:target_len[j]], dtype=torch.float32).item()
            prob_all.extend(np.argmax(logits.detach().cpu().numpy()[j,:,:audio_len[j]], axis= 0)) #求每一行的最大值索引
            label_all.extend(target.detach().cpu().numpy()[j,:target_len[j]])

        # Do you think we need loss.backward() and optimizer.step() here?

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))), 
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1) / logits.shape[0])),
                              f1="{:.04f}%".format(float(f1_score(label_all,prob_all)*100)))

        batch_bar.update()
    
        ### Release memory
        del audio_fea, text_fea, target
        torch.cuda.empty_cache()

    batch_bar.close()
    vloss   /= len(val_loader)
    vacc    /= len(val_loader)*config['batch_size']
    vf1   = f1_score(label_all,prob_all)

    return vloss, vacc, vf1

In [14]:
best_acc = 0

for epoch in range(config['epochs']):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    curr_lr                 = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc, train_f1   = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc, val_f1   = eval(model, val_loader)
    scheduler.step()

    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\tTrain F1 {:.04f}\t Learning Rate {:.07f}".format(train_acc*100,\
                                                                                                       train_loss, train_f1*100, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}\tVal F1 {:.04f}".format(val_acc*100, val_loss, val_f1*100))
    
    torch.save(model.state_dict(), 'model.pkl')

    if val_acc>best_acc:
        torch.save(model.state_dict(), 'best.pkl')
        best_acc = val_acc
    ### Log metrics at each epoch in your run 
    # Optionally, you can log at each batch inside train/eval functions 
    # (explore wandb documentation/wandb recitation)
    # wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss, 
            #    'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})    
    # wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss, 'lr': curr_lr})

    ### Highly Recommended: Save checkpoint in drive and/or wandb if accuracy is better than your current best

### Finish your wandb run
# run.finish()


Epoch 1/100


                                                                                                

	Train Acc 49.8304%	Train Loss 0.7334	Train F1 40.1473	 Learning Rate 0.0000100
	Val Acc 48.0579%	Val Loss 0.7194	Val F1 41.3413

Epoch 2/100


                                                                                                

	Train Acc 48.6303%	Train Loss 0.7079	Train F1 41.2786	 Learning Rate 0.0000100
	Val Acc 47.8246%	Val Loss 0.6998	Val F1 41.6647

Epoch 3/100


                                                                                                

	Train Acc 48.4697%	Train Loss 0.6907	Train F1 41.4267	 Learning Rate 0.0000100
	Val Acc 47.3160%	Val Loss 0.6858	Val F1 42.1752

Epoch 4/100


Train:  97%|█████████▋| 304/313 [13:54<00:47,  5.29s/it, acc=48.3128%, f1=41.5768%, loss=0.6789]