In [1]:
import torch
import torchaudio
import torch.nn.functional as F

import os
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from IPython.display import Audio
import time

## 1. Creating the Dataset

In [2]:
from torchaudio.transforms import Resample
from data import LibriDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dataset configuration.
SAMPLE_FREQ = 16000       # handed to LibriDataset as newSampleFreq
AUDIO_MAX_DURATION = 60   # multiplied by SAMPLE_FREQ to get the per-item sample cap
BATCH_SIZE = 12           # items per DataLoader batch

# Build the dataset from the "data_cut" directory, then wrap it in a
# shuffling DataLoader.
maxSamplesPerItem = SAMPLE_FREQ * AUDIO_MAX_DURATION
trainDataset = LibriDataset(
    "data_cut",
    newSampleFreq=SAMPLE_FREQ,
    maxLenght=maxSamplesPerItem,
)
trainDataLoader = DataLoader(
    trainDataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [4]:
# Report how many items the dataset holds and the shape of element 0 of one
# item, then render an audio player for another item as the cell output.
datasetSize = len(trainDataset)
print("Audio in the dataset: " + str(datasetSize))
itemShape = trainDataset[2][0].shape
print("Frame per item: " + str(itemShape))
Audio(data=trainDataset[40][0], rate=SAMPLE_FREQ)

Audio in the dataset: 57
Frame per item: torch.Size([960000])


## 2. HuBERT Test

In [5]:
from transformers import HubertModel,AutoProcessor,AutoFeatureExtractor,Wav2Vec2Processor,HubertForCTC
import joblib
import torch.nn as nn
from hubertKM import SemanticTokenizer, visualizeEmbeddings

#### Importing the pretrained models for HuBERT and KMeans

In [6]:
# Semantic tokenizer.
#   Constructor input: (w2vCheckpointPath, kmeansCheckpointPath)
#   Call output:       (semanticTokens, normalizedEmbeddings)
hubertCheckpoint = "facebook/hubert-base-ls960"
kmeansCheckpoint = "./hubertKM/hubert_base_ls960_L9_km500.bin"
w2vBERT = SemanticTokenizer(hubertCheckpoint, kmeansCheckpoint)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

#### Computing semantic tokens

In [7]:
# Time the semantic-token computation with gradient tracking disabled.
# NOTE: both tokenizer calls below are commented out, so as written this cell
# only measures an empty block; re-enable one of them to get a real timing.
with torch.no_grad():
    # perf_counter() is monotonic and high-resolution. time.time() follows the
    # wall clock, which can be adjusted mid-measurement and is too coarse for
    # short intervals on some platforms.
    s = time.perf_counter()
    ##semanticTokens, embeddings = w2vBERT(trainDataset[0])
    #semanticTokens, embeddings = w2vBERT(next(iter(trainDataLoader)).squeeze())
    elapsed = time.perf_counter() - s
    print(elapsed)

0.0


In [8]:
#semanticTokens, embeddings

#### Visualizing embeddings

In [9]:
# Create some plots using PCA and t-SNE

# visualizeEmbeddings(embeddings, semanticTokens)

#### Check with another implementation (OPTIONAL)

## 3. Creating the Semantic Transformer

In [10]:
from SoundStream import soundstream_16khz, audio_to_tokens, tokens_to_audio, encode_audio, decode_audio

In [11]:
# Render an audio player for element 0 of one training item at the
# dataset's sample rate.
clipIndex = 40
Audio(data=trainDataset[clipIndex][0], rate=SAMPLE_FREQ)

In [12]:
# Create the SoundStream object through the project's soundstream_16khz() factory;
# it is passed to encode_audio() further down.
# NOTE(review): presumably loads a pretrained codec for 16 kHz audio - confirm in the SoundStream module.
soundStream = soundstream_16khz()

In [13]:
# Load a single FLAC file, encode it with SoundStream, print the shape of the
# result, and render an audio player for the raw waveform as the cell output.
#
# The path is assembled with os.path.join so the cell runs on any OS; the
# previous hard-coded backslash separators only resolved on Windows.
flacPath = os.path.join("data_cut", "16", "352", "little_lame_prince_01_64kb_0000.flac")
audioWave, sampleRate = torchaudio.load(flacPath)

x = encode_audio(audioWave, sampleRate, soundStream)
print(x.shape)
Audio(audioWave, rate = sampleRate)

torch.Size([1, 151, 8])


## 4. Test: storing and loading tokens

In [14]:
from data import storeTokens,  TokensDataset

# Locations used by the token pipeline: the output directory, the CSV file
# name, and the root directory of the source audio.
tokenPath, tokenFile, audioPath = "out", "out.csv", "data_cut"

# Token extraction is left disabled; uncomment to (re)generate the tokens.
#fileCount = storeTokens(audioPath, tokenPath, tokenFile, w2vBERT, soundStream, fileCountCheckpoint = 2)

In [15]:
# Four views over the same stored tokens. They share the location and the
# required duration and differ only in which token kinds are switched on.
tokenSource = (tokenPath, tokenFile)
sharedOptions = {"requiredDuration": 30}

tokenDataset = TokensDataset(
    *tokenSource,
    includeSemanticTokens=True,
    includeCoarseTokens=True,
    includeFineTokens=True,
    **sharedOptions,
)
semanticDataset = TokensDataset(*tokenSource, includeSemanticTokens=True, **sharedOptions)
coarseDataset = TokensDataset(*tokenSource, includeCoarseTokens=True, **sharedOptions)
fineDataset = TokensDataset(*tokenSource, includeFineTokens=True, **sharedOptions)

In [16]:
# Fetch the first item of every dataset view. Plain indexing is used instead
# of calling the __getitem__ dunder directly, matching how trainDataset is
# indexed elsewhere in this notebook.
firstItem = tokenDataset[0]
firstSemantic = semanticDataset[0]
firstCoarse = coarseDataset[0]
firstFine = fineDataset[0]

# Print each element of every fetched item under its heading. A single
# data-driven loop replaces four copy-pasted ones; the printed output and its
# order are unchanged.
sections = (
    ("ALL TOKENS:", firstItem),
    ("\nSEMANTIC TOKEN:", firstSemantic),
    ("\nCOARSE TOKEN:", firstCoarse),
    ("\nFINE TOKEN:", firstFine),
)
for heading, tokens in sections:
    print(heading)
    for item in tokens:
        print(item)

ALL TOKENS:
[17, 17, 225, 391, 391, 491, 289, 26, 7, 251, 241, 241, 431, 84, 84, 16, 16, 16, 16, 182, 182, 182, 375, 375, 98, 98, 13, 229, 170, 247, 312, 187, 47, 491, 491, 491, 391, 391, 391, 73, 289, 289, 373, 72, 437, 306, 306, 306, 396, 178, 143, 192, 192, 192, 242, 242, 116, 94, 335, 14, 411, 411, 411, 171, 171, 171, 171, 252, 143, 458, 192, 483, 440, 440, 44, 44, 99, 338, 338, 338, 338, 338, 395, 395, 499, 306, 306, 306, 396, 396, 215, 35, 401, 402, 133, 364, 364, 276, 276, 109, 109, 498, 498, 498, 498, 498, 59, 37, 24, 24, 131, 393, 393, 155, 155, 332, 332, 332, 332, 372, 396, 313, 216, 216, 22, 283, 283, 455, 94, 479, 479, 331, 256, 426, 426, 426, 426, 426, 413, 413, 243, 243, 243, 270, 270, 270, 433, 433, 160, 160, 160, 160, 18, 112, 112, 225, 80, 289, 320, 320, 159, 494, 38, 31, 342, 68, 115, 273, 273, 265, 265, 265, 85, 85, 146, 146, 285, 325, 34, 134, 134, 43, 43, 109, 109, 139, 139, 139, 175, 175, 81, 277, 177, 236, 314, 401, 401, 310, 310, 107, 395, 395, 485, 329, 171, 17