In [8]:
# Reload imported modules automatically before each cell runs, so edits to the
# local `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Setup

In [9]:
import os
import time

import pandas as pd
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

from src.dataset import *
from src.train import *

# Root directory for local resources; the audio, model, dataset and CSV paths
# in the configuration cell are all built relative to it.
RES_DIR_PATH = "res"

In [10]:
# Sub-directories of the resource root: MP3 audio, saved models, cached datasets.
AUDIOS_DIR_PATH, MODELS_DIR_PATH, DATASETS_DIR_PATH = (
    os.path.join(RES_DIR_PATH, subdir) for subdir in ("mp3_data", "models", "datasets")
)

# Table describing the audio samples.
CSV_PATH = os.path.join(RES_DIR_PATH, "samples.csv")

# Pretrained checkpoint used for the feature extractor, cache naming and run name.
MODEL_NAME = "facebook/wav2vec2-base"

# Per-feature options passed to `get_csv_name` and `filter_df`.
# NOTE(review): `top_n` presumably keeps only the 5 most frequent genres — confirm
# against `filter_df`.
FEATURES_CONFIG = dict(genre=dict(top_n=5))

# NOTE(review): not referenced by any other cell visible in this notebook.
TEST_SIZE = 0.2

In [11]:
# Filesystem-friendly form of the checkpoint name ("/" becomes "-").
model_id = "-".join(MODEL_NAME.split("/"))
# Unique identifier for this run: model id plus a local-time timestamp.
run_name = "-".join((model_id, time.strftime("%Y%m%d-%H%M%S")))

# Analysis

In [12]:
# Path of the filtered sample table that corresponds to the current feature config.
filtered_csv_path = get_csv_name(CSV_PATH, FEATURES_CONFIG)

if not os.path.exists(filtered_csv_path):
    # No cached copy yet: filter the raw sample table and cache the result.
    df = filter_df(
        pd.read_csv(CSV_PATH),
        audios_dir_path=AUDIOS_DIR_PATH,
        remove_nones=True,
        features_config=FEATURES_CONFIG,
    )
    df.to_csv(filtered_csv_path, index=False)
else:
    # Reuse the filtered table written by an earlier run.
    df = pd.read_csv(filtered_csv_path)

# NOTE(review): the recorded output shows an `Unnamed: 0` column, which suggests
# one of the CSV files was written with its index — confirm and drop if unwanted.
df.head()

Unnamed: 0,genre,category,mp3_path,id
0,Hip Hop,Trumpet,res/mp3_data/01 Hip Hop/Abandoned Brass Stabs.mp3,01_Hip_Hop_Abandoned_Brass_Stabs
1,Hip Hop,Timpani,res/mp3_data/01 Hip Hop/Abandoned Orchestral L...,01_Hip_Hop_Abandoned_Orchestral_Layers
2,Hip Hop,Electronic Beats,res/mp3_data/01 Hip Hop/Afloat Beat.mp3,01_Hip_Hop_Afloat_Beat
3,Hip Hop,Synthesizer,res/mp3_data/01 Hip Hop/Afloat Pad.mp3,01_Hip_Hop_Afloat_Pad
4,Hip Hop,Synthetic Bass,res/mp3_data/01 Hip Hop/Afloat Sub Bass.mp3,01_Hip_Hop_Afloat_Sub_Bass


# Dataset

In [15]:
# On-disk location of the encoded dataset. The directory name embeds `model_id`,
# so the cache is specific to the checkpoint named by `MODEL_NAME`.
encoded_dataset_path = os.path.join(DATASETS_DIR_PATH, f"ds-{model_id}-full-encoded")
encoded_dataset_path

'res/datasets/ds-facebook-wav2vec2-base-full-encoded'

In [19]:
# The feature extractor is loaded unconditionally: the Trainer cell further down
# passes it as `tokenizer=feature_extractor`, so it has to exist even when the
# encoded dataset is restored from the on-disk cache.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

if os.path.exists(encoded_dataset_path):
    # Cache hit: reuse the encoded dataset saved by a previous run.
    ds = datasets.load_from_disk(encoded_dataset_path)
else:
    # Cache miss: build the dataset from the filtered table, attach the audio
    # column so the preprocessing function can read it, then encode and cache.
    non_encoded_ds = get_dataset(df)
    non_encoded_ds = add_audio_column(non_encoded_ds)
    print("Applying preprocessing to dataset")
    ds = non_encoded_ds.map(get_preprocess_func(feature_extractor), remove_columns=["audio"], batched=True)
    ds.save_to_disk(encoded_dataset_path)

# The cached dataset is stored without the `audio` column (it is removed during
# `map`), so the column is re-attached here for BOTH paths. Previously only the
# cache-hit path did this, which left `ds` with a different schema on a first run.
ds = add_audio_column(ds)

ds

Dataset({
    features: ['genre', 'category', 'subcategory', 'key type', 'key signature', 'time signature', 'beat count', 'Single', 'Ensemble', 'Dry', 'Processed', 'Clean', 'Distorted', 'Grooving', 'Arrhythmic', 'Acoustic', 'Electric', 'Melodic', 'Dissonant', 'Relaxed', 'Intense', 'Part', 'Fill', 'Cheerful', 'Dark', 'audio_path', 'id', 'input_values', 'audio'],
    num_rows: 29220
})

In [4]:
from src.utils import play_random_audios

# Manual sanity check: play a few random samples from the dataset.
# NOTE(review): the trailing arguments presumably mean "3 samples" and "print the
# id plus the target features for each" — confirm against `play_random_audios`.
# NOTE(review): the output recorded for this cell is `NameError: name 'ds' is not
# defined`, and its execution count (4) is lower than the dataset cell's, i.e. it
# was last run before `ds` existed in that kernel session.
# NOTE(review): the dataset cell above displays `ds` as a single `Dataset` without
# splits — confirm that indexing `ds["train"]` is valid at this point.
# NOTE(review): `TARGET_FEATURES` is not assigned anywhere in this notebook;
# presumably it comes from one of the `src` wildcard imports — confirm.
play_random_audios(ds["train"], get_dataset_label_mapping(ds["train"]), 3, print_features=["id"] + TARGET_FEATURES)

NameError: name 'ds' is not defined

# Training

In [9]:
# Turn the encoded dataset into the form used for training. The recorded output
# shows a DatasetDict with `train` and `test` splits, each reduced to the `id`,
# `label` and `input_values` columns.
# NOTE(review): `TEST_SIZE` from the configuration cell is not passed here —
# confirm how `prepare_ds` chooses the split ratio.
# NOTE(review): `TARGET_FEATURES` is not assigned anywhere in this notebook;
# presumably it comes from one of the `src` wildcard imports — confirm.
prepared_ds = prepare_ds(ds, df, TARGET_FEATURES, fixed_mapping=None, save=False)
prepared_ds

Casting the dataset: 100%|██████████| 13/13 [00:14<00:00,  1.10s/ba]
Casting the dataset: 100%|██████████| 4/4 [00:05<00:00,  1.47s/ba]


DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'input_values'],
        num_rows: 12450
    })
    test: Dataset({
        features: ['id', 'label', 'input_values'],
        num_rows: 3129
    })
})

In [10]:
# The `label` feature of the training split; its `num_classes` is later passed to
# the model as `num_labels`.
class_feature = prepared_ds["train"].features["label"]
# Label-name -> id and id -> label-name dictionaries (see the displayed output);
# they are handed to the model as `label2id` / `id2label`.
l2i, i2l = get_feature_label_mapping(class_feature)

l2i, i2l

({'Hip Hop': 0,
  'Electronic/Dance': 1,
  'Rock/Blues': 2,
  'World/Ethnic': 3,
  'Orchestral': 4},
 {0: 'Hip Hop',
  1: 'Electronic/Dance',
  2: 'Rock/Blues',
  3: 'World/Ethnic',
  4: 'Orchestral'})

In [None]:
# Load the pretrained checkpoint with an audio-classification head sized to the
# number of classes in the prepared dataset. `MODEL_NAME` is used rather than a
# repeated string literal so the model always matches the checkpoint that the
# feature extractor, the dataset cache path and the run name are derived from.
model = AutoModelForAudioClassification.from_pretrained(
    MODEL_NAME,
    num_labels=class_feature.num_classes,
    label2id=l2i,
    id2label=i2l,
)

In [None]:
# Settings for the fine-tuning run; evaluation and checkpointing both happen
# once per epoch.
training_args = TrainingArguments(
    # Bookkeeping
    output_dir="out",
    run_name=run_name,
    logging_steps=50,
    # Schedule
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Batch sizes (per device)
    per_device_train_batch_size=256,
    per_device_eval_batch_size=512,
)

In [None]:
import wandb

# Wire the model, training arguments and the train/test splits into a Trainer.
# `feature_extractor` comes from the dataset cell further up; this cell raises a
# NameError if that cell did not define it in the current kernel session.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    tokenizer=feature_extractor,
    compute_metrics=get_metrics_func(),
)

# Run training, then close the Weights & Biases run. `wandb.finish()` is only
# reached when `train()` returns without raising.
trainer.train()
wandb.finish()

In [None]:
# Save the trained model under the models directory, in a folder named after this run.
trainer.save_model(os.path.join(MODELS_DIR_PATH, run_name))