In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (here: the project code under src/) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Setup

In [None]:
import os
import time

import pandas as pd

# NOTE(review): `datasets` is used later in this notebook but only reaches the
# namespace through these wildcard imports — consider importing it explicitly.
from src.dataset import *
from src.train import *
from src.utils import *

# Root directory for notebook resources; the audio, model, dataset and CSV
# paths below are all built from it. Relative, so it resolves against the
# kernel's working directory.
RES_DIR_PATH = "res"

In [None]:
# --- Resource locations (all rooted at RES_DIR_PATH) -------------------------
CSV_PATH = os.path.join(RES_DIR_PATH, "samples.csv")
AUDIOS_DIR_PATH = os.path.join(RES_DIR_PATH, "mp3_data")
DATASETS_DIR_PATH = os.path.join(RES_DIR_PATH, "datasets")
MODELS_DIR_PATH = os.path.join(RES_DIR_PATH, "models")

# --- Target features ---------------------------------------------------------
# Keys are used as DataFrame column names later in the notebook.
# NOTE(review): the meaning of "top_n" / "samples" is defined by the src helpers
# that consume this config — confirm there.
FEATURES_CONFIG = {
    "genre": dict(top_n=5, samples=None),
}

# --- Model / training setup --------------------------------------------------
# "feature_encoder" is also used to name the run and the cached dataset folder.
TRAINING_CONFIG = dict(
    feature_encoder="wav2vec2",
    freeze_encoder=False,
    classifier=dict(layers=[256], dropout=0),
)

# --- Split sizes -------------------------------------------------------------
# NOTE(review): names suggest the validation / test share of the data — confirm
# how the splitting code consumes them.
VALID_SIZE = 0.1
TEST_SIZE = 0.1

In [None]:
# Human-readable run identifier: <encoder>-<YYYYmmdd-HHMMSS>-<description>.
run_descr = "test"
_encoder_name = TRAINING_CONFIG["feature_encoder"]
_run_timestamp = time.strftime("%Y%m%d-%H%M%S")
run_name = f"{_encoder_name}-{_run_timestamp}-{run_descr}"

# Analysis

In [None]:
# Path returned by get_csv_name for this feature config and CSV_PATH; it acts
# as the on-disk cache for the filtered sample table.
filtered_csv_path = get_csv_name(FEATURES_CONFIG, CSV_PATH)

if not os.path.exists(filtered_csv_path):
    # No cached copy yet: filter the raw sample table and persist the result
    # so later runs can skip the filtering step.
    df = filter_df(
        pd.read_csv(CSV_PATH),
        audios_dir_path=AUDIOS_DIR_PATH,
        remove_nones=True,
        features_config=FEATURES_CONFIG,
    )
    df.to_csv(filtered_csv_path, index=False)
else:
    # Reuse the previously filtered table.
    df = pd.read_csv(filtered_csv_path)

df.head()

In [None]:
# Print the value distribution of every configured target feature, framed by
# 50-character rules so the sections are easy to tell apart.
rule_major = "=" * 50
rule_minor = "-" * 50

for feature_name in FEATURES_CONFIG:
    print(rule_major)
    print(feature_name)
    print(rule_minor)
    print(df[feature_name].value_counts())

# Dataset

In [None]:
# The cached, encoded dataset lives in a folder named after the feature encoder.
_encoded_ds_dirname = f"ds-{TRAINING_CONFIG['feature_encoder']}-full-encoded"
encoded_dataset_path = os.path.join(DATASETS_DIR_PATH, _encoded_ds_dirname)
encoded_dataset_path

In [None]:
# Load the pre-encoded dataset from disk when it exists; otherwise build it
# from the filtered sample table, encode it once and cache it for later runs.
if os.path.exists(encoded_dataset_path):
    ds = datasets.load_from_disk(encoded_dataset_path)
else:
    non_encoded_ds = get_dataset(df)
    non_encoded_ds = add_audio_column(non_encoded_ds, TRAINING_CONFIG)
    print("Applying preprocessing to dataset")
    # The "audio" column is dropped by the map, so it is not part of the copy
    # written to disk.
    ds = non_encoded_ds.map(get_preprocess_func(TRAINING_CONFIG), remove_columns=["audio"], batched=True)
    ds.save_to_disk(encoded_dataset_path)

# Apply add_audio_column on BOTH paths. Previously only the cached branch did
# this, so a first run and a cached re-run left `ds` with different columns.
# NOTE(review): assumes the cached-branch behaviour is the intended one — confirm.
ds = add_audio_column(ds, TRAINING_CONFIG)

ds

In [None]:
# play_random_audios(ds["train"], get_dataset_label_mapping(ds["train"]), 3, print_features=["id"] + TARGET_FEATURES)

# Training

In [None]:
# Pass the hold-out fraction as VALID_SIZE + TEST_SIZE instead of a hard-coded
# 0.2, so the split follows the config constants. With the current values
# (0.1 + 0.1) the argument is still exactly 0.2.
# NOTE(review): assumes the 4th positional argument of prepare_ds is the
# combined validation + test fraction — confirm against its definition in src.
prepared_ds = prepare_ds(ds, df, FEATURES_CONFIG, VALID_SIZE + TEST_SIZE, fixed_mapping=None, save=False)
prepared_ds

In [None]:
# Build the model from the training config and the prepared dataset; the bare
# `model` expression below shows its repr in the cell output.
model = get_model(TRAINING_CONFIG, prepared_ds)
model

In [None]:
# Gather the trainer arguments first, then build the trainer from them.
# Training uses the "train" split and evaluates on the "valid" split.
trainer_kwargs = dict(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    feature_extractor=None,
    output_dir="out",
    debug=True,
    env="jupyter",
)
trainer = get_trainer(**trainer_kwargs)

In [None]:
# Run the training loop, then hand the run name, the trainer and the models
# directory to end_training.
# NOTE(review): end_training presumably saves/finalizes the run under
# MODELS_DIR_PATH — confirm against its definition in src.
trainer.train()
end_training(run_name, trainer, MODELS_DIR_PATH)