# Chronology and age analysis

## Import dependencies, set constants etc.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, ParameterSampler
from joblib import Parallel, delayed

from src import utils
from src.features.features_utils import PhaseCorrection, BeatUpbeatRatio, IOIComplexity, TempoSlope, ProportionalAsynchrony, RollingIOISummaryStats
from src.detect.detect_utils import OnsetMaker
from src.visualise.random_forest_plots import *

  from pandas import Int64Index as NumericIndex


In [2]:
# Settings shared by the optimization process
# Seed handed to NumPy's random number generator
SEED: int = 42
N_FOLDS: int = 5
N_JOBS: int = -1
# How many iterations to use in random sampling
N_ITER: int = 10_000

In [3]:
# Seed NumPy's global random number generator with SEED, for consistent results across function calls
np.random.seed(SEED)

In [None]:
# Get the filepath for our birth/death dates list


## Load in data

First, we load in our list of `src.detect.detect_utils.OnsetMaker` instances. These contain the locations of detected onsets and beats, as well as additional metadata.

In [None]:
# Use forward slashes in the path: they are accepted on Windows as well as on POSIX systems,
# whereas a hard-coded backslash is only treated as a separator on Windows
onsets: list[OnsetMaker] = utils.unserialise_object(f'{utils.get_project_root()}/models/matched_onsets_corpus_chronology')

## Extract features

Now, we can extract our desired features from each `OnsetMaker` instance.

In [8]:
def get_feature_data(feature_cls, cols, extra_str='', **cls_kwargs):
    """Instantiates `feature_cls` with `cls_kwargs` and picks the entries named in `cols` from its summary dictionary.

    Every key that is kept has `extra_str` appended to it; entries not named in `cols` are dropped.
    """
    extractor = feature_cls(**cls_kwargs)
    selected = {}
    for name, value in extractor.summary_dict.items():
        # Skip anything the caller did not ask for
        if name not in cols:
            continue
        selected[name + extra_str] = value
    return selected

In [9]:
def process_track(track: OnsetMaker) -> dict:
    """Processes a single track, extracting all required features, and returns a dictionary

    Only the first instrument in `utils.INSTRUMENTS_TO_PERFORMER_ROLES` is treated as the instrument of
    interest, with the remaining instruments acting as its partners. An earlier version of this function
    looped over every instrument but returned unconditionally at the end of the first iteration, so the
    result is unchanged: the selection is simply explicit now.

    Arguments:
        track: the `OnsetMaker` to process; its `summary_dict`, `ons`, `tempo` and `item` attributes are read

    Returns:
        dict: the contents of `track.item`, the selected summary values from every feature, and the tempo
            under the key `tempo`. `None` is returned if `utils.INSTRUMENTS_TO_PERFORMER_ROLES` is empty.

    """
    # Convert the summary dictionary (dictionary of arrays) to a dataframe
    summary_df = pd.DataFrame(track.summary_dict)
    # These are the positions of downbeats, i.e. the first beat of a measure
    downbeats = track.ons['downbeats_manual']
    # The tempo and time signature of the track
    tempo = track.tempo
    time_signature = track.item['time_signature']
    instruments = list(utils.INSTRUMENTS_TO_PERFORMER_ROLES)
    if not instruments:
        # Nothing to process: mirrors the implicit `None` of a loop that never runs
        return None
    # NOTE(review): the hard-coded 'piano' column names below suggest that the first instrument
    # is expected to be the piano -- confirm against `utils.INSTRUMENTS_TO_PERFORMER_ROLES`
    exog_ins = instruments[0]
    partners = [ins for ins in instruments if ins != exog_ins]
    # Subset to get my onsets and partner onsets as separate dataframes
    my_onsets = track.ons[exog_ins]
    my_beats = summary_df[exog_ins]
    their_beats = summary_df[partners]
    # BEAT-UPBEAT RATIO
    bur = get_feature_data(
        BeatUpbeatRatio, ['bur_log_mean', 'bur_log_std', 'bur_log_count_nonzero'],
        my_onsets=my_onsets, my_beats=my_beats, clean_outliers=True
    )
    # PHASE CORRECTION
    pc = get_feature_data(
        PhaseCorrection, ['self_coupling', 'coupling_bass', 'coupling_drums', 'nobs'],
        my_beats=my_beats, their_beats=their_beats, order=1
    )
    # PHASE CORRECTION - PARTNER
    # In comparison to the 'full' phase correction model, we only need to get a few columns here
    pcb = get_feature_data(
        PhaseCorrection, ['coupling_piano', 'nobs'], extra_str='_bass',
        my_beats=summary_df['bass'], their_beats=summary_df[['piano', 'drums']], order=1
    )
    pcd = get_feature_data(
        PhaseCorrection, ['coupling_piano', 'nobs'], extra_str='_drums',
        my_beats=summary_df['drums'], their_beats=summary_df[['piano', 'bass']], order=1
    )
    # PROPORTIONAL ASYNCHRONY
    pa = get_feature_data(
        ProportionalAsynchrony,
        [
            'piano_prop_async_count_nonzero',
            'piano_bass_prop_async_nanmean',
            'piano_drums_prop_async_nanmean',
            'piano_bass_prop_async_nanstd',
            'piano_drums_prop_async_nanstd',
        ],
        summary_df=summary_df, my_instr_name=exog_ins
    )
    # IOI COMPLEXITY
    ioi = get_feature_data(
        IOIComplexity, ['lz77_mean', 'lz77_std', 'n_onsets_mean', 'n_onsets_std'],
        my_onsets=my_onsets, downbeats=downbeats, tempo=tempo, time_signature=time_signature
    )
    # TEMPO SLOPE
    # The beats passed in are the row-wise mean across this instrument and its partners
    ts = get_feature_data(
        TempoSlope, ['tempo_slope', 'tempo_drift'],
        my_beats=pd.concat([my_beats, their_beats], axis=1).mean(axis=1)
    )
    # TEMPO STABILITY
    tstab = get_feature_data(
        RollingIOISummaryStats, ['rolling_std_count_nonzero', 'rolling_std_median'],
        my_onsets=my_beats, downbeats=downbeats, bar_period=4
    )
    # Return a single dictionary that combines the summary dictionary for all the features
    # `dict(**...)` is kept deliberately: it raises if two sources share a key, rather than silently overwriting
    return dict(**track.item, **bur, **pc, **pcb, **pcd, **pa, **ioi, **ts, **tstab, tempo=tempo)

Now, we extract features from all tracks in parallel (should take < 5 minutes)

In [10]:
# Use the notebook-wide N_JOBS setting rather than repeating the worker count here
with Parallel(n_jobs=N_JOBS, verbose=5) as parallel:
    # One task per track; `res` holds one result from `process_track` per element of `onsets`
    res = parallel(delayed(process_track)(t) for t in onsets)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.7min finished
