# Beat-Upbeat Ratio Distributions

## Import dependencies, set constants etc.

In [None]:
import warnings

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import scipy.signal as signal
import statsmodels.formula.api as smf
from sklearn.mixture import GaussianMixture
from statsmodels.tools.sm_exceptions import ConvergenceWarning

from src import utils
import src.visualise.visualise_utils as vutils
from src.visualise.bur_plots import *
from src.features.features_utils import FeatureExtractor

In [2]:
# These variables are used for the optimization process
SEED = 42
N_FOLDS = 5
N_JOBS = -1
N_BOOT = 999

In [3]:
# Upper and lower bounds to use when thresholding BURs
BUR_UPPER = 4.0
BUR_LOWER = 0.25

In [4]:
# Set the seed in NumPy for consistent results across function calls
np.random.seed(SEED)

## Load in data
We start by loading in the data we extracted from our `corpus_chronology` in `src\features\extract_features.py`. This gives us a list of `FeatureExtractor` classes (defined in `src\features\features_utils.py`).

In [5]:
features: list[FeatureExtractor] = utils.unserialise_object(fr'{utils.get_project_root()}\models\extracted_features_corpus_chronology')

In [6]:
res = []
# Iterate through each track
for track in features:
    # Iterate through each instrument
    for instr in utils.INSTRUMENTS_TO_PERFORMER_ROLES.keys():
        # Iterate through each BUR by that instrument
        for bur in track.BURs[instr].bur['burs'].dropna().to_list():
            # Append a new dictionary
            res.append(dict(mbz_id=track.metadata[instr]['mbz_id'], bur=np.log2(bur), instrument=instr, tempo=track.metadata[instr]['tempo'], bandleader=track.metadata[instr]['pianist']))
burs = pd.DataFrame(res)

## Clean data
We drop BUR values lower than 0.25 and higher than 4 (see Corcoran & Frieler, 2021)

In [7]:
burs = burs[(burs['bur'] > np.log2(BUR_LOWER)) & (burs['bur'] < np.log2(BUR_UPPER))]

In [8]:
burs.groupby('instrument')['bur'].mean()

instrument
bass     0.056211
drums    0.721039
piano    0.268612
Name: bur, dtype: float64

Now we order our dataframe so that the instruments are in the correct order (piano -> bass -> drums)

In [9]:
burs = (
    burs.set_index('instrument')
    .loc[utils.INSTRUMENTS_TO_PERFORMER_ROLES.keys()]
    .reset_index(drop=False)
)

## Plot the average BUR per instrument

In [None]:
BarPlotBUR(burs).create_plot()
plt.show()

## Compute the KDE and extract peaks

In [11]:
def get_peaks(data, len_data: int = 1000, **kwargs) -> np.ndarray:
    """Fits a kernel-density estimate to BUR data and extracts BUR peaks"""
    # Fit the actual KDE to the data, using the default parameters
    kde = stats.gaussian_kde(data.T, bw_method='silverman')
    # Create a linear space of integers ranging from our lowest to our highest BUR
    data_plot = np.linspace(data.min(), data.max(), len_data)[:, np.newaxis]
    # Evaluate the KDE on our linear space of integers
    kde_eval = kde.evaluate(data_plot.T)
    # Find the peaks from our fitted KDE
    peaks, _ = signal.find_peaks(kde_eval, **kwargs)
    # Return the sorted peaks from our KDE: this will be an array of BUR values
    return np.sort(data_plot[peaks].flatten())

In [12]:
def bootstrap_peaks(data: np.array, actual_peaks: np.array, tol: float = 0.5) -> dict:
    """Bootstrap confidence intervals for an array of peaks"""
    # We create a dictionary with lists for storing our bootstrapped peaks
    boot_res = {pe: [] for pe in actual_peaks}
    # We perform a few operations here, so we don't have to re-do them every loop
    size = len(data)
    data = data.flatten()
    # For every bootstrapped sample
    for i_ in range(1):
        print(i_)
        # Set the random seed
        np.random.seed(i_)
        # Take a random sample of our BURs, with replacement, and reshape
        boot = np.random.choice(data, replace=True, size=size).reshape(-1, 1)
        # Get the peaks for our bootstrapped sample
        boot_peaks = set(get_peaks(boot))
        # Iterate through the actual peaks of our BURs
        for peak in actual_peaks:
            # Try and find the closest value in our bootstrapped set
            try:
                closest_peak = min(boot_peaks, key=lambda x: abs(x - peak))
            # If we've run out of bootstrapped peaks, break out and finish this sample early
            except ValueError:
                break
            else:
                # If the distance between our actual peak and the nearest bootstrapped peak exceeds our threshold
                if abs(closest_peak - peak) > tol:
                    # Then continue on to the next actual peak, without matching
                    continue
                else:
                    # Otherwise, match our actual peak with this closest peak and remove it from the set
                    boot_res[peak].append(closest_peak)
                    boot_peaks.remove(closest_peak)
    # Return a dictionary with
    return {k: [np.nanpercentile(v, 5), np.nanpercentile(v, 95)] for k, v in boot_res.items()}

In [13]:
res_ = []
for instr, grp in burs.groupby('instrument', sort=False):
    X = grp['bur'].to_numpy().reshape(-1, 1)
    grp_peaks = get_peaks(X)
    ci_peaks = bootstrap_peaks(X, grp_peaks)
    for num, (actual, (low, high)) in enumerate(ci_peaks.items()):
        res_.append(dict(
            instrument=instr,
            peak_num=num,
            peak=actual,
            low=low,
            high=high
        ))
peaks_df = pd.DataFrame(res_)

0
0
0


In [14]:
peaks_df['raw_peak'] = 2 ** peaks_df['peak']
peaks_df

Unnamed: 0,instrument,peak_num,peak,low,high,raw_peak
0,piano,0,0.334334,0.31031,0.31031,1.260796
1,bass,0,0.010515,0.018505,0.018505,1.007315
2,bass,1,1.081162,1.065182,1.065182,2.115739
3,drums,0,-0.678679,-0.678679,-0.678679,0.624737
4,drums,1,1.163163,1.167167,1.167167,2.239479


## Plot the BUR distribution with density curve and peaks

In [None]:
HistPlotBURByInstrument(burs, peaks_df).create_plot()
plt.show()

## Create GMMs using given number of components

In [16]:
for instr in ['bass', 'drums']:
    data = peaks_df[peaks_df['instrument'] == instr]
    peaks = data['peak'].to_numpy().reshape(-1, 1)
    n_components = len(data)
    gm = GaussianMixture(
        random_state=42,
        n_components=n_components,
        means_init=peaks
    )
    gm.fit(burs[burs['instrument'] == instr]['bur'].to_numpy().reshape(-1, 1))
    means = gm.means_.flatten()
    means_init = gm.means_init.flatten()
    weights = gm.weights_
    print(gm.means_.flatten(), means_init, weights)

[-0.34517166  1.04498657] [0.01051499 1.08116154] [0.71126845 0.28873155]
[-0.42768708  1.03950669] [-0.67867868  1.16316316] [0.21705932 0.78294068]


## Model average tempo vs BUR

In [None]:
average = burs.groupby(['instrument', 'mbz_id']).agg(dict(bur=['mean', 'count'], tempo='median', bandleader='first')).reset_index(drop=False)
average.columns = ['_'.join(col).strip() for col in average.columns.values]
average = average[average['bur_count'] > 15]
average['tempo_standard'] = (average['tempo_median'] - average['tempo_median'].mean()) / average['tempo_median'].std()

In [109]:
md = smf.mixedlm(
    "bur_mean~tempo_standard*C(instrument_, Treatment(reference='piano'))",
    data=average,
    groups=average['bandleader_first'],
    re_formula="0 + tempo_standard + C(instrument_, Treatment(reference='piano'))"
).fit()
print(md.summary())

                                                         Mixed Linear Model Regression Results
Model:                                             MixedLM                                 Dependent Variable:                                 bur_mean 
No. Observations:                                  751                                     Method:                                             REML     
No. Groups:                                        10                                      Scale:                                              0.0693   
Min. group size:                                   68                                      Log-Likelihood:                                     -100.8164
Max. group size:                                   81                                      Converged:                                          Yes      
Mean group size:                                   75.1                                                                                     

In [110]:
# Variance explained by the fixed effects: we need to use md.predict() with the underlying data to get this
var_fixed = md.predict().var()
# Variance explained by the random effects
var_random = float(md.cov_re.to_numpy().mean())
# Variance of the residuals
var_resid = md.scale
# Total variance of the model
total_var = var_fixed + var_random + var_resid
# Calculate the r2 values and append to the model
print('conditional_r2:', (var_fixed + var_random) / total_var)
print('marginal_r2:', var_fixed / total_var)

conditional_r2: 0.6069052210554511
marginal_r2: 0.5860129491841163


In [111]:
stddev = np.std([v.iloc[0] for v in md.random_effects.values()])
print('stdev of bandleader groups:', stddev)

stdev of bandleader groups: 0.1354858567269509


## Plot average BUR vs tempo

In [None]:
class RegPlotBURTempo(vutils.BasePlot):
    """Creates a graph showing tempo vs mean BUR, with marginal distributions"""
    # Disable convergence and user warnings here, raised when the model is created with bootstrapping
    warnings.simplefilter('ignore', ConvergenceWarning)
    warnings.simplefilter('ignore', UserWarning)
    # Initial attributes for plotting
    BURS_WITH_IMAGES = [0.5, 1, 2]
    BUR_THRESHOLD = 15
    N_BOOT = 100
    BIN_MULTIPLER = 1.5
    # These are keywords that we pass into our given plot types
    LINE_KWS = dict(lw=vutils.LINEWIDTH * 2, ls=vutils.LINESTYLE)
    FILL_KWS = dict(lw=0, ls=vutils.LINESTYLE, alpha=vutils.ALPHA)
    SCATTER_KWS = dict(
        hue_order=utils.INSTRUMENTS_TO_PERFORMER_ROLES.keys(),
        palette=vutils.RGB, markers=['o', 's', 'D'], s=40,
        edgecolor=vutils.BLACK, zorder=1
    )
    HIST_KWS = dict(
        kde=False, color=vutils.BLACK, alpha=vutils.ALPHA,
        lw=vutils.LINEWIDTH, ls=vutils.LINESTYLE
    )
    # Attributes for our model
    MODEL = "bur_mean ~ tempo_standard * C(instrument_, Treatment(reference='piano'))"
    RE_FORMULA = "0 + tempo_standard + C(instrument_, Treatment(reference='piano'))"

    def __init__(self, bur_df: pd.DataFrame, **kwargs):
        """Called when initialising the class"""
        self.corpus_title = 'corpus_chronology'
        # Initialise the base plot with our given kwargs
        super().__init__(figure_title=fr'bur_plots\regplot_burtempo_{self.corpus_title}', **kwargs)
        # Format the dataframe
        self.average = self._format_df(bur_df)
        # Create our initial model, using the actual data
        self.md = self._mixedlm(self.average)
        # Create our gridded plots
        self.fig, self.ax = plt.subplots(
            nrows=2, ncols=2, figsize=(vutils.WIDTH, vutils.WIDTH / 2),
            gridspec_kw=dict(width_ratios=(11, 1), height_ratios=(1, 5)),
        )
        # The main ax for plotting the regression/scatter plot
        self.main_ax = self.ax[1, 0]
        # Marginal ax, for plotting histograms
        self.marginal_ax = np.array([self.ax[0, 0], self.ax[1, 1]])
        # Top right corner ax, which we can go ahead and disable
        self.ax[0, 1].axis('off')

    def _format_df(self, bur_df: pd.DataFrame) -> pd.DataFrame:
        """Formats the dataframe of raw BUR values"""
        # Group by instrument and track, get the mean BUR value and number of BUR values
        clean = (
            bur_df.groupby(['instrument', 'mbz_id'])
            .agg(dict(bur=['mean', 'count'], tempo='median', bandleader='first'))
            .reset_index(drop=False)
        )
        # This resets the multi index and column names
        clean.columns = ['_'.join(col).strip() for col in clean.columns.values]
        # Drop BURs without enough values
        clean = clean[clean['bur_count'] > self.BUR_THRESHOLD]
        # Standardise the tempo into Z-scores and return
        clean['tempo_standard'] = (clean['tempo_median'] - clean['tempo_median'].mean()) / clean['tempo_median'].std()
        return clean

    def add_bur_images(self, y):
        """Adds images for required BUR values"""
        # Iterate through all of our BUR values
        for x in self.BURS_WITH_IMAGES:
            # Try and get the image of the notation type for this BUR value
            try:
                img = plt.imread(fr'{utils.get_project_root()}\references\images\bur_notation\bur_{x}.png')
            except FileNotFoundError:
                pass
            # If we can get the image, then yield it to add to our plot
            else:
                yield mpl.offsetbox.AnnotationBbox(
                    mpl.offsetbox.OffsetImage(img, clip_on=False, zoom=0.5), (y, np.log2(x)),
                    frameon=False, xycoords='data', clip_on=False, annotation_clip=False
                 )

    def _mixedlm(self, model_data: pd.DataFrame):
        """Creates a mixed effects model with given parameters from a dataset"""
        return smf.mixedlm(
            self.MODEL, data=model_data, groups=model_data['bandleader_first'], re_formula=self.RE_FORMULA
        ).fit()

    def _get_line(self, model):
        """Creates data for a straight line by predicting values from a mixed effects model"""
        # Get our intercept and tempo parameters from the model
        intercept = model.params['Intercept']
        tempo = model.params['tempo_standard']
        # Get our bass parameters from the model
        is_bass = model.params["C(instrument_, Treatment(reference='piano'))[T.bass]"]
        is_bass_tempo = model.params["tempo_standard:C(instrument_, Treatment(reference='piano'))[T.bass]"]
        # Get our drums parameters from the model
        is_drums = model.params["C(instrument_, Treatment(reference='piano'))[T.drums]"]
        is_drums_tempo = model.params["tempo_standard:C(instrument_, Treatment(reference='piano'))[T.drums]"]
        # Get our piano parameters from the model
        is_piano = 0
        is_piano_tempo = 0
        # This is the range of values we'll be iterating through, taken from the actual results
        low_, high_ = int(np.floor(self.average['tempo_median'].min())), int(np.ceil(self.average['tempo_median'].max()))
        mean_, std_ = np.array(range(low_, high_)).mean(), np.array(range(low_, high_)).std()
        # Iterate through each BPM in our range
        for bpm in range(low_, high_):
            # Standardise the BPM (Z-score) according to the observed values
            bpm_z = (bpm - mean_) / std_
            tempo_coeff = tempo * bpm_z
            # Iterate through each instrument and both coefficients
            for instr_, coeff_, interact_ in zip(
                utils.INSTRUMENTS_TO_PERFORMER_ROLES.keys(),
                [is_piano, is_bass, is_drums],
                [is_piano_tempo, is_bass_tempo, is_drums_tempo]
            ):
                # Construct the BUR value by following the regression equation
                bur_ = intercept + tempo_coeff + coeff_ + (interact_ * bpm_z)
                # Yield a dictionary of the results
                yield dict(tempo=bpm, tempo_std=bpm_z, instr=instr_, bur=bur_)

    def _format_bootstrap_lines(self, boot_models: list):
        """Formats data from a series of bootstrapped models into one dataframe of errors"""
        # Get a straight line for each bootstrapped model and combine into one dataframe
        big = pd.concat([pd.DataFrame(self._get_line(boot)) for boot in boot_models], axis=1)
        # Iterate through each tempo value
        for idx, row in big.iterrows():
            sem = stats.sem(row['bur'].to_numpy())
            # Get the standard error of the mean of the row
            # Return a dictionary of results
            yield dict(
                tempo=row['tempo'].iloc[0],
                instr=row['instr'].iloc[0],
                sem=sem,
                low_ci=np.percentile(row['bur'], 2.5),
                high_ci=np.percentile(row['bur'], 97.5)
            )

    def _get_bootstrapped_sample(self):
        """Returns bootstrapped samples of the full dataset"""
        def bootstrap(state: int):
            """Bootstrapping function"""
            # Take a random sample of bandleaders and iterate through each
            for _, leader in bandleaders.sample(frac=1, replace=True, random_state=state).items():
                # Get all the data belonging to each bandleader
                yield self.average[self.average['bandleader_first'] == leader]

        # These are the names of all bandleaders
        bandleaders = pd.Series(self.average['bandleader_first'].unique())
        for i in range(self.N_BOOT):
            # Print the current iteration to act as a log
            print(i)
            # Return each bootstrapped sample as a single dataframe
            yield pd.concat(bootstrap(i), axis=0)

    def _create_main_plot(self):
        """Plots regression and scatter plot onto the main axis, with bootstrapped errorbars"""
        # Get the line for the actual data
        line_df = pd.DataFrame(self._get_line(self.md))
        # Bootstrap to get random samples, replacement unit is bandleader
        boot_samples = self._get_bootstrapped_sample()
        # Create model for each sample of data
        boot_mds = [self._mixedlm(sample) for sample in boot_samples]
        # Convert all bootstrapped models into one single dataframe of errors
        boot_lines = pd.DataFrame(self._format_bootstrap_lines(boot_mds))
        # Iterate through each instrument and line color
        for instr_, col_ in zip(utils.INSTRUMENTS_TO_PERFORMER_ROLES.keys(), vutils.RGB):
            # First temporary dataframe: our actual data for this instrument
            temp_ = line_df[line_df['instr'] == instr_]
            # Plot the actual data
            self.main_ax.plot(temp_['tempo'], temp_['bur'], color=col_, **self.LINE_KWS)
            # Second temporary dataframe: our bootstrapped data for this instrument
            temp_boot_ = boot_lines[boot_lines['instr'] == instr_]
            # Calculate the upper and lower bounds as mean +/- standard error
            low_b = temp_['bur'] - (temp_boot_['sem'] * 1.96)
            high_b = temp_['bur'] + (temp_boot_['sem'] * 1.96)
            # Fill between the low and high bounds
            self.main_ax.fill_between(temp_boot_['tempo'], temp_boot_['low_ci'], temp_boot_['high_ci'], color=col_, **self.FILL_KWS)
        # Create the scatter plot
        sns.scatterplot(
            data=self.average, x='tempo_median', y='bur_mean', style='instrument_',
            ax=self.main_ax, hue='instrument_', **self.SCATTER_KWS
        )

    def _create_marginal_plots(self):
        """Plots histograms and density estimates onto the marginal axis"""
        # Top marginal plot
        sns.histplot(
            data=self.average, x='tempo_median', ax=self.marginal_ax[0],
            bins=int(vutils.N_BINS * self.BIN_MULTIPLER),  **self.HIST_KWS
        )
        # Right marginal plot
        sns.histplot(
            data=self.average, y='bur_mean', ax=self.marginal_ax[1],
            bins=int(vutils.N_BINS / self.BIN_MULTIPLER),  **self.HIST_KWS
        )

    def _create_plot(self):
        """Creates the main and marginal plots"""
        self._create_main_plot()
        self._create_marginal_plots()

    def _format_marginal_ax(self):
        """Formats axis-level properties for marginal axis"""
        # Remove correct spines from marginal axis
        for spine, ax in zip(['left', "bottom"], self.marginal_ax.flatten()):
            ax.spines[[spine, 'right', 'top']].set_visible(False)
        # Set other features for the main axis
        self.marginal_ax[0].set(
            xlabel='', ylabel='', yticks=[0], yticklabels=[''], xticklabels=[], xlim=(100, 310),
            xticks=[100, 150, 200, 250, 300]
        )
        self.marginal_ax[1].set(
            xlabel='', ylabel='', xticks=[0], xticklabels=[''], yticklabels=[], ylim=(-1.35, 1.7),
            yticks=[-1, 0, 1]
        )

    def _format_main_ax(self):
        """Formats axis-level properties for the main axis"""
        # Add BUR images onto the right-hand side of the main plot
        for artist in self.add_bur_images(y=305):
            self.main_ax.add_artist(artist)
        # Add a grid onto the plot
        self.main_ax.grid(visible=True, axis='both', which='major', zorder=0, **vutils.GRID_KWS)
        # Get our legend handles, and set their edge color to black
        hand, _ = self.main_ax.get_legend_handles_labels()
        for ha in hand:
            ha.set_edgecolor(vutils.BLACK)
        # Remove the old legend, then add the new one on
        self.main_ax.get_legend().remove()
        self.main_ax.legend(
            hand, [i.title() for i in utils.INSTRUMENTS_TO_PERFORMER_ROLES.keys()],
            loc='lower left', title='Instrument', frameon=True, framealpha=1,
            edgecolor=vutils.BLACK
        )
        # Final attributes to set here
        self.main_ax.set(
            xticks=[100, 150, 200, 250, 300], yticks=[-1, 0, 1], xlim=(100, 310),
            xlabel='Mean Tempo (BPM)', ylabel='Mean ${Log_2}$ BUR', ylim=(-1.35, 1.7)
        )

    def _format_ax(self):
        """Formats axis-level properties"""
        # Run code for formatting main and marginal ax separately
        self._format_main_ax()
        self._format_marginal_ax()
        # These lines of code apply to every ax on the plot
        for a in [self.main_ax, *self.marginal_ax.flatten()]:
            plt.setp(a.spines.values(), linewidth=vutils.LINEWIDTH)
            a.tick_params(axis='both', bottom=True, width=vutils.TICKWIDTH)

    def _format_fig(self):
        """Format figure-level properties"""
        self.fig.subplots_adjust(left=0.05, right=0.99, top=0.99, bottom=0.09, hspace=0.1, wspace=0.05)


RegPlotBURTempo(burs).create_plot()
plt.show()