In [1]:
import pandas as pd
import numpy as np
from os import walk
from tokenizing_functions import convert_to_note_items, get_file_and_dirnames
from analysis_functions import check_duration, get_duration_count, get_durations_in_bins, check_triole_seq, get_triole_count, compute_shifts, get_shifts_count
from tqdm import tqdm
import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Relative path of the folder that contains one sub-folder per dataset variant.
PATH_TRANSPOSED = "../0_data/4_preprocessed_sets"

In [2]:
# Number of positions that 4 beats are divided into by the time-shift analysis.
POSITION_STEPS = 16
# Tick resolution per beat (presumably the resolution of the preprocessed files - TODO confirm).
TICKS_PER_BEAT = 1024
# Smallest duration step: 4 beats split into 32 parts (128.0 ticks; true division yields a float).
TICKS_PER_MIN_DURATION = (4 * TICKS_PER_BEAT) / 32


def _duration_bins(n_bins):
    """Return the first ``n_bins`` multiples of TICKS_PER_MIN_DURATION as an integer array."""
    # +1 so that the last multiple itself is still included by the half-open arange.
    upper_edge = (TICKS_PER_MIN_DURATION * n_bins) + 1
    return np.arange(TICKS_PER_MIN_DURATION, upper_edge, TICKS_PER_MIN_DURATION, dtype=int)


# 64 bins: 128, 256, ..., 8192 ticks.
DURATION_BINS_LONG = _duration_bins(64)
# 32 bins: 128, 256, ..., 4096 ticks.
DURATION_BINS_SHORT = _duration_bins(32)

In [3]:
# Registry of the four dataset variants a, b, c, d: each entry stores the
# variant's folder name and, once filled below, its sorted list of file names.
_dataset_folder_names = {
    "a": "a)_4_4_metric_120_bpm",
    "b": "b)_transposed_key",
    "c": "c)_transposed_octave",
    "d": "d)_transposed_key_and_octave",
}
abcd_dic = {
    variant: {"name": folder_name, "files": []}
    for variant, folder_name in _dataset_folder_names.items()
}

# Collect the file names of every variant.
# NOTE: `dir` shadows the builtin, but later cells read this exact name, so it is kept.
dir = "17_POP909-Dataset-master"
for key, entry in abcd_dic.items():
    # The second return value (presumably the sub-directory names - verify in
    # tokenizing_functions) is not needed here.
    files, _ = get_file_and_dirnames(f'{PATH_TRANSPOSED}/{entry["name"]}/{dir}')
    files.sort()
    entry["files"] = files

# Duration Analysis

In [4]:
# analyze durations for each file in each dataset
# Result: abcd_dic[key]["duration_df"] holds one row per file with a "name"
# column plus one count column per duration value returned by get_duration_count.
for key in abcd_dic:
    dataset_dir = f'{PATH_TRANSPOSED}/{abcd_dic[key]["name"]}/{dir}'
    # Collect one record per file and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    duration_records = []
    for f in tqdm(abcd_dic[key]["files"]):
        path = f'{dataset_dir}/{f}'
        note_items = convert_to_note_items(path)
        duration_dic = get_duration_count(check_duration(note_items))
        duration_records.append({"name": f, **duration_dic})
    # A duration that never occurs in a file counts as 0 for that file.
    df_durations = pd.DataFrame(duration_records).fillna(0)
    abcd_dic[key]["duration_df"] = df_durations

100%|██████████| 902/902 [00:09<00:00, 96.58it/s] 
100%|██████████| 845/845 [00:08<00:00, 97.97it/s] 
100%|██████████| 858/858 [00:09<00:00, 85.86it/s] 
100%|██████████| 803/803 [00:08<00:00, 100.22it/s]


In [5]:
# show the ten smallest and the ten largest duration values found in each dataset
for key, dataset in abcd_dic.items():
    # every column label that is not a string is a duration value
    cols = [col for col in dataset["duration_df"].columns if type(col) is not str]
    cols.sort()
    print(key, "smallest and largest durations")
    print(cols[:10])
    print(cols[-10:])
    print()

a smallest and largest durations
[1, 85, 86, 170, 171, 256, 341, 427, 512, 597]
[6741, 6827, 6912, 7168, 7424, 7509, 7680, 7851, 7936, 8192]

b smallest and largest durations
[1, 85, 86, 170, 171, 256, 341, 427, 512, 597]
[6741, 6827, 6912, 7168, 7424, 7509, 7680, 7851, 7936, 8192]

c smallest and largest durations
[1, 85, 86, 170, 171, 256, 341, 427, 512, 597]
[6741, 6827, 6912, 7168, 7424, 7509, 7680, 7851, 7936, 8192]

d smallest and largest durations
[1, 85, 86, 170, 171, 256, 341, 342, 427, 512]
[6827, 6912, 6997, 7168, 7424, 7509, 7680, 7851, 7936, 8192]



In [6]:
# analyze duration bins for each file in each dataset
# Result: abcd_dic[key]["duration_bin_df"] holds one row per file with a "name"
# column plus the per-bin counts returned by get_durations_in_bins.
# Bin edges handed to get_durations_in_bins (same tick unit as the durations).
space = [0, 85, 128, 256, 341, 512, 1024, 2048, 4096, 8192]
for key in abcd_dic:
    dataset_dir = f'{PATH_TRANSPOSED}/{abcd_dic[key]["name"]}/{dir}'
    # Collect one record per file and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    bin_records = []
    for f in tqdm(abcd_dic[key]["files"]):
        path = f'{dataset_dir}/{f}'
        note_items = convert_to_note_items(path)
        duration_dic = get_duration_count(check_duration(note_items))
        bin_records.append({"name": f, **get_durations_in_bins(duration_dic, space)})
    # A bin that is absent for a file counts as 0 for that file.
    df_durations_bins = pd.DataFrame(bin_records).fillna(0)
    abcd_dic[key]["duration_bin_df"] = df_durations_bins

100%|██████████| 902/902 [00:07<00:00, 127.20it/s]
100%|██████████| 845/845 [00:06<00:00, 132.26it/s]
100%|██████████| 858/858 [00:06<00:00, 131.24it/s]
100%|██████████| 803/803 [00:06<00:00, 131.27it/s]


In [7]:
# Summarise three duration bins per dataset: the bin keyed 0 (notes shorter than
# one triplet, <85), the bin keyed 4096 (between one and two bars) and the bin
# keyed 8192 (longer than two bars).
_duration_bin_labels = (
    (0, "shorter than 85"),
    (4096, "> 1 and < 2 bars"),
    (8192, "> 2 bars"),
)

df_dic = {}
for key in abcd_dic:
    df_durations_bins = abcd_dic[key]["duration_bin_df"]
    summary = df_dic[key] = {}
    for bin_key, label in _duration_bin_labels:
        # number of songs with at least one note in this bin
        songs_in_bin = df_durations_bins[df_durations_bins[bin_key] > 0].shape[0]
        summary[f"songs notes {label} absolute"] = songs_in_bin
        summary[f"songs notes {label} percent"] = np.round(songs_in_bin/df_durations_bins.shape[0]*100, 2)
        # total number of notes in this bin over all songs
        summary[f"sum notes {label}"] = df_durations_bins[bin_key].sum()
pd.DataFrame(df_dic)

Unnamed: 0,a,b,c,d
songs notes shorter than 85 absolute,25.0,21.0,22.0,18.0
songs notes shorter than 85 percent,2.77,2.49,2.56,2.24
sum notes shorter than 85,99.0,86.0,80.0,70.0
songs notes > 1 and < 2 bars absolute,419.0,393.0,398.0,373.0
songs notes > 1 and < 2 bars percent,46.45,46.51,46.39,46.45
sum notes > 1 and < 2 bars,2046.0,1947.0,1939.0,1837.0
songs notes > 2 bars absolute,0.0,0.0,0.0,0.0
songs notes > 2 bars percent,0.0,0.0,0.0,0.0
sum notes > 2 bars,0.0,0.0,0.0,0.0


In [8]:
# analyze 1/16th and 1/8th triplet sequences for each file in each dataset
def _triole_sequence_frame(dataset, lower_bound, upper_bound):
    """Build one row per file with the triplet-sequence counts of that file.

    Parameters:
        dataset: entry of ``abcd_dic`` providing the "name" and "files" keys.
        lower_bound, upper_bound: bounds forwarded unchanged to ``check_triole_seq``.

    Returns:
        DataFrame with a "name" column plus one column per key returned by
        ``get_triole_count``; counts missing for a file are filled with 0.
    """
    dataset_dir = f'{PATH_TRANSPOSED}/{dataset["name"]}/{dir}'
    # Collect one record per file and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for f in tqdm(dataset["files"]):
        note_items = convert_to_note_items(f'{dataset_dir}/{f}')
        sequences = get_triole_count(check_triole_seq(note_items, lower_bound=lower_bound, upper_bound=upper_bound))
        records.append({"name": f, **sequences})
    return pd.DataFrame(records).fillna(0)


for key in abcd_dic:
    # bounds 85-86: labelled 1/16th triplets in this notebook
    abcd_dic[key]["triole_seq_df"] = _triole_sequence_frame(abcd_dic[key], lower_bound=85, upper_bound=86)
    # bounds 170-171: labelled 1/8th triplets in this notebook.
    # The original filled the NaNs of the 85 frame a second time here and left
    # this frame unfilled; both frames are now filled inside the helper.
    abcd_dic[key]["df_triole_seq_170"] = _triole_sequence_frame(abcd_dic[key], lower_bound=170, upper_bound=171)

100%|██████████| 902/902 [00:06<00:00, 130.53it/s]
100%|██████████| 902/902 [00:06<00:00, 132.19it/s]
100%|██████████| 845/845 [00:06<00:00, 133.19it/s]
100%|██████████| 845/845 [00:06<00:00, 133.36it/s]
100%|██████████| 858/858 [00:07<00:00, 113.60it/s]
100%|██████████| 858/858 [00:06<00:00, 128.02it/s]
100%|██████████| 803/803 [00:06<00:00, 133.29it/s]
100%|██████████| 803/803 [00:05<00:00, 134.12it/s]


In [9]:
# print metrics for 1/16th and 1/8th triplet sequences
# For every dataset, triplet kind and sequence column (1, 2, 3) report: the
# number of songs with a positive count, that number as a percentage of all
# songs, and the total count over all songs.
_sequence_labels = ((1, "single"), (2, "double"), (3, "triple"))

df_dic = {}
for key in abcd_dic:
    triole_frames = (
        ("1/16", abcd_dic[key]["triole_seq_df"]),
        ("1/8", abcd_dic[key]["df_triole_seq_170"]),
    )

    df_dic[key] = {}
    for note_label, frame in triole_frames:
        for seq_column, seq_label in _sequence_labels:
            # A column only exists when at least one file of the dataset produced
            # that key. Check explicitly instead of using a bare `except`, which
            # would also hide unrelated errors.
            if seq_column not in frame.columns:
                print(key, f"no {seq_label} {note_label} triole sequences")
                continue
            songs_with_seq = frame[frame[seq_column] > 0].shape[0]
            df_dic[key][f"songs {seq_label} {note_label} triole absolute"] = songs_with_seq
            df_dic[key][f"songs {seq_label} {note_label} triole percent"] = np.round(songs_with_seq/frame.shape[0]*100, 2)
            # Every sum is taken from the frame of its own triplet kind; the original
            # took "sum single 1/8" from the 1/16 frame by mistake.
            df_dic[key][f"sum {seq_label} {note_label} triole sequences"] = frame[seq_column].sum()
pd.DataFrame(df_dic)

b no triple 1/16 triole sequences
d no triple 1/16 triole sequences
d no double 1/8 triole sequences
d no triple 1/8 triole sequences


Unnamed: 0,a,b,c,d
songs single 1/16 triole absolute,322.0,304.0,300.0,237.0
songs single 1/16 triole percent,35.7,35.98,34.97,29.51
sum single 1/16 triole sequences,3914.0,3644.0,3770.0,2511.0
songs double 1/16 triole absolute,9.0,7.0,8.0,6.0
songs double 1/16 triole percent,1.0,0.83,0.93,0.75
sum double 1/16 triole sequences,32.0,28.0,30.0,26.0
songs triple 1/16 triole absolute,1.0,,1.0,
songs triple 1/16 triole percent,0.11,,0.12,
sum triple 1/16 triole sequences,71.0,,71.0,
songs single 1/8 triole absolute,217.0,201.0,203.0,128.0


# Time Shift Analysis

In [10]:
# analyze time shifts for all files in each dataset
# Result: abcd_dic[key]["time_shift_df"] holds one row per file with a "name"
# column, one count column per shift value returned by get_shifts_count and the
# derived columns "abs_85" and "shift_count".
for key in abcd_dic:
    dataset_dir = f'{PATH_TRANSPOSED}/{abcd_dic[key]["name"]}/{dir}'
    # Collect one record per file and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    shift_records = []
    for f in tqdm(abcd_dic[key]["files"]):
        path = f'{dataset_dir}/{f}'
        note_items = convert_to_note_items(path)
        # 4 beats divided into POSITION_STEPS positions (1024*4/16 = 256.0 ticks each)
        shifts = compute_shifts(note_items, ticks_per_position=TICKS_PER_BEAT*4/POSITION_STEPS)
        shift_records.append({"name": f, **get_shifts_count(shifts)})
    df = pd.DataFrame(shift_records).fillna(0)
    # Make sure the shift columns used below exist even when a shift value never
    # occurs in this dataset; otherwise the sums would raise KeyError.
    for shift_value in (-85, 85, -171):
        if shift_value not in df.columns:
            df[shift_value] = 0
    # occurrences of a shift of 85 ticks in either direction
    df["abs_85"] = df[-85]+df[85]
    # occurrences of any of the three shift values considered here
    df["shift_count"] = df[-85]+df[85]+df[-171]
    abcd_dic[key]["time_shift_df"] = df

100%|██████████| 902/902 [00:08<00:00, 103.95it/s]
100%|██████████| 845/845 [00:07<00:00, 106.16it/s]
100%|██████████| 858/858 [00:08<00:00, 105.87it/s]
100%|██████████| 803/803 [00:07<00:00, 106.33it/s]


In [11]:
# Summarise the time shifts per dataset: for each shift column, the number of
# songs with a positive count and the max / min / mean count among those songs.
_shift_columns = (
    ("abs_85", "85 shift"),
    (-171, "-171 shift"),
    ("shift_count", "shift"),
)

df_dic = {}
for key in abcd_dic:
    df = abcd_dic[key]["time_shift_df"]

    stats = df_dic[key] = {}
    for column, label in _shift_columns:
        # counts of the songs in which this shift occurs at least once
        positive_counts = df.loc[df[column] > 0, column]
        stats[f"songs with {label}"] = positive_counts.shape[0]
        stats[f"max {label}"] = positive_counts.max()
        stats[f"min {label}"] = positive_counts.min()
        stats[f"mean {label}"] = positive_counts.mean()
    stats["songs with no shifts"] = df[df["shift_count"] == 0].shape[0]
pd.DataFrame(df_dic)

Unnamed: 0,a,b,c,d
songs with 85 shift,669.0,625.0,631.0,584.0
max 85 shift,410.0,410.0,410.0,410.0
min 85 shift,1.0,1.0,1.0,1.0
mean 85 shift,116.64574,115.9808,117.437401,117.236301
songs with -171 shift,107.0,99.0,103.0,95.0
max -171 shift,1.0,1.0,1.0,1.0
min -171 shift,1.0,1.0,1.0,1.0
mean -171 shift,1.0,1.0,1.0,1.0
songs with shift,669.0,625.0,631.0,584.0
max shift,410.0,410.0,410.0,410.0


In [12]:
# analyze time shifts for 48 position bins
# Result: abcd_dic[key]["time_shift_48_df"] holds one row per file with a "name"
# column plus one count column per shift value returned by get_shifts_count.

POSITION_STEPS_48 = 48
for key in abcd_dic:
    dataset_dir = f'{PATH_TRANSPOSED}/{abcd_dic[key]["name"]}/{dir}'
    # Collect one record per file and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    shift_records_48 = []
    for f in tqdm(abcd_dic[key]["files"]):
        path = f'{dataset_dir}/{f}'
        note_items = convert_to_note_items(path)
        # 4 beats divided into 48 positions (1024*4/48, roughly 85.33 ticks each)
        shifts = compute_shifts(note_items, ticks_per_position=TICKS_PER_BEAT*4/POSITION_STEPS_48)
        shift_records_48.append({"name": f, **get_shifts_count(shifts)})
    # A shift value that never occurs in a file counts as 0 for that file.
    df_48 = pd.DataFrame(shift_records_48).fillna(0)
    abcd_dic[key]["time_shift_48_df"] = df_48

100%|██████████| 902/902 [00:09<00:00, 99.55it/s] 
100%|██████████| 845/845 [00:08<00:00, 100.01it/s]
100%|██████████| 858/858 [00:08<00:00, 100.53it/s]
100%|██████████| 803/803 [00:07<00:00, 101.88it/s]


In [13]:
# report the -1 shift statistics of the 48-position analysis for datasets a and d
for dataset_key in ("a", "d"):
    df_48 = abcd_dic[dataset_key]["time_shift_48_df"]
    print(f"{dataset_key} data")
    print()
    # per-song counts of the -1 shift, restricted to songs where it occurs
    minus_one_counts = df_48[-1]
    occurring = minus_one_counts[minus_one_counts > 0]
    print("songs with -1 shift:", occurring.shape[0])
    print("max -1 shift:", occurring.max())
    print("min -1 shift:", occurring.min())
    print("mean -1 shift:", occurring.mean())
    print("songs with no shifts:", df_48[minus_one_counts == 0].shape[0])
    print()

a data

songs with -1 shift: 525
max -1 shift: 326.0
min -1 shift: 1.0
mean -1 shift: 76.20952380952382
songs with no shifts: 377

d data

songs with -1 shift: 461
max -1 shift: 326.0
min -1 shift: 1.0
mean -1 shift: 77.78091106290672
songs with no shifts: 342

