In [1]:
import pandas as pd
import os
from helper_functions import get_file_and_dirnames
from analysis_functions import analyse_data_folder
import warnings
warnings.filterwarnings('ignore')

PATH_MIDI = "../0_data/1_midi_sets"
PATH_ANALYSIS = "../0_data/2_analysis"

# when paths not exist, create directories
if not os.path.exists(PATH_ANALYSIS):
    os.makedirs(PATH_ANALYSIS)

## Analyse Datasets

In [2]:
# get list of directories in raw sets path
_,dirs = get_file_and_dirnames(PATH_MIDI)
dirs.sort()
dirs

['10_classic_midi_cont_melodies',
 '11_classic_piano_vienna',
 '12_emotion_classification',
 '13_NES_music',
 '14_national_anthems',
 '15_baroque',
 '16_classic_piano',
 '17_POP909-Dataset-master',
 '1_lakh_full',
 '2_lakh_subset',
 '3_lofi_hiphop',
 '4_Jazz_incl_csv',
 '6_anime_music',
 '7_Pop',
 '8_maestro-v3.0.0',
 '9_classic_midi_data']

In [3]:
paths = [
    '10_classic_midi_cont_melodies',
    '11_classic_piano_vienna',
    '12_emotion_classification',
    '14_national_anthems',
    '15_baroque',
    '16_classic_piano',
    '17_POP909-Dataset-master',
    '1_lakh_full',
    '2_lakh_subset',
    '3_lofi_hiphop',
    '4_Jazz_incl_csv',
    '6_anime_music',
    '7_Pop',
    '8_maestro-v3.0.0',
    '9_classic_midi_data'
]

In [4]:
# analyse midi sets, save results in csv and print out error count
for d in paths:
    df = analyse_data_folder(f"{PATH_MIDI}/{d}")
    if d[1] == "_":
        d = "0" + d
    df.to_csv(f"{PATH_ANALYSIS}/{d}.csv")
    print(d)
    print("errors:", len(df[df["error"]==True]))

100%|██████████| 1276/1276 [3:39:38<00:00, 10.33s/it]  


08_maestro-v3.0.0
errors: 0


100%|██████████| 92/92 [01:04<00:00,  1.43it/s]

09_classic_midi_data
errors: 0





## Create Overview of Analysis CSVs

In [5]:
files,_ = get_file_and_dirnames(PATH_ANALYSIS)
files.sort()

df = pd.DataFrame()
for f in files:
    if f == "00_overview.csv":
        continue
    print(f)
    try:
        f_df = pd.read_csv(f"{PATH_ANALYSIS}/{f}")
        # filter out errors and change type
        f_df = f_df[f_df["error"] != True]
        f_df["bpm"] = f_df["bpm"].apply(lambda x: float(x))
        # extract analysis rows
        mean_row = f_df[f_df["name"]=="mean"]
        min_row = f_df[f_df["name"]=="min"]
        max_row = f_df[f_df["name"]=="max"]
        # extract entries where time metric is 4/4
        f_df["numerator"] = f_df["numerator"].apply(lambda x: str(x))
        f_df["denominator"] = f_df["denominator"].apply(lambda x: str(x))
        numerator_df = f_df[f_df["numerator"]=="4.0"]
        denominator_df = numerator_df[numerator_df["denominator"]=="4.0"]
        # compute statistics
        r = {
            "folder": f,
            "percentage_multitrack": float(mean_row["type"].iloc[0]).__round__(4),
            "percentage_monophonic": float(mean_row["monophonic"].iloc[0]).__round__(4),
            "percentage_overlap": float(mean_row["overlap"].iloc[0]).__round__(4),
            "avg_length": float(mean_row["length_sec"].iloc[0]).__round__(2),
            "song_4/4_beat": len(denominator_df.index),
            "different_keys": len(f_df[f_df["error"]==False]["key"].unique()),
            "bpm_avg": float(mean_row["bpm"].iloc[0]).__round__(2),
            "bpm_range": (float(max_row["bpm"].iloc[0]) - float(f_df[f_df["bpm"] > 0]["bpm"].min())).__round__(2),
            "shortest_note": float(min_row["duration_lowest"].iloc[0]).__round__(5)
        }
    # catch any error
    except:
        print("error")
        r = {col: "error" for col in df.columns}
        r["folder"] = f

    df = df.append(r, ignore_index=True)
df.to_csv(f"{PATH_ANALYSIS}/00_overview.csv")
df

02_lakh_subset.csv
03_lofi_hiphop.csv
04_Jazz_incl_csv.csv
06_anime_music.csv
07_Pop.csv
08_maestro-v3.0.0.csv
09_classic_midi_data.csv
10_classic_midi_cont_melodies.csv
11_classic_piano_vienna.csv
12_emotion_classification.csv
14_national_anthems.csv
15_baroque.csv
16_classic_piano.csv
17_POP909-Dataset-master.csv


Unnamed: 0,folder,percentage_multitrack,percentage_monophonic,percentage_overlap,avg_length,song_4/4_beat,different_keys,bpm_avg,bpm_range,shortest_note
0,02_lakh_subset.csv,0.6463,0.7317,0.3902,173.89,61,11,124.59,161.0,0.00098
1,03_lofi_hiphop.csv,0.3118,0.2688,0.9892,9.27,96,1,0.97,89.03,0.00104
2,04_Jazz_incl_csv.csv,0.8368,0.7974,0.2804,243.02,796,20,115.13,261.0,0.0001
3,06_anime_music.csv,0.925,0.1667,0.8667,134.09,192,28,116.86,226.0,0.00098
4,07_Pop.csv,1.0,1.0,0.0,65.42,53,1,105.86,108.0,0.01042
5,08_maestro-v3.0.0.csv,1.0,0.0008,1.0,561.41,1279,1,120.0,0.0,0.00208
6,09_classic_midi_data.csv,0.8913,0.2935,0.6848,134.73,63,17,112.95,180.0,0.00098
7,10_classic_midi_cont_melodies.csv,1.0,0.0,1.0,421.04,10,9,120.72,177.0,0.00208
8,11_classic_piano_vienna.csv,0.6071,0.0,1.0,98.21,3,2,120.0,0.0,0.00208
9,12_emotion_classification.csv,0.8402,0.7732,0.2887,223.21,173,13,122.59,347.0,0.00208
