In [1]:
from mido import MidiFile, tempo2bpm
import pandas as pd
import os
from helper_functions import get_file_and_dirnames
from analysis_functions import analyse_data_folder
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Folder that holds one sub-folder of MIDI files per dataset (read-only input).
PATH_MIDI = "../data/1_midi_sets"
# Folder that receives one analysis CSV per dataset plus the overview CSV.
PATH_ANALYSIS = "../data/2_analysis"

# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(PATH_ANALYSIS, exist_ok=True)

## Analyse Datasets

In [2]:
# List the dataset folders found under PATH_MIDI in plain string order
# (so "10_..." sorts before "1_...").
_, dirs = get_file_and_dirnames(PATH_MIDI)
dirs = sorted(dirs)
dirs

['10_classic_midi_cont_melodies',
 '11_classic_piano_vienna',
 '12_emotion_classification',
 '13_NES_music',
 '14_national_anthems',
 '15_baroque',
 '16_classic_piano',
 '17_POP909-Dataset-master',
 '1_lakh_full',
 '2_lakh_subset',
 '3_lofi_hiphop',
 '4_Jazz_incl_csv',
 '6_anime_music',
 '7_Pop',
 '8_maestro-v3.0.0',
 '9_classic_midi_data']

In [3]:
# Dataset folders (relative to PATH_MIDI) to analyse in the next cell.
# Uncomment an entry to include that dataset in the run; only the active
# entries are processed.
# NOTE(review): '13_NES_music' appears in the directory listing above but has
# no entry here, and '5_classic_only_csv' is listed here but not in that
# listing - confirm whether both are intentional.
paths = [
    #'10_classic_midi_cont_melodies',
    #'11_classic_piano_vienna',
    #'12_emotion_classification',
    #'14_national_anthems',
    #'15_baroque',
    #'16_classic_piano',
    '17_POP909-Dataset-master',
    #'1_lakh_full',
    #'2_lakh_subset',
    #'3_lofi_hiphop',
    #'4_Jazz_incl_csv',
    #'5_classic_only_csv',
    #'6_anime_music',
    #'7_Pop',
    #'8_maestro-v3.0.0',
    #'9_classic_midi_data'
]

In [4]:
# Analyse every selected dataset folder and store the result as
# "<folder>.csv" in PATH_ANALYSIS, then report how many rows were flagged
# as errors.
for folder in paths:
    analysis_df = analyse_data_folder(f"{PATH_MIDI}/{folder}")

    # Folders with a one-character prefix ("3_...") get a leading zero so the
    # CSV names sort together with the two-character ones ("10_...").
    csv_stem = "0" + folder if folder[1] == "_" else folder

    analysis_df.to_csv(f"{PATH_ANALYSIS}/{csv_stem}.csv")
    print(csv_stem)

    error_mask = analysis_df["error"] == True
    print("errors:", len(analysis_df[error_mask]))

100%|██████████| 94/94 [00:03<00:00, 25.91it/s]


03_lofi_hiphop
errors: 0


100%|██████████| 240/240 [03:40<00:00,  1.09it/s]

06_anime_music
errors: 0





## Create Overview

In [13]:
# Names of the aggregate rows that each analysis CSV carries next to its
# per-song rows (these are the rows this cell looks up by name).
SUMMARY_ROW_NAMES = ("mean", "min", "max")

# Fixed column layout of the overview, so error rows always have the same
# shape as successful rows, even when the very first file fails.
OVERVIEW_COLUMNS = [
    "folder",
    "percentage_multitrack",
    "percentage_monophonic",
    "percentage_overlap",
    "avg_length",
    "song_4/4_beat",
    "different_keys",
    "bpm_avg",
    "bpm_range",
    "shortest_note",
]


def _build_overview_row(csv_name, analysis_df):
    """Condense one per-dataset analysis frame into a single overview record.

    Parameters
    ----------
    csv_name : str
        File name of the analysis CSV; stored in the "folder" column.
    analysis_df : pandas.DataFrame
        Content of the analysis CSV. Must contain rows whose "name" is
        "mean", "min" and "max" plus the columns read below.

    Returns
    -------
    dict
        One value per entry of OVERVIEW_COLUMNS.

    Raises
    ------
    KeyError, IndexError, ValueError, TypeError
        When an expected column or summary row is missing or a value cannot
        be converted to a number. The caller turns these into an error row.
    """

    def summary_value(row_name, column):
        # First matching summary row; IndexError if the row is absent.
        return float(analysis_df.loc[analysis_df["name"] == row_name, column].iloc[0])

    # Per-song statistics must not count the aggregate rows: the "mean" row
    # would otherwise be counted as a song and could undercut the smallest
    # real bpm whenever some songs report a bpm of 0.
    songs = analysis_df[~analysis_df["name"].isin(SUMMARY_ROW_NAMES)]

    # Compare numerically instead of via str(x) == "4.0", which only matches
    # when pandas happened to read the column as float.
    numerator = pd.to_numeric(songs["numerator"], errors="coerce")
    denominator = pd.to_numeric(songs["denominator"], errors="coerce")
    four_four_count = int(((numerator == 4) & (denominator == 4)).sum())

    # Smallest bpm above zero; zero entries are ignored, as the original
    # filter intended.
    bpm = pd.to_numeric(songs["bpm"], errors="coerce")
    lowest_positive_bpm = bpm[bpm > 0].min()

    return {
        "folder": csv_name,
        "percentage_multitrack": round(summary_value("mean", "type"), 4),
        "percentage_monophonic": round(summary_value("mean", "monophonic"), 4),
        "percentage_overlap": round(summary_value("mean", "overlap"), 4),
        "avg_length": round(summary_value("mean", "length_sec"), 2),
        "song_4/4_beat": four_four_count,
        # nunique() ignores missing keys, so NaN is not counted as a key.
        "different_keys": int(songs.loc[songs["error"] == False, "key"].nunique()),
        "bpm_avg": summary_value("mean", "bpm"),
        "bpm_range": summary_value("max", "bpm") - float(lowest_positive_bpm),
        "shortest_note": round(summary_value("min", "duration_lowest"), 5),
    }


files, _ = get_file_and_dirnames(PATH_ANALYSIS)
files.sort()

# Collect plain records and build the frame once: DataFrame.append was removed
# in pandas 2.0 and growing a frame row by row is quadratic anyway.
records = []
for csv_name in files:
    if csv_name == "00_overview.csv":
        # Do not feed a previously written overview back into itself.
        continue
    print(csv_name)
    try:
        record = _build_overview_row(
            csv_name, pd.read_csv(f"{PATH_ANALYSIS}/{csv_name}")
        )
    except Exception as exc:
        # Best effort: keep going with the remaining files, but show why this
        # one failed instead of hiding the reason.
        print("error:", repr(exc))
        record = {column: "error" for column in OVERVIEW_COLUMNS}
        record["folder"] = csv_name
    records.append(record)

df = pd.DataFrame(records, columns=OVERVIEW_COLUMNS)
df.to_csv(f"{PATH_ANALYSIS}/00_overview.csv")
df

10_classic_midi_cont_melodies.csv
11_classic_piano_vienna.csv
17_POP909-Dataset-master.csv
3_lofi_hiphop.csv


Unnamed: 0,folder,percentage_multitrack,avg_length,song_4/4_beat,different_keys,bpm_avg,shortest_note
0,10_classic_midi_cont_melodies.csv,1.0,421.04,10,9,120.724138,0.00208
1,11_classic_piano_vienna.csv,0.6071,98.21,3,2,120.0,0.00208
2,17_POP909-Dataset-master.csv,1.0,249.6,264,2,81.418042,0.00104
3,3_lofi_hiphop.csv,0.3118,9.27,96,1,0.967742,0.00104
