In [1]:
import os

import pandas as pd
from tqdm import tqdm
from music21 import converter, midi, key

from evaluation_metrics import get_file_and_dirnames

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.insert(1, '../1_preprocessing')
from preprocessing_functions import transpose_to_Cmaj_Amin, transpose_to_octave_4_to_6
from analysis_functions import analyse_data_folder

PATH_MIDI = "../0_data/8_predictions/midi"

## Analyze and Transpose Key

In [2]:
# analyze a1 data in terms of key

files, dirs = get_file_and_dirnames(f"{PATH_MIDI}/a1")
files.sort()

df_dic = {}
for f in tqdm(files):
    if f[-5:] != ".midi":
        continue

    midi_score = converter.parse(f"{PATH_MIDI}/a1/{f}")
    
    music_analysis = midi_score.analyze('key')
    df_dic[f] = {
        "name": f,
        "key": music_analysis,
        "confidence": music_analysis.correlationCoefficient
    }
df = pd.DataFrame(df_dic).T

# check number of songs per key analysis confidence
print("confidence over 90% -", len(df[df["confidence"] > 0.90]))
print("confidence over 80% -", len(df[df["confidence"] > 0.80]))
print("confidence over 75% -", len(df[df["confidence"] > 0.75]))
print("confidence over 70% -", len(df[df["confidence"] > 0.70]))
print("confidence over 50% -", len(df[df["confidence"] > 0.50]))

df

100%|██████████| 100/100 [00:07<00:00, 13.75it/s]

confidence over 90% - 15
confidence over 80% - 79
confidence over 75% - 91
confidence over 70% - 97
confidence over 50% - 100





Unnamed: 0,name,key,confidence
generated_midi_0.midi,generated_midi_0.midi,E major,0.861118
generated_midi_1.midi,generated_midi_1.midi,f minor,0.786468
generated_midi_10.midi,generated_midi_10.midi,e minor,0.757023
generated_midi_11.midi,generated_midi_11.midi,b- minor,0.935869
generated_midi_12.midi,generated_midi_12.midi,E major,0.880262
...,...,...,...
generated_midi_95.midi,generated_midi_95.midi,G major,0.850507
generated_midi_96.midi,generated_midi_96.midi,B major,0.923142
generated_midi_97.midi,generated_midi_97.midi,b- minor,0.948385
generated_midi_98.midi,generated_midi_98.midi,e minor,0.884402


In [3]:
# when paths not exist, create directories
if not os.path.exists(f"{PATH_MIDI}/a1_transposed_key"):
    os.makedirs(f"{PATH_MIDI}/a1_transposed_key")

skipped = []

for f in tqdm(files):
    # skip key confidence levels below 75%
    if df[df["name"] == f]["confidence"].values[0] < 0.75:
        skipped.append(f)
    # transpose and save MIDI scores
    else:
        try:
            mf = midi.MidiFile()
            mf.open(f"{PATH_MIDI}/a1/{f}")
            mf.read()
            mf.close()
            midi_stream = midi.translate.midiFileToStream(mf)
            transposed_stream = transpose_to_Cmaj_Amin(midi_stream)
            transposed_stream.write("midi", fp=f"{PATH_MIDI}/a1_transposed_key/{f}")
        except:
            print("error in", f)
            continue

100%|██████████| 100/100 [00:57<00:00,  1.75it/s]


In [4]:
print("skipped files count:", len(skipped))

skipped files count: 9


## Analyze and Transpose Octave

In [5]:
# analyze a1 data in terms of octave range

df_key = analyse_data_folder(f"{PATH_MIDI}/a1_transposed_key", compute_metrics=False)

# print some metrics
print("mean note")
print("transposed_key", df_key["note_avg"].mean())
print()
print("min note")
print("transposed_key", df_key["note_lowest"].min())
print()
print("max note")
print("transposed_key", df_key["note_highest"].max())
print()
print("note variation")
print("transposed_key", df_key["note_variation_count"].max())
print()

# compute note range and print number of songs over 2 octaves 
df_key["note_range"] = df_key["note_highest"] - df_key["note_lowest"]
print("transposed key number range over 2 octaves:", len(df_key[df_key["note_range"] > 24]))

# filter them out in a new dataframe and check octaves
df_key_within_2_octaves = df_key[df_key["note_range"] <= 24]
octaves = []
for octave in range(2,8):
    octave_start = (octave+1)*12
    octave_end = ((octave+2)*12)-1

    count_begin_end = len(df_key_within_2_octaves[df_key_within_2_octaves["note_lowest"] < octave_end])
    count_begin_start = len(df_key_within_2_octaves[df_key_within_2_octaves["note_lowest"] < octave_start])

    count_fin_end = len(df_key_within_2_octaves[df_key_within_2_octaves["note_highest"] > octave_end])
    count_fin_start = len(df_key_within_2_octaves[df_key_within_2_octaves["note_highest"] > octave_start])

    octaves.append([octave, count_begin_end - count_begin_start, count_fin_start - count_fin_end])

pd.DataFrame(octaves, columns= ["octave", "begin", "finish"])

100%|██████████| 91/91 [00:25<00:00,  3.63it/s]

mean note
transposed_key 69.32967032967034

min note
transposed_key 40.0

max note
transposed_key 95.0

note variation
transposed_key 31

transposed key number range over 2 octaves: 27





Unnamed: 0,octave,begin,finish
0,2,3,0
1,3,28,0
2,4,30,5
3,5,2,36
4,6,0,15
5,7,0,0


In [7]:
# when paths not exist, create directories
if not os.path.exists(f"{PATH_MIDI}/a1_transposed_key_octave"):
    os.makedirs(f"{PATH_MIDI}/a1_transposed_key_octave")

transposed_key_files, _ = get_file_and_dirnames(f"{PATH_MIDI}/a1_transposed_key")
transposed_key_files.sort()

skipped = []

# transpose octave
for f in tqdm(transposed_key_files):
    try:
        note_range = df_key[df_key["name"] == f]["note_range"].values[0]
    except:
        continue
    # skip songs over more than 2 octaves
    if note_range > 24:
        skipped.append(f)
    # convert other songs to octave 4 to 6
    else:
        try:
            midi_score = converter.parse(f"{PATH_MIDI}/a1_transposed_key/{f}")
            midi_score = transpose_to_octave_4_to_6(midi_score)
            midi_score.write("midi", fp=f"{PATH_MIDI}/a1_transposed_key_octave/{f}")
        except:
            continue

100%|██████████| 91/91 [00:32<00:00,  2.77it/s]


In [8]:
print("skipped files count:", len(skipped))

skipped files count: 27


## Analyze Octave and Key after Transposing

In [9]:
# analyze transposed data in terms of key

files, dirs = get_file_and_dirnames(f"{PATH_MIDI}/a1_transposed_key_octave")
files.sort()

df_dic = {}
for f in tqdm(files):
    if f[-5:] != ".midi":
        continue

    midi_score = converter.parse(f"{PATH_MIDI}/a1_transposed_key_octave/{f}")
    
    music_analysis = midi_score.analyze('key')
    df_dic[f] = {
        "name": f,
        "key": music_analysis,
        "confidence": music_analysis.correlationCoefficient
    }
df = pd.DataFrame(df_dic).T
df

100%|██████████| 64/64 [00:15<00:00,  4.14it/s]


Unnamed: 0,name,key,confidence
generated_midi_0.midi,generated_midi_0.midi,C major,0.862229
generated_midi_1.midi,generated_midi_1.midi,a minor,0.786468
generated_midi_10.midi,generated_midi_10.midi,a minor,0.757023
generated_midi_11.midi,generated_midi_11.midi,a minor,0.935869
generated_midi_12.midi,generated_midi_12.midi,C major,0.892373
...,...,...,...
generated_midi_93.midi,generated_midi_93.midi,C major,0.846996
generated_midi_94.midi,generated_midi_94.midi,a minor,0.942381
generated_midi_95.midi,generated_midi_95.midi,C major,0.850507
generated_midi_96.midi,generated_midi_96.midi,C major,0.923142


In [10]:
# check number of songs per key analysis confidence
print("confidence over 90% -", len(df[df["confidence"] > 0.90]))
print("confidence over 80% -", len(df[df["confidence"] > 0.80]))
print("confidence over 75% -", len(df[df["confidence"] > 0.75]))
print("confidence over 70% -", len(df[df["confidence"] > 0.70]))
print("confidence over 50% -", len(df[df["confidence"] > 0.50]))

confidence over 90% - 10
confidence over 80% - 56
confidence over 75% - 64
confidence over 70% - 64
confidence over 50% - 64


In [11]:
# analyze transposed data in terms of octave range

df_key = analyse_data_folder(f"{PATH_MIDI}/a1_transposed_key_octave", compute_metrics=False)

# compute note range
df_key["note_range"] = df_key["note_highest"] - df_key["note_lowest"]

# filter them out in a new dataframe and check octaves
df_key_within_2_octaves = df_key[df_key["note_range"] <= 24]
octaves = []
for octave in range(2,8):
    octave_start = (octave+1)*12
    octave_end = ((octave+2)*12)-1

    count_begin_end = len(df_key_within_2_octaves[df_key_within_2_octaves["note_lowest"] < octave_end])
    count_begin_start = len(df_key_within_2_octaves[df_key_within_2_octaves["note_lowest"] < octave_start])

    count_fin_end = len(df_key_within_2_octaves[df_key_within_2_octaves["note_highest"] > octave_end])
    count_fin_start = len(df_key_within_2_octaves[df_key_within_2_octaves["note_highest"] > octave_start])

    octaves.append([octave, count_begin_end - count_begin_start, count_fin_start - count_fin_end])

pd.DataFrame(octaves, columns= ["octave", "begin", "finish"])

100%|██████████| 64/64 [00:17<00:00,  3.63it/s]

mean note
transposed_key 76.046875

min note
transposed_key 60.0

max note
transposed_key 93.0

note variation
transposed_key 20

transposed key number range over 2 octaves: 0





Unnamed: 0,octave,begin,finish
0,2,0,0
1,3,0,0
2,4,63,0
3,5,0,13
4,6,0,43
5,7,0,0


In [12]:
# print some metrics
print("mean note")
print("transposed_key", df_key["note_avg"].mean())
print()
print("min note")
print("transposed_key", df_key["note_lowest"].min())
print()
print("max note")
print("transposed_key", df_key["note_highest"].max())
print()
print("note variation")
print("transposed_key", df_key["note_variation_count"].max())
print()
print("transposed key number range over 2 octaves:", len(df_key[df_key["note_range"] > 24]))

mean note
transposed_key 76.046875

min note
transposed_key 60.0

max note
transposed_key 93.0

note variation
transposed_key 20

transposed key number range over 2 octaves: 0
