In [1]:
import pandas as pd
from music21 import converter
import os
from helper_functions import get_file_and_dirnames
from analysis_functions import analyse_data_folder
from preprocessing_functions import transpose_to_octave_4_to_6
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

PATH_TRANSPOSED = "../0_data/4_preprocessed_sets"

## Transposing Octave

In [2]:
# get file list and print number of remaining songs
dir = "17_POP909-Dataset-master"
transposed_4_4_files,_ = get_file_and_dirnames(f"{PATH_TRANSPOSED}/a)_4_4_metric_120_bpm/{dir}")
transposed_4_4_files.sort()
transposed_key_files,_ = get_file_and_dirnames(f"{PATH_TRANSPOSED}/b)_transposed_key/{dir}")
transposed_key_files.sort()

print("songs in 4/4 metric and 120 bpm:", len(transposed_4_4_files))
print("songs in transposed key:", len(transposed_key_files))

songs in 4/4 metric and 120 bpm: 902
songs in transposed key: 845


In [3]:
# get note analysis
df_4_4 = analyse_data_folder(f"{PATH_TRANSPOSED}/a)_4_4_metric_120_bpm/{dir}", compute_metrics=False)
df_key = analyse_data_folder(f"{PATH_TRANSPOSED}/b)_transposed_key/{dir}", compute_metrics=False)

100%|██████████| 902/902 [07:41<00:00,  1.96it/s]
100%|██████████| 845/845 [05:58<00:00,  2.35it/s]

mean note
4/4 71.45787139689578
transposed_key 70.67810650887574

min note
4/4 42.0
transposed_key 39.0

max note
4/4 98.0
transposed_key 101.0

note variation
4/4 26
transposed_key 26






In [4]:
# print some metrics
print("mean note")
print("4/4", df_4_4["note_avg"].mean())
print("transposed_key", df_key["note_avg"].mean())
print()

print("min note")
print("4/4", df_4_4["note_lowest"].min())
print("transposed_key", df_key["note_lowest"].min())
print()

print("max note")
print("4/4", df_4_4["note_highest"].max())
print("transposed_key", df_key["note_highest"].max())
print()

print("note variation")
print("4/4", df_4_4["note_variation_count"].max())
print("transposed_key", df_key["note_variation_count"].max())
print()

mean note
4/4 71.45787139689578
transposed_key 70.67810650887574

min note
4/4 42.0
transposed_key 39.0

max note
4/4 98.0
transposed_key 101.0

note variation
4/4 26
transposed_key 26



In [5]:
# compute note range
df_4_4["note_range"] = df_4_4["note_highest"] - df_4_4["note_lowest"]
df_key["note_range"] = df_key["note_highest"] - df_key["note_lowest"]

# print number of songs over 2 octaves 
print("4_4 number range over 2 octaves:", len(df_4_4[df_4_4["note_range"] > 24]))
print("transposed key number range over 2 octaves:", len(df_key[df_key["note_range"] > 24]))

# filter them out in a new dataframe
df_4_4_within_2_octaves = df_4_4[df_4_4["note_range"] <= 24]
df_key_within_2_octaves = df_key[df_key["note_range"] <= 24]

4_4 number range over 2 octaves: 44
transposed key number range over 2 octaves: 42


In [6]:
# check number of songs which begin and end in each octave 4_4
octaves = []
for octave in range(2,8):
    octave_start = (octave+1)*12
    octave_end = ((octave+2)*12)-1

    count_begin_end = len(df_4_4_within_2_octaves[df_4_4_within_2_octaves["note_lowest"] < octave_end])
    count_begin_start = len(df_4_4_within_2_octaves[df_4_4_within_2_octaves["note_lowest"] < octave_start])

    count_fin_end = len(df_4_4_within_2_octaves[df_4_4_within_2_octaves["note_highest"] > octave_end])
    count_fin_start = len(df_4_4_within_2_octaves[df_4_4_within_2_octaves["note_highest"] > octave_start])

    octaves.append([octave, count_begin_end - count_begin_start, count_fin_start - count_fin_end])
print("4_4 songs")
pd.DataFrame(octaves, columns= ["octave", "begin", "finish"])

4_4 songs


Unnamed: 0,octave,begin,finish
0,2,1,0
1,3,236,0
2,4,541,60
3,5,20,523
4,6,0,167
5,7,0,0


In [7]:
# check number of songs which begin and end in each octave 4_4
octaves = []
for octave in range(2,8):
    octave_start = (octave+1)*12
    octave_end = ((octave+2)*12)-1

    count_begin_end = len(df_key_within_2_octaves[df_key_within_2_octaves["note_lowest"] < octave_end])
    count_begin_start = len(df_key_within_2_octaves[df_key_within_2_octaves["note_lowest"] < octave_start])

    count_fin_end = len(df_key_within_2_octaves[df_key_within_2_octaves["note_highest"] > octave_end])
    count_fin_start = len(df_key_within_2_octaves[df_key_within_2_octaves["note_highest"] > octave_start])

    octaves.append([octave, count_begin_end - count_begin_start, count_fin_start - count_fin_end])
print("transposed key songs")
pd.DataFrame(octaves, columns= ["octave", "begin", "finish"])

transposed key songs


Unnamed: 0,octave,begin,finish
0,2,23,0
1,3,288,2
2,4,414,109
3,5,62,351
4,6,0,145
5,7,0,1


In [8]:
# when paths not exist, create directories
if not os.path.exists(f"{PATH_TRANSPOSED}/c)_transposed_octave/{dir}"):
    os.makedirs(f"{PATH_TRANSPOSED}/c)_transposed_octave/{dir}")

skipped = []

# transpose octave
for f in tqdm(transposed_4_4_files):
    # skip not MIDI files and already transposed ones
    if f[-4:] != ".mid":
        continue
    try:
        note_range = df_4_4[df_4_4["name"] == f]["note_range"].values[0]
    except:
        continue
    # skip songs over more than 2 octaves
    if note_range > 24:
        skipped.append(f)
    # convert other songs to octave 4 to 6
    else:
        try:
            midi_score = converter.parse(f"{PATH_TRANSPOSED}/a)_4_4_metric_120_bpm/{dir}/{f}")
            midi_score = transpose_to_octave_4_to_6(midi_score)
            midi_score.write("midi", fp=f"{PATH_TRANSPOSED}/c)_transposed_octave/{dir}/{f}")
        except:
            continue

print("skipped:", len(skipped))

100%|██████████| 902/902 [07:10<00:00,  2.10it/s]

skipped: 44





In [9]:
print("skipped:", len(skipped))

skipped: 44


In [10]:
# when paths not exist, create directories
if not os.path.exists(f"{PATH_TRANSPOSED}/d)_transposed_key_and_octave/{dir}"):
    os.makedirs(f"{PATH_TRANSPOSED}/d)_transposed_key_and_octave/{dir}")

skipped = []

# transpose octave
for f in tqdm(transposed_4_4_files):
    # skip not MIDI files and already transposed ones
    if f[-4:] != ".mid":
        continue
    try:
        note_range = df_key[df_key["name"] == f]["note_range"].values[0]
    except:
        continue
    # skip songs over more than 2 octaves
    if note_range > 24:
        skipped.append(f)
    # convert other songs to octave 4 to 6
    else:
        try:
            midi_score = converter.parse(f"{PATH_TRANSPOSED}/b)_transposed_key/{dir}/{f}")
            midi_score = transpose_to_octave_4_to_6(midi_score)
            midi_score.write("midi", fp=f"{PATH_TRANSPOSED}/d)_transposed_key_and_octave/{dir}/{f}")
        except:
            continue

len(skipped)

100%|██████████| 902/902 [06:48<00:00,  2.21it/s]


42

In [11]:
# analyse transposed MIDI fiels
df_octave_only = analyse_data_folder(f"{PATH_TRANSPOSED}/c)_transposed_octave/{dir}", compute_metrics=False)
df_octave_key = analyse_data_folder(f"{PATH_TRANSPOSED}/d)_transposed_key_and_octave/{dir}", compute_metrics=False)

100%|██████████| 858/858 [04:37<00:00,  3.09it/s]
100%|██████████| 803/803 [04:15<00:00,  3.14it/s]


In [12]:
# check number of songs which begin and end in each octave
octaves = []
for octave in range(2,8):
    octave_start = (octave+1)*12
    octave_end = ((octave+2)*12)-1

    count_begin_end = len(df_octave_only[df_octave_only["note_lowest"] < octave_end])
    count_begin_start = len(df_octave_only[df_octave_only["note_lowest"] < octave_start])

    count_fin_end = len(df_octave_only[df_octave_only["note_highest"] > octave_end])
    count_fin_start = len(df_octave_only[df_octave_only["note_highest"] > octave_start])

    octaves.append([octave, count_begin_end - count_begin_start, count_fin_start - count_fin_end])
print("transposed octave only songs")
pd.DataFrame(octaves, columns= ["octave", "begin", "finish"])

transposed octave only songs


Unnamed: 0,octave,begin,finish
0,2,0,0
1,3,0,0
2,4,798,2
3,5,0,404
4,6,0,344
5,7,0,0


In [13]:
# check number of songs which begin and end in each octave
octaves = []
for octave in range(2,8):
    octave_start = (octave+1)*12
    octave_end = ((octave+2)*12)-1

    count_begin_end = len(df_octave_key[df_octave_key["note_lowest"] < octave_end])
    count_begin_start = len(df_octave_key[df_octave_key["note_lowest"] < octave_start])

    count_fin_end = len(df_octave_key[df_octave_key["note_highest"] > octave_end])
    count_fin_start = len(df_octave_key[df_octave_key["note_highest"] > octave_start])

    octaves.append([octave, count_begin_end - count_begin_start, count_fin_start - count_fin_end])
print("transposed octave only songs")
pd.DataFrame(octaves, columns= ["octave", "begin", "finish"])

transposed octave only songs


Unnamed: 0,octave,begin,finish
0,2,0,0
1,3,0,0
2,4,787,2
3,5,0,361
4,6,0,245
5,7,0,0


In [14]:
# print some metrics
print("mean note")
print("octave only", df_octave_only["note_avg"].mean())
print("transposed key and octave", df_octave_key["note_avg"].mean())
print()

print("min note")
print("octave only", df_octave_only["note_lowest"].min())
print("transposed key and octave", df_octave_key["note_lowest"].min())
print()

print("max note")
print("octave only", df_octave_only["note_highest"].max())
print("transposed key and octave", df_octave_key["note_highest"].max())
print()

print("note variation")
print("octave only", df_octave_only["note_variation_count"].max())
print("transposed key and octave", df_octave_key["note_variation_count"].max())
print()

mean note
octave only 75.1002331002331
transposed key and octave 74.89290161892902

min note
octave only 60.0
transposed key and octave 60.0

max note
octave only 95.0
transposed key and octave 95.0

note variation
octave only 22
transposed key and octave 22

