In [1]:
import os
import re
import json
import muspy
import pickle
import random
import numpy as np
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
lakh_matched_dataset = muspy.LakhMIDIMatchedDataset('persistent/MusicGeneration/data/lmd_matched', use_converted=True)

In [7]:
lakh_matched_dataset.info()

DatasetInfo(name='Lakh MIDI Dataset', description='The Lakh MIDI dataset is a collection of 176,581 unique MIDI files, 45,129 of which have been matched and aligned to entries in the Million Song Dataset. Its goal is to facilitate large-scale music information retrieval, both symbolic (using the MIDI files alone) and audio content-based (using information extracted from the MIDI files as annotations for the matched audio files).', homepage='https://colinraffel.com/projects/lmd/', license='Creative Commons Attribution 4.0 International License (CC-By 4.0)')

In [10]:
with open("/home/aborghesi/persistent/MusicGeneration/src/spotify_genre_collector/lakh_matched_genre_vectors.pickle", 'rb') as f:
    genre_vectors = pickle.load(f)

In [None]:
song_ids = [song.metadata.source_filename[:-4] for song in tqdm(lakh_matched_dataset)] # Quite long

In [18]:
len(song_ids), len(set(song_ids))

(115190, 44747)

In [25]:
from tqdm import trange

found = set()
song_doubles = {}
for i in trange(len(song_ids)):
    if i in found:
        continue
    found_idxs = np.where(song_ids[i] == np.asarray(song_ids))[0]
    song_doubles[i] = len(found_idxs)
    for found_idx in found_idxs:
        found.add(found_idx)

100%|██████████| 115190/115190 [12:15<00:00, 156.71it/s]


In [33]:
doubles = [int(v) for v in song_doubles.values()]
mean = np.mean(doubles)
std = np.std(doubles)
max_ = np.max(doubles)
min_ = np.min(doubles)
unique, counts = np.unique(doubles, return_counts=True)
print(f"Mean amount of doubles: {mean:.4f}, std. {std:.4f}, with a max of {max_} and a min of {min_}")
for i in range(len(unique)):
    print(f"{unique[i]}: {counts[i]}")

Mean amount of doubles: 2.5743, std. 2.8815, with a max of 97 and a min of 1
1: 20824
2: 9462
3: 5444
4: 3111
5: 1864
6: 1164
7: 728
8: 567
9: 367
10: 309
11: 229
12: 140
13: 96
14: 79
15: 85
16: 68
17: 39
18: 20
19: 21
20: 15
21: 10
22: 14
23: 12
24: 16
25: 3
26: 1
27: 5
28: 5
29: 3
30: 5
31: 2
32: 1
33: 3
34: 3
35: 4
37: 4
38: 2
41: 2
42: 1
45: 1
46: 2
49: 1
52: 3
55: 2
56: 1
57: 2
58: 1
60: 1
61: 1
62: 1
75: 1
90: 1
97: 1
