In [2]:
import music21
import pandas as pd
import re
import ast
import os
from tqdm import tqdm
import numpy as np

## Chordonomicon

In [2]:
NOTES_AMERICAN = ['C', 'D', 'E', 'F', 'G', 'A', 'B']
NOTES_AMERICAN_SHARP = ['Cs', 'Ds', 'Es', 'Fs', 'Gs', 'As', 'Bs']
NOTES_AMERICAN_FLAT = ['Cb', 'Db', 'Eb', 'Fb', 'Gb', 'Ab', 'Bb']

NOTES_LATIN = ['do', 're', 'mi', 'fa', 'sol', 'la', 'si']
NOTES_LATIN_SHARP= ['dos', 'res', 'mis', 'fas', 'sols', 'las', 'sis']
NOTES_LATIN_FLAT= ['dob', 'reb', 'mib', 'fab', 'solb', 'lab', 'sib']

NOTES = dict(zip(NOTES_LATIN, NOTES_AMERICAN))
NOTES_SHARP = dict(zip(NOTES_LATIN_SHARP, NOTES_AMERICAN_SHARP))
NOTES_FLAT = dict(zip(NOTES_LATIN_FLAT, NOTES_AMERICAN_FLAT))      

In [3]:
def extract_chord_symbol(chord, note):
    for c in range(len(NOTES_AMERICAN)):                        
        if note == NOTES_LATIN[c]:
            tonic = NOTES[note]
            break
        elif note == NOTES_LATIN_SHARP[c]:
            tonic = NOTES_SHARP[note]
            break
        elif note == NOTES_LATIN_FLAT[c]:
            tonic = NOTES_FLAT[note]
            break        
    symbol = chord[len(tonic):]     
    return tonic, symbol, chord

In [None]:
def _parse_single_chord(chord_str, chord_mapping, degree_mapping):
        """
        Parsea un único acorde de texto a un objeto Chord de music21, conservando la inversión.
        
        chord_str: str - El acorde en formato de texto (por ejemplo, "Cmaj7/G").
        chord_mapping: dict - Mapeo de símbolos de acordes a notaciones compatibles con music21.
        degree_mapping: dict - Mapeo de grados a notas.
        
        """

        # Capturar acorde y bajo si existe
        
        chord_str = chord_str.split('/')        # Separa el bajo si existe
        
        chord = chord_str[0]
        
        if len(chord_str)>1:
                bass = chord_str[1]     # Nota bajo
        else:
                bass = ''
        
        note = degree_mapping[chord][0] # Nota fundamental del acorde 

        # Capturar tonica y especie de acorde
        tonic, symbol, original = extract_chord_symbol(chord, note)

        # Reemplazar notación de sostenidos y bemoles para music21
        tonic = tonic.replace('s', '#').replace('b', '-')
        bass = bass.replace('s', '#').replace('b', '-') if bass != '' else ''
        
        print(f"Tonic: {tonic}, Symbol: {symbol}, Bass: {bass}")
        
        # Captura cifrado mapeado en ChordSymbol
        symbol_mapped = ast.literal_eval(chord_mapping[symbol])[0].replace("'", "")

        # Construir acorde final
        if bass != '':
                bass = f"/{bass}"
                
        chord_ = tonic + symbol_mapped + bass
        
        chord_m21 = music21.harmony.ChordSymbol(chord_)
        
        print(f"Chord m21: {chord_m21}")     
        
        return chord_, bass

In [12]:
progresion = "<verse_1> D A G A D A G A <chorus_1> D G Bmin A D/Fs G Bmin A D/Fs G Bmin A D/Fs G Bmin A <verse_2> D A G A D A G A <chorus_2> D G Bmin A D/Fs G Bmin A D/Fs G Bmin A D/Fs G Bmin A <chorus_3> D G Bmin A D/Fs G Bmin A D/Fs G Bmin A D/Fs G Bmin A <bridge_1> Bmin G D A Bmin G D A Emin D/Fs G A Bmin Bmin7 Bmin <chorus_4> D G Bmin A D/Fs G Bmin A D/Fs G Bmin A D/Fs G Bmin A <chorus_5> D G Bmin A D/Fs G Bmin A D/Fs G Bmin A D/Fs G Bmin A <outro_1> D G Bmin A Asus2"
mapping_path_chords = '/mnt/c/Users/nehem/OneDrive - Universidad de Chile/Universidad/6to año/Data/MIDI/preprocced/Chordomicon/mirex_mapping_v2.csv' 
mapping_path_degree = '/mnt/c/Users/nehem/OneDrive - Universidad de Chile/Universidad/6to año/Data/MIDI/preprocced/Chordomicon/chords_mapping.csv' 

# Captura mapping de acordes
df_mapping_chordsymbol = pd.read_csv(mapping_path_chords)
df_mapping_degree = pd.read_csv(mapping_path_degree)

In [33]:
# chord_mapping = dict(zip(df_mapping_chordsymbol['Original Symbol'].apply(lambda x: x.replace('"', '')), df_mapping_chordsymbol['ChordSymbol'].apply(lambda x: x.replace('"', ''))))
chord_mapping = dict(zip(df_mapping_chordsymbol['Original Symbol'].apply(lambda x: x.replace('"', '')), df_mapping_chordsymbol['ChordSymbol_m21'].apply(lambda x: x.replace('"', ''))))
degree_mapping = dict(zip(df_mapping_degree['Chords'], df_mapping_degree['Notes'].apply(ast.literal_eval)))

In [14]:
# Elimina etiquetas como <intro_1>, <verse_1>, etc.
progresion_cleanned= re.sub(r'<[^>]+>', '', progresion)
# Reemplaza múltiples espacios con uno solo
progresion_cleanned = re.sub(r'\s+', ' ', progresion_cleanned).strip()
progresion_cleanned = progresion_cleanned.split(' ')

In [55]:
for p in progresion_cleanned:
    chord_, bass = _parse_single_chord(p, chord_mapping, degree_mapping)   
     
    if bass != '':
        chord = chord_ + '/' + bass
    else:
        chord = chord_


Tonic: D, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol D>
Tonic: A, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol A>
Tonic: G, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol G>
Tonic: A, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol A>
Tonic: D, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol D>
Tonic: A, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol A>
Tonic: G, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol G>
Tonic: A, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol A>
Tonic: D, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol D>
Tonic: G, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol G>
Tonic: B, Symbol: min, Bass: 
Chord m21: <music21.harmony.ChordSymbol Bm>
Tonic: A, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol A>
Tonic: D, Symbol: , Bass: F#
Chord m21: <music21.harmony.ChordSymbol D/F#>
Tonic: G, Symbol: , Bass: 
Chord m21: <music21.harmony.ChordSymbol G>
Tonic: B, S

## Popular Hook

In [92]:
DATASET_ROOT_PATH =  '/home/neme/workspace/Data/MIDI/preprocced/Popular-hook/'
# Nombre del archivo de metadatos principal
INFO_TABLES_FILENAME = 'info_tables.xlsx'
    
# Ruta completa al archivo de metadatos
INFO_TABLES_FILE_PATH = os.path.join(DATASET_ROOT_PATH, INFO_TABLES_FILENAME)

In [98]:
sequence_length = 16
piano_range = (24, 108)
piano_size = piano_range[1] - piano_range[0]

In [93]:
metadata_df = pd.read_excel(INFO_TABLES_FILE_PATH, engine='openpyxl')
metadata_df

Unnamed: 0,idx,path,index,singer,song,section,song_url,genres,youtube_url,start,...,syncStart,syncEnd,lyricPath(api_1_netease),lyricPath(api_2_qqmusic),lyricPath(api_3),nation,has_video,language,extract_lyrics,tonality
0,0,./midi/a/a-boogie-wit-da-hoodie/drowning/intro...,a,a-boogie-wit-da-hoodie,drowning,intro,https://www.hooktheory.com/theorytab/view/a-bo...,['Hip-Hop/Rap'],https://www.youtube.com/watch?v=rvaJ7QlhH0g,,...,0.000000,0.141793,,./lyrics/qqmusic/drowning-a-boogie-wit-da-hood...,,US,yes,english,yes,C harmonicminor
1,1,./midi/a/a-certain-mushroom/imma-write-a-song/...,a,a-certain-mushroom,imma-write-a-song,instrumental,https://www.hooktheory.com/theorytab/view/a-ce...,['Worship'],,,...,0.050000,0.150000,,,,,,,,C Major
2,2,./midi/a/a-day-to-remember/downfall-of-us-all/...,a,a-day-to-remember,downfall-of-us-all,intro-and-verse,https://www.hooktheory.com/theorytab/view/a-da...,"['Metal', 'Rock']",https://www.youtube.com/watch?v=CN4IIgFz93k,9.18,...,,,,,,,yes,english,yes,C Major
3,3,./midi/a/a-day-to-remember/downfall-of-us-all/...,a,a-day-to-remember,downfall-of-us-all,pre-chorus-and-chorus,https://www.hooktheory.com/theorytab/view/a-da...,"['Metal', 'Rock']",https://www.youtube.com/watch?v=CN4IIgFz93k,38.69,...,,,,,,,yes,english,yes,C Major
4,4,./midi/a/a-do/dark/chorus.mid,a,a-do,dark,chorus,https://www.hooktheory.com/theorytab/view/a-do...,['Pop'],https://www.youtube.com/watch?v=_flXaAvyGso,,...,0.403450,0.586555,,,,,yes,nynorsk,yes,E Major
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38667,38667,./midi/z/zz-top/gimme-all-your-lovin/chorus.mid,z,zz-top,gimme-all-your-lovin,chorus,https://www.hooktheory.com/theorytab/view/zz-t...,['Rock'],https://www.youtube.com/watch?v=Ae829mFAGGE,,...,0.271978,0.328957,,./lyrics/qqmusic/gimme-all-your-lovin-zz-top.txt,,US,yes,english,yes,C mixolydian
38668,38668,./midi/z/zz-top/gimme-all-your-lovin/intro.mid,z,zz-top,gimme-all-your-lovin,intro,https://www.hooktheory.com/theorytab/view/zz-t...,['Rock'],https://www.youtube.com/watch?v=Ae829mFAGGE&ab...,,...,0.156571,0.184784,,./lyrics/qqmusic/gimme-all-your-lovin-zz-top.txt,,US,yes,english,yes,C mixolydian
38669,38669,./midi/z/zz-top/gimme-all-your-lovin/verse.mid,z,zz-top,gimme-all-your-lovin,verse,https://www.hooktheory.com/theorytab/view/zz-t...,['Rock'],https://www.youtube.com/watch?v=Ae829mFAGGE&ab...,,...,0.214233,0.271463,,./lyrics/qqmusic/gimme-all-your-lovin-zz-top.txt,,US,yes,english,yes,C mixolydian
38670,38670,./midi/z/zz-top/la-grange/verse.mid,z,zz-top,la-grange,verse,https://www.hooktheory.com/theorytab/view/zz-t...,"['Blues', 'Rock']",https://www.youtube.com/watch?v=vqz0wRaie2g&ab...,,...,0.154338,0.179794,,./lyrics/qqmusic/la-grange-zz-top.txt,,US,,,,A dorian


In [102]:
tonalitys = metadata_df['tonality'].unique().tolist()

# quitar nota del tono
mode = []
tone = []
for t in tonalitys:
    mode.append(t.split(' ')[1])
    tone.append(t.split(' ')[0])
    
# eliminar duplicados
mode = list(set(mode))
tone = list(set(tone))

In [104]:
mode

['locrian',
 'mixolydian',
 'phrygian',
 'minor',
 'harmonicminor',
 'lydian',
 'dorian',
 'phrygiandominant',
 'Major']

In [103]:
tone

['D#',
 'F',
 'F#',
 'Db',
 'A#',
 'G',
 'E',
 'D',
 'Eb',
 'A',
 'Bb',
 'C',
 'B',
 'G#',
 'C#',
 'Gb',
 'E#',
 'Ab']

In [100]:
target_df = metadata_df
genre_filter= 'Video Game'

if genre_filter:
    print(f"Filtrando el dataset por el género: '{genre_filter}'...")
    # Filtrar si la columna 'genres' contiene el string del filtro
    target_df = metadata_df[metadata_df['genres'].str.contains(genre_filter, na=False)].copy()
    print(f"Se encontraron {len(target_df)} entradas para el género '{genre_filter}'.")

Filtrando el dataset por el género: 'Video Game'...
Se encontraron 4326 entradas para el género 'Video Game'.


In [101]:
def _extract_chords_from_midi(midi_path: str) -> list:
        """
        Extrae una secuencia de acordes de un archivo MIDI, buscando y utilizando
        únicamente la pista llamada 'Chord'.
        """
        try:
            # 1. Cargar el archivo MIDI completo
            score = music21.converter.parse(midi_path)
            
            # 2. Buscar la pista (Part) que contenga 'Chord' en su nombre
            chord_part = None
            
            for part in score.parts:
                # El nombre de la pista suele estar en el atributo .id o .partName
                # Usamos .title() y 'in' para ser flexibles (ej. 'chord', 'Chord', 'CHORD')
                if 'Chord' in str(part.partName).title():
                    chord_part = part
                    break # Encontramos la pista, salimos del bucle
            
            # 3. Si no se encuentra una pista de acordes, no podemos continuar
            if chord_part is None:
                # Opcional: podrías intentar hacer chordify() a toda la partitura como fallback
                # score.chordify() si quieres, pero es más seguro descartar el archivo.
                return None

            # 4. ¡La clave! Aplicar Chordify solo sobre la pista de acordes
            chordified_part = chord_part.chordify()
            
            # 5. Extraer los acordes de la parte ya procesada
            chords = [element for element in chordified_part.recurse().getElementsByClass('Chord')]
                
            return chords if chords else None
        
        except Exception as e:
            print(f"No se pudo procesar el archivo MIDI {os.path.basename(midi_path)}: {e}")
            return None

In [102]:
def _chords_to_piano_roll(chord_sequence: list) -> np.ndarray:
        """Convierte una secuencia de N acordes de music21 a una matriz de piano roll."""
        piano_roll = np.zeros((sequence_length, piano_size), dtype=np.int8)
        for i, chord in enumerate(chord_sequence):
            for pitch in chord.pitches:
                midi_note = pitch.midi
                if piano_range[0] <= midi_note < piano_range[1]:
                    note_index = midi_note - piano_range[0]
                    piano_roll[i, note_index] = 1
        return piano_roll

In [103]:
def _extract_emotion_from_csv(emotion_csv_path: str) -> dict:
    """Lee el archivo CSV de emoción y extrae los valores relevantes."""
    try:
        emotion_df = pd.read_csv(emotion_csv_path)
        # El dataset puede tener columnas como 'Q1', 'Q2', 'Q3', 'Q4' de Russell.
        return {'midi_emotion_predected': emotion_df['midi_emotion_predected'].iloc[0]}
    except Exception:
        return {}

In [None]:
processed_data = []

for index, row in tqdm(target_df.iterrows(), total=target_df.shape[0]):
        
    path_from_info = row.get('path', '')
    path_from_info = path_from_info[2:]
    # 1. Lógica de corrección de rutas
    section_folder_path = path_from_info.replace('.mid', '')
    full_section_path = os.path.join(DATASET_ROOT_PATH, section_folder_path)

    section_name = os.path.basename(full_section_path)
        
    # 2. Construir rutas a los archivos MIDI y de emoción
    midi_file_path = os.path.join(full_section_path, f"{section_name}.mid")
    emotion_csv_path = os.path.join(full_section_path, f"{section_name}_midi_emotion_result.csv")
    
    
    # 3. Extraer datos de ambas fuentes
    m21_chords = _extract_chords_from_midi(midi_file_path)
    emotion_data = _extract_emotion_from_csv(emotion_csv_path)

    if not m21_chords or len(m21_chords) < sequence_length:
                continue

    # 4. Aplicar ventana deslizante y empaquetar datos
    for i in range(len(m21_chords) - sequence_length + 1):
        sequence = m21_chords[i:i + sequence_length]
        
        piano_roll_sequence = _chords_to_piano_roll(sequence)
        chord_symbol_sequence = [c.pitchedCommonName for c in sequence]
        
        metadata = {
            'idx': row.get('idx'),
            'artist': row.get('singer', 'Unknown'),
            'song': row.get('song', 'Unknown'),
            'section': row.get('section', 'Unknown'),
            'tonality': row.get('tonality', 'Unknown'),
            'genres': row.get('genres', 'Unknown'),
            **emotion_data
        }

        processed_data.append({
            'piano_roll': piano_roll_sequence,
            'chord_symbols': chord_symbol_sequence,
            **metadata
        })

## Lectura dataset

In [None]:
# 01 leer Chordomicon dataset pickle
# chordonomicon_dataset_path = '/mnt/c/Users/nehem/OneDrive - Universidad de Chile/Universidad/6to año/Data/MIDI/preprocced/Chordomicon/dataset_01.pkl'
chordonomicon_dataset_path = '/mnt/c/Users/nehem/OneDrive - Universidad de Chile/Universidad/6to año/Data/MIDI/preprocced/Chordomicon/batch/dataset_01_194.pkl'
chordonomicon_data = pd.read_pickle(chordonomicon_dataset_path)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Users/nehem/OneDrive - Universidad de Chile/Universidad/6to año/Data/MIDI/preprocced/Chordomicon/batch/dataset_01_1.pkl'

In [6]:
chordonomicon_data.keys()

dict_keys(['piano_rolls', 'chord_symbols', 'metadata'])

In [7]:
chordonomicon_data['metadata'].columns

Index(['original_id', 'artist_id', 'song_id', 'key_tonic', 'key_mode',
       'key_correlation', 'main_genre', 'genres', 'rock_genre', 'release_date',
       'decade'],
      dtype='object')

In [17]:
chordonomicon_data['metadata']

Unnamed: 0,original_id,artist_id,song_id,key_tonic,key_mode,key_correlation,main_genre,genres,rock_genre,release_date,decade
0,109226,artist_32299,1edxpCvMuMokv45hXSoidc,C,major,0.860908,pop rock,"[alternative metal, modern rock, neon pop punk...",pop rock,2001-01-14,2000.0
1,109226,artist_32299,1edxpCvMuMokv45hXSoidc,C,major,0.860908,pop rock,"[alternative metal, modern rock, neon pop punk...",pop rock,2001-01-14,2000.0
2,109226,artist_32299,1edxpCvMuMokv45hXSoidc,C,major,0.860908,pop rock,"[alternative metal, modern rock, neon pop punk...",pop rock,2001-01-14,2000.0
3,109227,,,G,major,0.950721,,[],,,
4,109227,,,G,major,0.950721,,[],,,
...,...,...,...,...,...,...,...,...,...,...,...
3337,109608,,,C,major,0.916200,,[],,,
3338,109608,,,C,major,0.916200,,[],,,
3339,109608,,,C,major,0.916200,,[],,,
3340,109608,,,C,major,0.916200,,[],,,


In [3]:
# 02 leer Popular-hook dataset pickle
popular_hook_dataset_path = '/home/neme/workspace/Data/MIDI/preprocced/Popular-hook/batch/dataset_01_1.pkl'

popular_hook_data = pd.read_pickle(popular_hook_dataset_path)

In [65]:
popular_hook_data.keys()

dict_keys(['piano_rolls', 'chord_symbols', 'metadata'])

In [4]:
popular_hook_data['metadata'].columns

Index(['idx', 'artist', 'song', 'section', 'tonality', 'genres', 'path',
       'midi_emotion_predected'],
      dtype='object')

In [112]:
idx = popular_hook_data['metadata']['idx'].unique().tolist()
len(idx)

888

In [90]:
popular_hook_data['piano_rolls'].shape

(20, 16, 84)

In [91]:
for i in range(20):
    print(popular_hook_data['chord_symbols'][i])


['Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'F', 'F']
['Am', 'Am', 'Am', 'Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'F', 'F', 'F', 'F', 'F']
['Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'Em', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F']
['Cm', 'Cm', ['D', 'F', 'G#', 'C'], 'E-+M9/G', 'Cm', 'E-+M7', 'E-', 'Aø7', ['B-', 'C#', 'F', 'G#'], ['E-', 'G', 'B-', 'C#'], ['G#', 'C', 'E-', 'G'], ['G#', 'C', 'E-', 'G'], ['C#', 'F', 'G#', 'C'], ['C#', 'F', 'G#', 'C'], ['D', 'F', 'G#', 'C'], 'E-+M9/G']
['Cm', ['D', 'F', 'G#', 'C'], 'E-+M9/G', 'Cm', 'E-+M7', 'E-', 'Aø7', ['B-', 'C#', 'F', 'G#'], ['E-', 'G', 'B-', 'C#'], ['G#', 'C', 'E-', 'G'], ['G#', 'C', 'E-', 'G'], ['C#', 'F', 'G#', 'C'], ['C#', 'F', 'G#', 'C'], ['D', 'F', 'G#', 'C'], 'E-+M9/G', ['G#', 'C', 'E-', 'F#', 'B-', 'F']]
[['D', 'F', 'G#', 'C'], 'E-+M9/G', 'Cm', 'E-+M7', 'E-', 'Aø7', ['B-', 'C#', 'F', 'G#'], ['E-', 'G', 'B-', 'C#'], ['G#', 'C', 'E-', 'G'], ['G#', 'C', 'E-', 'G'], ['C#', 'F', 'G#', 'C'],

## Concatenar Batches

In [2]:
# Juntar batches chordonomicon
chordonomicon_batch_folder = '/mnt/c/Users/nehem/OneDrive - Universidad de Chile/Universidad/6to año/Data/MIDI/preprocced/Chordomicon/batch'
all_batches = [f for f in os.listdir(chordonomicon_batch_folder) ]#if f.startswith('dataset_') and f.endswith('.pkl')]

# Ordenar los archivos por número de batch 'dataset_1_{i}.pkl'
all_batches.sort(key=lambda x: int(re.search(r'dataset_\d+_(\d+)\.pkl', x).group(1)))

In [None]:
# Juntar todos los batches en uno solo
combined_dataset = pd.concat([pd.read_pickle(os.path.join(chordonomicon_batch_folder, f)) for f in all_batches], ignore_index=True)