In [None]:
import music21
import pandas as pd
import re
import ast
import os
from tqdm import tqdm
import numpy as np
import json
import glob

## Lectura dataset

In [12]:
# 01 leer Chordomicon dataset parquet
chordonomicon_dataset_path = '/home/neme/workspace/Data/MIDI/preprocced/Chordomicon/batch/dataset_01_1.parquet'
df_chordonomicon = pd.read_parquet(chordonomicon_dataset_path)

In [13]:
df_chordonomicon.keys()

Index(['chord_symbols', 'piano_rolls', 'sequence_length', 'piano_roll_size',
       'original_id', 'artist_id', 'song_id', 'key_tonic', 'key_mode',
       'key_correlation', 'main_genre', 'genres', 'rock_genre', 'release_date',
       'decade'],
      dtype='object')

In [14]:
df_chordonomicon['piano_rolls'][0].shape

(1344,)

In [15]:
df_chordonomicon['chord_symbols'][0]

'["C", "F", "C", "E7", "Am", "C", "F", "C", "G7", "C", "F", "C", "E7", "Am", "C", "F"]'

In [16]:

for i, row in tqdm(df_chordonomicon.iterrows(), total=len(df_chordonomicon), desc="Reconstruyendo datos"):
    # Reconstruir piano roll
    piano_roll_flat = row['piano_rolls']
    piano_rolls = np.array(piano_roll_flat).reshape(
        row['sequence_length'], row['piano_roll_size']
    )
    # Reconstruir chord symbols
    chord_symbols = json.loads(row['chord_symbols'])
    
    # Remplazar en df_chordonomicon
    df_chordonomicon.at[i, 'piano_rolls'] = piano_rolls
    df_chordonomicon.at[i, 'chord_symbols'] = chord_symbols
    
    

Reconstruyendo datos: 100%|██████████| 5000/5000 [00:00<00:00, 7485.06it/s]


In [26]:
# 02 leer Popular-hook dataset parquet
popular_hook_dataset_path = '/home/neme/workspace/Data/MIDI/preprocced/Popular-hook/batch/dataset_01_53.parquet'

popular_hook_data = pd.read_parquet(popular_hook_dataset_path)

In [27]:
popular_hook_data.keys()

Index(['piano_roll', 'chord_symbols', 'sequence_length', 'piano_roll_size',
       'idx', 'artist', 'song', 'section', 'tonality', 'genres', 'path',
       'midi_emotion_predected'],
      dtype='object')

In [28]:
popular_hook_data['artist'][0] 


'a-teens'

In [29]:
popular_hook_data['song'][0] 

'upside-down'

In [32]:
popular_hook_data['section'][0] 

'verse-and-pre-chorus'

In [30]:
popular_hook_data['tonality'][0] 

'A Major'

In [31]:
for i in popular_hook_data['chord_symbols']:
    print(i)

["A", "D", "F#m", "E", "Bm7", "D", "E", "D", "C#m", "F#sus", ["F#", "B-", "C#"], "Bm", "Esus", "E7", "C#m", "F#sus"]
["F#m", "E", "Bm7", "D", "E", "D", "C#m", "F#sus", ["F#", "B-", "C#"], "Bm", "Esus", "E7", "C#m", "F#sus", ["F#", "B-", "C#"], "Bm"]
["F", "F", "F", "Fmaj7", "Fmaj7", "Fmaj7", "C/F", "C/F", "Fmaj7", "F", "F", "F", "Fmaj7", "Fmaj7", "Fmaj7", "C/F"]
["F", "Fmaj7", "Fmaj7", "Fmaj7", "C/F", "C/F", "Fmaj7", "F", "F", "F", "Fmaj7", "Fmaj7", "Fmaj7", "C/F", "C/F", "Fmaj7"]
["Fmaj7", "Fmaj7", "C/F", "C/F", "Fmaj7", "F", "F", "F", "Fmaj7", "Fmaj7", "Fmaj7", "C/F", "C/F", "Fmaj7", "Fmaj7", "Fmaj7"]
["C/F", "C/F", "Fmaj7", "F", "F", "F", "Fmaj7", "Fmaj7", "Fmaj7", "C/F", "C/F", "Fmaj7", "Fmaj7", "Fmaj7", "Fmaj7", "FM9"]
["Fmaj7", "F", "F", "F", "Fmaj7", "Fmaj7", "Fmaj7", "C/F", "C/F", "Fmaj7", "Fmaj7", "Fmaj7", "Fmaj7", "FM9", "FM9", "FM9"]
["F", "F", "Fmaj7", "Fmaj7", "Fmaj7", "C/F", "C/F", "Fmaj7", "Fmaj7", "Fmaj7", "Fmaj7", "FM9", "FM9", "FM9", "FM9", "Fmaj7"]
["Fmaj7", "Fmaj7",

## Concatenar Batches

In [2]:

def concatenate_parquet_batches(batch_dir: str, output_file: str, chunk_size: int = 10):
    """
    Concatena batches en formato Parquet de manera eficiente en memoria.
    """
    # Encontrar todos los archivos parquet
    batch_files = sorted(glob.glob(os.path.join(batch_dir, "dataset_01_*.parquet")))
    print(f"Encontrados {len(batch_files)} archivos batch")
    
    if not batch_files:
        print("No se encontraron archivos batch")
        return
    
    # Procesar en chunks para evitar problemas de memoria
    chunk_files = []
    
    for i in range(0, len(batch_files), chunk_size):
        chunk_batch_files = batch_files[i:i + chunk_size]
        
        print(i+chunk_size, len(batch_files))
        
        print(f"Procesando chunk {i//chunk_size + 1}/{(len(batch_files)-1)//chunk_size + 1}")
        
        # Leer y concatenar archivos del chunk
        chunk_dfs = []
        for file in tqdm(chunk_batch_files, desc="Cargando archivos"):
            df = pd.read_parquet(file)
            chunk_dfs.append(df)
        
        # Concatenar chunk
        chunk_df = pd.concat(chunk_dfs, ignore_index=True)
        
        # Guardar chunk intermedio
        chunk_output = f"{output_file}_chunk_{i//chunk_size}.parquet"
        chunk_df.to_parquet(chunk_output, compression='snappy')
        chunk_files.append(chunk_output)
        
        # Limpiar memoria
        del chunk_dfs, chunk_df            
            
        
    # Concatenar chunks finales
    print("Concatenando chunks finales...")
    final_dfs = []
    for chunk_file in tqdm(chunk_files, desc="Cargando chunks"):
        df = pd.read_parquet(chunk_file)
        final_dfs.append(df)
    
    # Concatenación final
    print("Realizando concatenación final...")
    final_df = pd.concat(final_dfs, ignore_index=True)
    
    # Guardar resultado final
    final_df.to_parquet(f"{output_file}.parquet", compression='snappy')
    print(f"Dataset final guardado: {output_file}.parquet")
    print(f"Forma final: {final_df.shape}")
    
    # Limpiar archivos temporales
    for chunk_file in chunk_files:
        os.remove(chunk_file)
    
    return final_df

In [None]:
chunk_path = '/home/neme/workspace/Data/MIDI/preprocced/Chordomicon/chunk'

In [None]:
# concatenar chordonomicon
concatenate_parquet_batches(
    batch_dir='/home/neme/workspace/Data/MIDI/preprocced/Chordomicon/batch',
    output_file='/home/neme/workspace/Data/MIDI/preprocced/Chordomicon/chunk/dataset_chordonomicon_full',
    chunk_size=10
    )

SyntaxError: invalid syntax (2928295458.py, line 6)

In [None]:
# concatenar popular-hook
concatenate_parquet_batches(
    batch_dir   ='/home/neme/workspace/Data/MIDI/preprocced/Popular-hook/batch',
    output_file ='/home/neme/workspace/Data/MIDI/preprocced/Popular-hook/chunk/dataset_popular-hook_full',
    chunk_size  =5
    )

Encontrados 21 archivos batch
5 21
Procesando chunk 1/5


Cargando archivos:   0%|          | 0/5 [00:00<?, ?it/s]

Cargando archivos: 100%|██████████| 5/5 [00:00<00:00,  6.72it/s]


10 21
Procesando chunk 2/5


Cargando archivos: 100%|██████████| 5/5 [00:00<00:00, 12.82it/s]


15 21
Procesando chunk 3/5


Cargando archivos: 100%|██████████| 5/5 [00:00<00:00, 17.67it/s]


20 21
Procesando chunk 4/5


Cargando archivos: 100%|██████████| 5/5 [00:00<00:00, 15.09it/s]


25 21
Procesando chunk 5/5


Cargando archivos: 100%|██████████| 1/1 [00:00<00:00, 15.39it/s]


Concatenando chunks finales...


Cargando chunks: 100%|██████████| 5/5 [00:02<00:00,  1.93it/s]


Realizando concatenación final...
Dataset final guardado: /home/neme/workspace/Data/MIDI/preprocced/Popular-hook/chunk/dataset_01_full.parquet
Forma final: (102299, 12)


Unnamed: 0,piano_rolls,chord_symbols,sequence_length,piano_roll_size,idx,artist,song,section,tonality,genres,path,midi_emotion_predected
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""Dm"", ""F"", ""Bdim"", ""Em"", ""Dm"", ""F"", ""Bdim"", ""...",16,84,1,a-certain-mushroom,imma-write-a-song,instrumental,C Major,['Worship'],/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q2
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""F"", ""Bdim"", ""Em"", ""Dm"", ""F"", ""Bdim"", ""Em"", ""...",16,84,1,a-certain-mushroom,imma-write-a-song,instrumental,C Major,['Worship'],/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q2
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""Bdim"", ""Em"", ""Dm"", ""F"", ""Bdim"", ""Em"", ""Dm"", ...",16,84,1,a-certain-mushroom,imma-write-a-song,instrumental,C Major,['Worship'],/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q2
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""Em"", ""Dm"", ""F"", ""Bdim"", ""Em"", ""Dm"", ""F"", ""Bd...",16,84,1,a-certain-mushroom,imma-write-a-song,instrumental,C Major,['Worship'],/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q2
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""Dm"", ""F"", ""Bdim"", ""Em"", ""Dm"", ""F"", ""Bdim"", ""...",16,84,1,a-certain-mushroom,imma-write-a-song,instrumental,C Major,['Worship'],/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q2
...,...,...,...,...,...,...,...,...,...,...,...,...
102294,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""CM9"", ""CM9"", ""CM9"", ""Dm9"", ""Dm9"", ""Dm9"", ""Dm...",16,84,16940,kevin-macleod,casa-bossa-nova,verse,C Major,"['Latin', 'Soundtrack']",/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q3
102295,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""Cm11"", ""E-M9"", ""Dm7"", [""A"", ""C"", ""E-"", ""G"", ...",16,84,16948,kevin-macleod,chill-wave,verse,C dorian,['Soundtrack'],/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q3
102296,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""E-M9"", ""Dm7"", [""A"", ""C"", ""E-"", ""G"", ""D"", ""F""...",16,84,16948,kevin-macleod,chill-wave,verse,C dorian,['Soundtrack'],/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q3
102297,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[""Dm7"", [""A"", ""C"", ""E-"", ""G"", ""D"", ""F""], ""Cm11...",16,84,16948,kevin-macleod,chill-wave,verse,C dorian,['Soundtrack'],/home/neme/workspace/Data/MIDI/preprocced/Popu...,Q3
