In [20]:
import pandas as pd

# ===============================
#    Funções Auxiliares (Helpers)
# ===============================

    """
    Loader flexível para os masters do MERGE Dataset.
    
    Parâmetros:
        path (str): caminho do master_metadata (ex: 'metadata/master_metadata_audio_balanced.csv')
        split (str): 'train', 'val', 'test', ou None (todos)
        split_type (str): 'split_70_15_15' ou 'split_40_30_30'
        quadrant (int or list): 1,2,3,4 ou lista deles, ou None (todos)
        year (int or list): ano ou lista de anos (ex: 2020 ou [2019,2020]), ou None
        artist (str or list): nome do artista ou lista (case-insensitive, match exato ou parcial)
        title (str or list): nome do título ou lista (case-insensitive, match exato ou parcial)
        genres (str or list): gênero(s) a buscar (parcial, case-insensitive)
        return_cols (list): retorna apenas estas colunas (default: todas)
        as_copy (bool): retorna uma cópia do DataFrame
    Retorna:
        pd.DataFrame: dados filtrados
    """

In [11]:
def _filter_by_column(df, value, columns):
    """Filtra o DataFrame por qualquer uma das colunas possíveis, suporta listas e case-insensitive."""
    for col in columns:
        if col in df.columns:
            if isinstance(value, (list, tuple, set)):
                
                # Para listas: comparação exata (exceto strings)
                if df[col].dtype == object:
                    return df[df[col].str.lower().isin([str(v).lower() for v in value])]
                else:
                    return df[df[col].isin(value)]
            else:
                # Para strings: busca parcial e case-insensitive
                if isinstance(value, str):
                    return df[df[col].str.lower().str.contains(value.lower(), na=False)]
                else:
                    return df[df[col] == value]
    return df

# ===============================
#    Loader Principal
# ===============================

In [18]:
def load_merge_master(
    path,
    split=None,
    split_type='split_70_15_15',
    quadrant=None,
    year=None,
    artist=None,
    title=None,
    genres=None,
    return_cols=None,
    as_copy=True
):
    """
    Loader flexível para os masters do MERGE Dataset.

    Parâmetros detalhados em schema/data_dictionary.md.
    """

    df = pd.read_csv(path)
    df.columns = [c.lower() for c in df.columns]

    # Filtros principais
    if split is not None and split_type in df.columns:
        df = df[df[split_type] == split]

    if quadrant is not None and 'quadrant' in df.columns:
        if isinstance(quadrant, (list, tuple, set)):
            df = df[df['quadrant'].isin(quadrant)]
        else:
            df = df[df['quadrant'] == quadrant]

    if year is not None:
        df = _filter_by_column(df, year, ['year', 'Year', 'ActualYear'])

    if artist is not None:
        df = _filter_by_column(df, artist, ['artist', 'Artist'])

    if title is not None:
        df = _filter_by_column(df, title, ['title', 'Title'])

    if genres is not None:
        df = _filter_by_column(df, genres, ['genres', 'Genres'])

    # Seleção de colunas finais (opcional)
    if return_cols:
        df = df[return_cols]

    return df.copy() if as_copy else df

In [12]:
    # Filtro por split
    if split is not None and split_type in df.columns:
        df = df[df[split_type] == split]

    # Filtro por quadrante
    if quadrant is not None and 'quadrant' in df.columns:
        if isinstance(quadrant, (list, tuple, set)):
            df = df[df['quadrant'].isin(quadrant)]
        else:
            df = df[df['quadrant'] == quadrant]

    # Filtro por ano
    for ycol in ['year', 'Year', 'ActualYear']:
        if year is not None and ycol in df.columns:
            if isinstance(year, (list, tuple, set)):
                df = df[df[ycol].isin(year)]
            else:
                df = df[df[ycol] == year]
            break

    # Filtro por artista
    for acol in ['artist', 'Artist']:
        if artist is not None and acol in df.columns:
            if isinstance(artist, (list, tuple, set)):
                df = df[df[acol].str.lower().isin([str(a).lower() for a in artist])]
            else:
                df = df[df[acol].str.lower().str.contains(str(artist).lower(), na=False)]
            break

    # Filtro por título
    for tcol in ['title', 'Title']:
        if title is not None and tcol in df.columns:
            if isinstance(title, (list, tuple, set)):
                mask = df[tcol].str.lower().isin([str(t).lower() for t in title])
                df = df[mask]
            else:
                df = df[df[tcol].str.lower().str.contains(str(title).lower(), na=False)]
            break

    # Filtro por gêneros
    for gcol in ['genres', 'Genres']:
        if genres is not None and gcol in df.columns:
            if isinstance(genres, (list, tuple, set)):
                mask = df[gcol].str.lower().apply(lambda x: any(g.lower() in str(x) for g in genres))
            else:
                mask = df[gcol].str.lower().str.contains(str(genres).lower(), na=False)
            df = df[mask]
            break

NameError: name 'split' is not defined

# ===============================
#    Exemplo de Uso 
# ===============================

In [15]:
if __name__ == '__main__':
    # Exemplo: carregando apenas músicas de treino do quadrante 1, ano 2010, gênero 'pop'
    df = load_merge_master(
        '../metadata/master_metadata_audio_balanced.csv',
        split='train',
        split_type='split_70_15_15',
        quadrant=1,
        year=2010,
        genres='pop'
    )
print(df.head())



Empty DataFrame
Columns: [song_id, arousal, valence, split_40_30_30, split_70_15_15, Quadrant, AllMusic Id, AllMusic Extraction Date, Artist, Title, Relevance, Year, LowestYear, Duration, Moods, MoodsAll, MoodsAllWeights, Genres, GenreWeights, Themes, ThemeWeights, Styles, StyleWeights, AppearancesTrackIDs, AppearancesAlbumIDs, Sample, SampleURL, ActualYear, num_Genres, num_MoodsAll]
Index: []

[0 rows x 30 columns]


In [19]:
# Exemplo: buscar músicas por artista 'adele'
df_adele = load_merge_master(
    '../metadata/master_metadata_audio_balanced.csv',
    artist='adele'
)
print(df_adele[['song_id', 'artist', 'title']].head())

           song_id         artist               title
1129  MT0004230500  Adele Astaire  Fascinating Rhythm


In [16]:
# Exemplo de uso direto pelo terminal/python
if __name__ == '__main__':
    df = load_merge_master(
        '../metadata/master_metadata_audio_balanced.csv',
        split='train',
        split_type='split_70_15_15',
        quadrant=[1, 2],
        year=2010,
        artist='Adele',
        genres='pop'
    )
    print(df.head())

Empty DataFrame
Columns: [song_id, arousal, valence, split_40_30_30, split_70_15_15, Quadrant, AllMusic Id, AllMusic Extraction Date, Artist, Title, Relevance, Year, LowestYear, Duration, Moods, MoodsAll, MoodsAllWeights, Genres, GenreWeights, Themes, ThemeWeights, Styles, StyleWeights, AppearancesTrackIDs, AppearancesAlbumIDs, Sample, SampleURL, ActualYear, num_Genres, num_MoodsAll]
Index: []

[0 rows x 30 columns]


In [17]:
from scripts.loader import load_merge_master

# Carregar apenas os dados de treino do split 70/15/15 e quadrante 2
df = load_merge_master('metadata/master_metadata_audio_balanced.csv', split='train', split_type='split_70_15_15', quadrant=2)
print(df.head())

ModuleNotFoundError: No module named 'scripts.loader'