In [1]:
import os
import glob # For finding files
import h5py # For reading HDF5 files
import pandas as pd # For creating DataFrames

In [2]:
# --- Configuration ---
# Root directory of the MSD Subset data that the notebook scans for .h5 files.
# To use your own copy without editing the notebook, set the MSD_SUBSET_PATH
# environment variable before starting the kernel. When it is not set, the
# example path below (the original author's local checkout) is used as-is.
MSD_SUBSET_PATH = os.environ.get(
    'MSD_SUBSET_PATH',
    'C:\\Narasimha\\KLETU Related\\6th Semester Related\\GenAI and NLP\\GenAI\\Course Project\\GitHub Repo\\Multi-Hop-RAG-for-Personalized-Music-Recommendation\\data\\raw\\MillionSongSubset\\',  # Example path
)

In [6]:
# --- Functions ---

def find_h5_files(root_dir):
    """Recursively find all HDF5 files (.h5) under a directory.

    Args:
        root_dir: Directory to search. Every subdirectory is walked as well.

    Returns:
        A list of paths, one per file whose name ends with '.h5'. Each path is
        the walked directory joined with the file name. The list is sorted so
        the result is identical on every run; it is empty when nothing
        matches or when root_dir does not exist.
    """
    h5_files = [
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(root_dir)
        for file_name in file_names
        if file_name.endswith('.h5')
    ]
    # os.walk yields entries in arbitrary filesystem order. Sort so that
    # callers taking "the first N files" get the same files every time.
    h5_files.sort()
    return h5_files

def extract_msd_features(h5_file_path):
    """Read a fixed set of song attributes out of one MSD HDF5 file.

    For each attribute, the first entry ([0]) of the same-named field is taken
    from the 'songs' table of the file's 'metadata', 'analysis' or
    'musicbrainz' group. The four metadata attributes are decoded from UTF-8
    bytes; all other values are stored exactly as read.

    Args:
        h5_file_path: Path of the HDF5 file; it is opened read-only.

    Returns:
        A dict keyed by attribute name (metadata fields first, then analysis
        fields, then 'year'), or None if opening or reading the file raised
        any exception. In that case the error is printed.
    """
    # Fields stored as bytes that need decoding.
    text_fields = ('artist_name', 'title', 'release', 'song_id')
    # Fields copied over without conversion.
    analysis_fields = (
        'duration',
        'key',
        'key_confidence',
        'loudness',
        'mode',
        'mode_confidence',
        'tempo',
        'time_signature',
        'time_signature_confidence',
    )

    song_record = {}
    try:
        with h5py.File(h5_file_path, 'r') as h5_file:
            # Metadata group
            metadata_songs = h5_file['metadata']['songs']
            for field in text_fields:
                song_record[field] = metadata_songs[field][0].decode('utf-8')

            # Analysis group
            analysis_songs = h5_file['analysis']['songs']
            for field in analysis_fields:
                song_record[field] = analysis_songs[field][0]

            # MusicBrainz group (often empty in subset)
            song_record['year'] = h5_file['musicbrainz']['songs']['year'][0]
    except Exception as exc:
        # Best effort: report the problem and signal failure to the caller
        # with None instead of aborting a whole batch.
        print(f"Error processing file {h5_file_path}: {exc}")
        return None
    return song_record

In [5]:
# --- Main Execution ---
# Collect every .h5 path below the configured dataset root and report how
# many were found.

print(f"Looking for HDF5 files in: {MSD_SUBSET_PATH}")
all_h5_files = find_h5_files(root_dir=MSD_SUBSET_PATH)
print(f"Found {len(all_h5_files)} HDF5 files.")

Looking for HDF5 files in: C:\Narasimha\KLETU Related\6th Semester Related\GenAI and NLP\GenAI\Course Project\GitHub Repo\Multi-Hop-RAG-for-Personalized-Music-Recommendation\data\raw\MillionSongSubset\
Found 10000 HDF5 files.


In [7]:
# --- Process a SMALL SAMPLE first! ---
# Run the extractor on only the first few files before attempting everything.
sample_size = 100 # Adjust as needed
sample_files = all_h5_files[:sample_size]
print(f"Processing a sample of {len(sample_files)} files...")

all_features_list = []
for h5_path in sample_files:
    song_features = extract_msd_features(h5_path)
    if not song_features:
        # Falsy result (None when extraction failed): leave this file out.
        continue
    all_features_list.append(song_features)

print(f"Successfully extracted features for {len(all_features_list)} songs.")

Processing a sample of 100 files...
Successfully extracted features for 100 songs.


In [8]:
# --- Create DataFrame ---
# One row per successfully extracted song; columns are the feature dict keys.
msd_df = pd.DataFrame(all_features_list)

# --- Initial Exploration ---
if msd_df.empty:
    # With no extracted songs the frame has no columns, and describe() raises
    # ValueError on a column-less frame. Report the situation instead of
    # crashing the cell.
    print("\nNo songs were extracted; skipping DataFrame exploration. "
          "Check MSD_SUBSET_PATH and the extraction errors above.")
else:
    print("\nDataFrame Info:")
    msd_df.info()  # info() prints directly, so it is not wrapped in print()

    print("\nDataFrame Head:")
    print(msd_df.head())

    print("\nDataFrame Description:")
    print(msd_df.describe())

    print("\nMissing Values:")
    print(msd_df.isnull().sum())

# --- Optional: Save the sample DataFrame ---
# msd_df.to_csv('data/processed/msd_subset_sample.csv', index=False)
# print("\nSample DataFrame saved to data/processed/msd_subset_sample.csv")


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   artist_name                100 non-null    object 
 1   title                      100 non-null    object 
 2   release                    100 non-null    object 
 3   song_id                    100 non-null    object 
 4   duration                   100 non-null    float64
 5   key                        100 non-null    int32  
 6   key_confidence             100 non-null    float64
 7   loudness                   100 non-null    float64
 8   mode                       100 non-null    int32  
 9   mode_confidence            100 non-null    float64
 10  tempo                      100 non-null    float64
 11  time_signature             100 non-null    int32  
 12  time_signature_confidence  100 non-null    float64
 13  year                       100 non