In [37]:
import h5py
import pandas as pd

In [38]:
def extract_df_from_h5(file, indent=0, datasets=None):
    """Recursively collect every dataset of an HDF5 file/group as a DataFrame.

    While walking the hierarchy, an indented outline of the visited keys and
    their types is printed.

    Args:
        file: An open ``h5py.File`` or ``h5py.Group`` to walk.
        indent: Current nesting depth; only used to indent the printed outline.
        datasets: Dictionary the results are added to. When omitted, a fresh
            dictionary is created for this call, so separate calls (e.g.
            re-running a notebook cell) never share state.

    Returns:
        The dictionary mapping each dataset's full HDF5 path (``item.name``)
        to a DataFrame of its contents. Object columns whose first value is a
        byte string are decoded as UTF-8.
    """
    # A `{}` default would be created once and shared by every call.
    if datasets is None:
        datasets = {}

    for key in file:
        item = file[key]
        print("  " * indent + f"- {key}: {type(item)}")

        if isinstance(item, h5py.Group):
            # If it's a group, call the function recursively
            extract_df_from_h5(item, indent + 1, datasets)

        elif isinstance(item, h5py.Dataset):
            # Compound datasets expose their field names; plain (non-compound)
            # datasets report None, in which case pandas numbers the columns.
            field_names = item.dtype.names
            columns = list(field_names) if field_names is not None else None
            df = pd.DataFrame(item[:], columns=columns)

            # Decode byte columns to regular strings if needed
            for col in df.columns:
                # Empty columns have no first element to inspect, so skip them.
                if df[col].dtype == 'O' and not df[col].empty:
                    if isinstance(df[col].iloc[0], bytes):  # Only decode if it's bytes
                        df[col] = df[col].apply(
                            lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
                        )

            # Add the DataFrame to the dictionary with the dataset's path as key
            datasets[item.name] = df

    return datasets

In [39]:
file_path = 'data/raw/msd_summary_file.h5'

# create dfs from h5 file
# takes approx 1-2 min for me (Christoffer)
# A fresh dict is passed explicitly so that re-running this cell never picks up
# entries accumulated by an earlier call of extract_df_from_h5.
with h5py.File(file_path, 'r') as h5_file:
    dfs = extract_df_from_h5(h5_file, datasets={})

- analysis: <class 'h5py._hl.group.Group'>
  - songs: <class 'h5py._hl.dataset.Dataset'>
- metadata: <class 'h5py._hl.group.Group'>
  - songs: <class 'h5py._hl.dataset.Dataset'>
- musicbrainz: <class 'h5py._hl.group.Group'>
  - songs: <class 'h5py._hl.dataset.Dataset'>


In [5]:
# Write every extracted DataFrame to its own CSV file (approx 30 sec).
for dataset_path, frame in dfs.items():
    # Keys are HDF5 paths; the first path component after the leading "/"
    # is used as the file name.
    name = dataset_path.split("/")[1]
    frame.to_csv(f"data/{name}.csv", index=False)

In [33]:
# Build a song_id -> track_id lookup table from unique_tracks.txt.
# The file has no header row and separates its four fields with the literal "<SEP>".
df_unique_tracks = pd.read_csv(
    "data/raw/unique_tracks.txt",
    sep='<SEP>',
    header=None,
    engine='python',
)
df_unique_tracks.columns = ['track_id', 'song_id', 'artist', 'title']

# song_id is treated as the unique identifier (it is also what the user data
# refers to), so one row per song_id is kept: the first track_id it maps to.
# unique_tracks.txt holds 999056 distinct song_ids, so little data is lost.
song_to_track_mapping = (
    df_unique_tracks
    .loc[:, ['song_id', 'track_id']]
    .drop_duplicates(subset='song_id', keep='first')
)

In [40]:
# Analysis table: identified by track_id.
analysis_df = dfs['/analysis/songs']
# Metadata table: identified by song_id; keep only the first row per song_id,
# the same rule used when building the song_id -> track_id mapping.
metadata_df = dfs['/metadata/songs'].drop_duplicates(subset='song_id', keep='first')

In [46]:
# Identifier columns that are redundant once song_id and track_id are present.
redundant_id_columns = [
    "artist_7digitalid",
    "artist_mbid",
    "artist_playmeid",
    "release_7digitalid",
    "track_7digitalid",
    "audio_md5",
]

# metadata --(song_id)--> mapping --(track_id)--> analysis, then drop the extra ids.
merged_df = (
    metadata_df
    .merge(song_to_track_mapping, on='song_id', how='inner')
    .merge(analysis_df, on='track_id', how='inner')
    .drop(columns=redundant_id_columns)
)

# Columns holding a single distinct value carry no information; keep the rest.
has_multiple_values = merged_df.nunique() > 1
merged_df = merged_df.loc[:, has_multiple_values]


In [91]:
# The three tagtraum genre files are read with identical parser settings.
genre_file_paths = (
    "data/raw/msd_tagtraum_cd1.cls",
    "data/raw/msd_tagtraum_cd2.cls",
    "data/raw/msd_tagtraum_cd2c.cls",
)
df_genres_cd1, df_genres_cd2, df_genres_cd2c = (
    pd.read_csv(
        genre_path,
        sep="\t",           # tab-separated values
        comment='#',        # ignore lines starting with '#'
        header=None,        # the files carry no header row
        names=["trackId", "majority_genre", "minority_genre"],
        engine='python',    # to handle varying column counts
    )
    for genre_path in genre_file_paths
)

In [112]:
# Start from cd1 and append the cd2 rows whose trackId cd1 does not contain.
cd2_known_in_cd1 = df_genres_cd2['trackId'].isin(df_genres_cd1['trackId'])
new_trackIds_cd2 = df_genres_cd2[~cd2_known_in_cd1]
df_genres_cd1_updated = pd.concat(
    [df_genres_cd1, new_trackIds_cd2],
    ignore_index=True,
)

# Then append the cd2c rows whose trackId is still missing from that union.
cd2c_known_in_union = df_genres_cd2c['trackId'].isin(df_genres_cd1_updated['trackId'])
new_trackIds_cd2c = df_genres_cd2c[~cd2c_known_in_union]
df_genres_merged = pd.concat(
    [df_genres_cd1_updated, new_trackIds_cd2c],
    ignore_index=True,
)

In [20]:
# Attach the genre labels of each tagtraum dataset to merged_df.
# Every dataset contributes its own majority/minority column pair, renamed so
# that the column names reflect which dataset the labels came from.
genre_sources = [
    (df_genres_cd1, {"majority_genre": "majority_genre_cd1", "minority_genre": "minority_genre_cd1"}),
    (df_genres_cd2, {"majority_genre": "majority_genre_cd2", "minority_genre": "minority_genre_cd2"}),
    (df_genres_cd2c, {"majority_genre": "majority_genre_cd2c", "minority_genre": "minority_genre_cd2c"}),
]

for genre_df, renamed_columns in genre_sources:
    merged_df = (
        merged_df
        .merge(genre_df, left_on='track_id', right_on='trackId', how='left')
        # trackId duplicates track_id after the join, so it is removed again
        .drop(columns=['trackId'])
        .rename(columns=renamed_columns)
    )


In [21]:
# approx 20 sec
# Arrange the columns alphabetically, then persist the combined table as CSV.
alphabetical_columns = sorted(merged_df.columns)
merged_df = merged_df.reindex(columns=alphabetical_columns)
merged_df.to_csv("data/all_songs.csv", index=False)

In [24]:
# Clear the kernel namespace to free memory (-f: do not ask for confirmation).
# Imports are cleared as well, so later cells have to re-import what they use.
%reset -f

In [6]:
# approx 1 min
import pandas as pd

# Read the tab-separated user data; the file has no header row, so the three
# columns are named afterwards.
df_user_data = pd.read_csv("data/raw/user_data.txt", header=None, sep='\t')
df_user_data.columns = ['user_id', 'song_id', 'play_count']

# Every distinct song_id that occurs in the user data, i.e. has been played.
unique_song_ids = df_user_data.loc[:, 'song_id'].unique()

In [26]:
# Drop the full user-data frame to free memory: only unique_song_ids, derived
# from it in the previous cell, is used from here on.
del df_user_data

In [27]:
# approx 20 sec
# load the all_songs data
# NOTE: the directory is spelled "data" (lower case), matching where
# all_songs.csv was written; "Data" only resolves on case-insensitive
# filesystems.
df_all_songs = pd.read_csv("data/all_songs.csv")
# find the songs that are in the user data (inner join on the played song_ids)
df_played_songs = df_all_songs.merge(pd.DataFrame(unique_song_ids, columns=['song_id']), on='song_id', how='inner')
# save the data next to all_songs.csv
df_played_songs.to_csv("data/played_songs.csv", index=False)

In [None]:
from make_data import DataSetMaker
from make_data import DataLoader

def get_data():
    """Configure a ``DataSetMaker`` with the project's file locations and run it.

    The maker receives the paths of the HDF5 summary file, the unique-tracks
    file, the three genre files and the user data, plus the output directory;
    its ``run_all()`` method is then invoked. Nothing is returned.
    """
    maker_settings = dict(
        h5_file_path='data/raw/msd_summary_file.h5',
        unique_tracks_path='data/raw/unique_tracks.txt',
        genres_paths=[
            'data/raw/msd_tagtraum_cd1.cls',
            'data/raw/msd_tagtraum_cd2.cls',
            'data/raw/msd_tagtraum_cd2c.cls',
        ],
        user_data_path='data/raw/user_data.txt',
        output_dir='data',
    )

    # Build the maker and run all of its processing steps.
    DataSetMaker(**maker_settings).run_all()


# Uncomment to run the DataSetMaker processing defined in get_data() first;
# the loads below read files from the data/ directory.
# get_data()

# Load the played-songs table and the raw user data through the project's DataLoader.
data_loader = DataLoader()
df_songs = data_loader.load_song_data(data_path='data/played_songs.csv')
df_user = data_loader.load_user_data(data_path='data/raw/user_data.txt')
