# Data preparation
## Artist recomendation task for 30Music dataset 

1. Parse data: persons.idomaar, sessions.idomaar, tracks.idomaar

In [2]:
import pandas as pd
import os, sys, csv, json
import numpy as np
import pandas as pd

### Persons

In [154]:
data_dir = 'ThirtyMusic/entities'
relations_dir = 'ThirtyMusic/relations'
artists_path = os.path.join(data_dir, 'persons.idomaar')
artists_path_csv = os.path.join(data_dir, 'persons.csv')
artists = pd.read_csv(artists_path, delimiter='\t', header=None)
artists.columns = ['Type', 'ID', 'Timestamp', 'Properties', 'LinkedEntities']

In [155]:
def parse_artist_properties(props): 
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        print(props)
        sys.exit(0)
    return pd.Series({'MBID': prop['MBID'], 'Name': prop['name']})

artists_prop = artists['Properties'].apply(lambda s: parse_artist_properties(s))
artists.sort_index(inplace=True)
artists = artists.merge(artists_prop, left_index=True, right_index=True)
artists.drop(columns = ['Timestamp', 'Properties', 'LinkedEntities'], inplace = True)
artists.drop_duplicates('ID', inplace = True)
artists = artists.set_index('ID')

artists.to_csv(artists_path_csv, sep=';', header=False)
artists.head()

Unnamed: 0_level_0,Type,MBID,Name
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
145148,person,,Everything+Is+Illuminated
297899,person,,Robin+O%27Brien
250429,person,,Nicholas+Gunn++(2012)
32765,person,,Aspasia+Stratigou
18689,person,,Allison+Veltz


### Tracks

In [68]:
tracks_path = os.path.join(data_dir, 'tracks.idomaar')
tracks_csv_path = os.path.join(data_dir, 'tracks.csv')

In [50]:
prop_columns = ['Duration', 'Playcount', 'MBID', 'Name']
entity_columns = ['ArtistsID', 'AlbumsID', 'TagsID']

def parse_track_properties(props):
    try:
        props = props.replace(';', ',')
        prop = json.loads(props)
    except:
        print(props)
        sys.exit(0)
    return pd.Series({'Duration': prop['duration'], 'Playcount': prop['playcount'], \
                      'MBID': prop['MBID'], 'Name': prop['name']})

def parse_track_entities(entities):
    try:
        entities = entities.replace(';', ',')
        entity = json.loads(entities)
    except:
        print(entities)
        sys.exit(0)
    return pd.Series({
        'ArtistsID': ','.join([str(x['id']) for x in entity['artists']]) if \
                               entity['artists'] is not None and len(entity['artists']) > 0 else None,
        'AlbumsID': ','.join([str(x['id']) for x in entity['albums']]) if \
                              entity['albums'] is not None and len(entity['albums']) > 0 else None,
        'TagsID': ','.join([str(x['id']) for x in entity['tags']]) if \
                            entity['tags'] is not None and len(entity['tags']) > 0 else None})

def process_chunk(df, i, path):
    tracks_prop = df['Properties'].apply(lambda s: parse_track_properties(s))
    tracks_ent = df['LinkedEntities'].apply(lambda s: parse_track_entities(s))
    df.drop(['LinkedEntities', 'Properties', 'Timestamp' ], axis=1, inplace=True)
    df_new = df.merge(tracks_prop, left_index=True, right_index=True)
    df_new = df_new.merge(tracks_ent, left_index=True, right_index=True)
    df_new = df_new.drop_duplicates()
    if i == 0:
        df_new.to_csv(path, sep=';')
    else:
        df_new.to_csv(path, mode='a', sep=';', header=False)

I don't have a lot of memory that's why I had to process all the big files in chunks

In [65]:
iteration = 0
chunksize = 10**4

for chunk in pd.read_csv(tracks_path, header=None, delimiter='\t', chunksize=chunksize):
    chunk.columns = ['Type', 'ID', 'Timestamp', 'Properties', 'LinkedEntities']
    chunk.set_index('ID', inplace=True)
    process_chunk(chunk, iteration, tracks_csv_path)
    iteration += 1

In [73]:
tracks_df = pd.read_csv(tracks_csv_path, index_col='ID', sep=';')

### Sessions

In [115]:
sessions_path = os.path.join(relations_dir, 'sessions.idomaar')
sessions_path_csv = os.path.join(relations_dir, 'sessions_.csv')

In [116]:
def parse_session_entities(session, tracks):
    _, entities = session.split(' ')[0], session.split(' ')[1]
    try:
        entities = entities.replace(';', ',')
        entity = json.loads(entities)
        artists = []
        if entity['objects'] is not None and len(entity['objects']) > 0:
            for x in entity['objects']:
                try:
                    track_id = tracks.loc[x['id']]
                    artist_id = track_id['ArtistsID']
                    if not np.isnan(artist_id):
                        artists.append(str(int(artist_id)))
                except KeyError:
                    pass        
    except:
        sys.exit(0)
    return pd.Series({
        'UserId': x['id'] if entity['subjects'] is not None and len(entity['subjects']) > 0 else None,
        'ArtistsID': ','.join(artists)})

def process(df, iteration_num, tracks_df, fname):
    session_ent = df['LinkedEntities'].apply(lambda s: parse_session_entities(s, tracks = tracks_df))
    df.drop(['LinkedEntities'], axis=1, inplace=True)
    df_new = df.merge(session_ent, left_index=True, right_index=True)
    print(df_new.columns)
    if iteration_num == 0:
        df_new.to_csv(fname, sep=';')
    else:
        df_new.to_csv(fname, mode='a', sep=';', header=False)
    return df_new

In [117]:
iteration = 0
chunksize = 10**2
for chunk in pd.read_csv(sessions_path, header=None, delimiter='\t', chunksize=chunksize):
    chunk.columns = ['Type', 'ID', 'Timestamp', 'LinkedEntities']
    chunk.set_index('ID', inplace=True)
    df_new = process(chunk, iteration, tracks_df, fname = sessions_path_csv,)
    iteration += 1

Index(['Type', 'Timestamp', 'UserId', 'ArtistsID'], dtype='object')
1


In [126]:
sessions = pd.read_csv(sessions_path_csv, sep=';')
sessions.head()

Unnamed: 0,ID,Type,Timestamp,UserId,ArtistsID
0,287144,event.session,1390231051,4698881,"107103,324333,344448,309348,103824,324389,3122..."
1,287145,event.session,1390241844,249947,"54522,308305,250603,240350,346954,165867,42954..."
2,287146,event.session,1390303249,2897013,"49682,238975,206918,15622,139310,70816,112477,..."
3,287147,event.session,1390481828,1876964,28579416772975306235220
4,287140,event.session,1421443687,375509,4642546425
