In [1]:
import collections
import datetime
import math

import os
import time

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

import random

import gc

# Render floats with two decimals in every DataFrame/Series repr.
pd.options.display.float_format = "{:.2f}".format
# strftime pattern (day/month/year hour:minute) for the progress timestamps
# printed by the evaluation loop further down.
# NOTE(review): this name shadows the built-in `format`; it is kept because
# the evaluation loop reads it by this name.
format = '%d/%m/%Y %H:%M'

# Load data

Load all data required.

In [2]:
def load_file(data_path, msg):
    """Unpickle ``data_path`` and return its content, printing what happened.

    Parameters
    ----------
    data_path : str
        Location of the pickle file.
    msg : str
        Label printed in front of the path; pass ``''`` to print the path
        alone.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when ``data_path`` does not exist
        (a "File not found" line is printed in that case).
    """
    # Only load pickles you produced yourself: unpickling runs arbitrary code.
    label = data_path if msg == '' else msg + ' - ' + data_path

    if not os.path.exists(data_path):
        print('File not found: ' + label)
        return None

    print('Loaded: ' + label)
    with open(data_path, 'rb') as handle:
        return pickle.load(handle)

In [4]:
# Folder with the pickled artefacts. Built with os.path.join so the same cell
# resolves on Windows, Linux and macOS (the hard-coded '\\' separators only
# worked on Windows).
processed_dir = os.path.join(os.getcwd(), 'data', 'processed')

# NOTE(review): the three fallback branches below call dict_all_tracks /
# dict_all_artists / dict_all_albums with `playlists_dict`. None of these
# names is defined in the visible part of this notebook, so on a fresh kernel
# a missing pickle ends in a NameError - confirm where they come from.

# All tracks
data_path = os.path.join(processed_dir, '4_tracks_dict.pkl')
txt = 'All tracks as a dictionary - ' + data_path

if os.path.exists(data_path):
    print('Loaded: ' + txt)
    with open(data_path, 'rb') as f:
        tracks_dict = pickle.load(f)
else:
    tracks_dict = dict_all_tracks(data_path, playlists_dict)

print(len(tracks_dict))
print()

# All artists
data_path = os.path.join(processed_dir, '5_artists_dict.pkl')
txt = 'All artists as a dictionary - ' + data_path

if os.path.exists(data_path):
    print('Loaded: ' + txt)
    with open(data_path, 'rb') as f:
        artists_dict = pickle.load(f)
else:
    artists_dict = dict_all_artists(data_path, playlists_dict)

print(len(artists_dict))
print()

# All albums
data_path = os.path.join(processed_dir, '6_albums_dict.pkl')
txt = 'All albums as a dictionary - ' + data_path

if os.path.exists(data_path):
    print('Loaded: ' + txt)
    with open(data_path, 'rb') as f:
        albums_dict = pickle.load(f)
else:
    albums_dict = dict_all_albums(data_path, playlists_dict)
print(len(albums_dict))

Loaded: All tracks as a dictionary - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\4_tracks_dict.pkl
85458

Loaded: All artists as a dictionary - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\5_artists_dict.pkl
28669

Loaded: All albums as a dictionary - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\6_albums_dict.pkl
62095


In [5]:
# Load the train/test split of every entity (playlists, tracks, artists,
# albums) from the processed-data folder.
# Paths are assembled with os.path.join so the cell is not tied to Windows
# path separators.
# NOTE: load_file returns None for a missing file, in which case the len()
# calls below raise TypeError.
processed_dir = os.path.join(os.getcwd(), 'data', 'processed')

playlists_train = load_file(
    os.path.join(processed_dir, '3_playlists_dict_train.pkl'),
    'Playlists dictionary (training)')
playlists_test = load_file(
    os.path.join(processed_dir, '3_playlists_dict_test.pkl'),
    'Playlists dictionary (test)')
print('Playlist - training set: ' + str(len(playlists_train)))
print('Playlist - test set: ' + str(len(playlists_test)))
print()

tracks_dict_train = load_file(
    os.path.join(processed_dir, '4_tracks_dict_train.pkl'),
    'Tracks dictionary (training)')
tracks_dict_test = load_file(
    os.path.join(processed_dir, '4_tracks_dict_test.pkl'),
    'Tracks dictionary (test)')
print('Tracks - training set: ' + str(len(tracks_dict_train)))
print('Tracks - test set: ' + str(len(tracks_dict_test)))
print()

artists_dict_train = load_file(
    os.path.join(processed_dir, '5_artists_dict_train.pkl'),
    'Artist dictionary (training)')
artists_dict_test = load_file(
    os.path.join(processed_dir, '5_artists_dict_test.pkl'),
    'Artist dictionary (test)')
print('Artist - training set: ' + str(len(artists_dict_train)))
print('Artist - test set: ' + str(len(artists_dict_test)))
print()

albums_dict_train = load_file(
    os.path.join(processed_dir, '6_albums_dict_train.pkl'),
    'Albums dictionary (training)')
albums_dict_test = load_file(
    os.path.join(processed_dir, '6_albums_dict_test.pkl'),
    'Albums dictionary (test)')
print('Albums - training set: ' + str(len(albums_dict_train)))
print('Albums - test set: ' + str(len(albums_dict_test)))
print()

Loaded: Playlists dictionary (training) - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\3_playlists_dict_train.pkl
Loaded: Playlists dictionary (test) - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\3_playlists_dict_test.pkl
Playlist - training set: 1243
Playlist - test set: 533

Loaded: Tracks dictionary (training) - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\4_tracks_dict_train.pkl
Loaded: Tracks dictionary (test) - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\4_tracks_dict_test.pkl
Tracks - training set: 63699
Tracks - test set: 31183

Loaded: Artist dictionary (training) - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\5_artists_dict_train.pkl
Loaded: Artist dictionary (test) - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\5_artists_dict_test.pkl
Artist - training set: 22639
Artist - test set: 12780

Loaded: Albums dictionary

## Create data frames to perform analysis

In [6]:
gc.collect()

# Playlists are only transposed here, so that each column is one playlist.
df_playlists_train = playlists_train.T
print("df_playlists_train:", df_playlists_train.shape)

df_playlists_test = playlists_test.T
print("df_playlists_test:", df_playlists_test.shape)

# The remaining entities are dictionaries keyed by id: build one row per key,
# then transpose so that each column is one entity (same layout as above).
df_tracks_train = pd.DataFrame.from_dict(
    tracks_dict_train, orient='index').T
print("df_tracks_train:", df_tracks_train.shape)

df_tracks_test = pd.DataFrame.from_dict(
    tracks_dict_test, orient='index').T
print("df_tracks_test:", df_tracks_test.shape)

df_artists_train = pd.DataFrame.from_dict(
    artists_dict_train, orient='index').T
print("df_artists_train:", df_artists_train.shape)

df_artists_test = pd.DataFrame.from_dict(
    artists_dict_test, orient='index').T
print("df_artists_test:", df_artists_test.shape)

df_albums_train = pd.DataFrame.from_dict(
    albums_dict_train, orient='index').T
print("df_albums_train:", df_albums_train.shape)

df_albums_test = pd.DataFrame.from_dict(
    albums_dict_test, orient='index').T
print("df_albums_test:", df_albums_test.shape)

df_playlists_train: (4, 1243)
df_playlists_test: (4, 533)
df_tracks_train: (7, 63699)
df_tracks_test: (7, 31183)
df_artists_train: (3, 22639)
df_artists_test: (3, 12780)
df_albums_train: (4, 46688)
df_albums_test: (4, 24258)


# Co-occurrence matrix

## Tracks co-occurrence matrix

In [7]:
def co_occurrence_matrix_tracks(playlists_dict, tracks_dict):
    """Build, for every track, a ranked table of the tracks it shares playlists with.

    Parameters
    ----------
    playlists_dict : mapping-like
        ``playlists_dict[playlist_id]['tracks']`` must be an iterable of track
        records exposing an ``'id'`` key.
    tracks_dict : mapping-like
        Iterating it yields track ids; ``tracks_dict[track_id]['in_playlist']``
        must be an iterable of playlist ids.

    Returns
    -------
    dict
        ``{track_id: DataFrame}``. Each frame is indexed by track id, sorted by
        ``cnt`` (descending) and carries the columns ``cnt`` (occurrences
        across the track's playlists), ``count_scaled`` (min-max scaled
        ``cnt``) and ``rank`` (average rank, highest count first).
    """
    matrix = {}
    for track_id in tracks_dict:
        # Count every track occurrence in all playlists holding `track_id`
        # (the track itself is counted too).
        tally = collections.Counter(
            entry['id']
            for playlist_id in tracks_dict[track_id]['in_playlist']
            for entry in playlists_dict[playlist_id]['tracks'])

        ranking = pd.DataFrame.from_dict(
            tally, orient='index', columns=['cnt'])
        ranking = ranking.sort_values('cnt', ascending=False)

        scaler = MinMaxScaler()
        ranking['count_scaled'] = scaler.fit_transform(ranking[['cnt']])
        ranking['rank'] = ranking['count_scaled'].rank(
            method='average', ascending=False)

        matrix[track_id] = ranking
    return matrix

## Artists co-occurrence matrix

In [8]:
def co_occurrence_matrix_artists(playlists_dict, artists_dict):
    """Build, for every artist, a ranked table of the artists sharing its playlists.

    Parameters
    ----------
    playlists_dict : mapping-like
        ``playlists_dict[playlist_id]['tracks']`` must be an iterable of track
        records whose ``'artists'`` entry is an iterable of records exposing
        an ``'id'`` key.
    artists_dict : mapping-like
        Iterating it yields artist ids;
        ``artists_dict[artist_id]['in_playlist']`` must be an iterable of
        playlist ids.

    Returns
    -------
    dict
        ``{artist_id: DataFrame}``. Each frame is indexed by artist id, sorted
        by ``cnt`` (descending) and carries the columns ``cnt``,
        ``count_scaled`` (min-max scaled ``cnt``) and ``rank`` (average rank,
        highest count first).
    """
    matrix = {}
    for artist_id in artists_dict:
        # One count per (track, credited artist) pair in every playlist that
        # features `artist_id`; the artist itself is counted too.
        tally = collections.Counter(
            performer['id']
            for playlist_id in artists_dict[artist_id]['in_playlist']
            for entry in playlists_dict[playlist_id]['tracks']
            for performer in entry['artists'])

        ranking = pd.DataFrame.from_dict(
            tally, orient='index', columns=['cnt'])
        ranking = ranking.sort_values('cnt', ascending=False)

        scaler = MinMaxScaler()
        ranking['count_scaled'] = scaler.fit_transform(ranking[['cnt']])
        ranking['rank'] = ranking['count_scaled'].rank(
            method='average', ascending=False)

        matrix[artist_id] = ranking
    return matrix

## Albums co-occurrence matrix

In [9]:
def co_occurrence_matrix_albums(playlists_dict, albums_dict):
    """Build, for every album, a ranked table of the albums sharing its playlists.

    Parameters
    ----------
    playlists_dict : mapping-like
        ``playlists_dict[playlist_id]['tracks']`` must be an iterable of track
        records whose ``'album'`` entry exposes an ``'id'`` key.
    albums_dict : mapping-like
        Iterating it yields album ids;
        ``albums_dict[album_id]['in_playlist']`` must be an iterable of
        playlist ids.

    Returns
    -------
    dict
        ``{album_id: DataFrame}``. Each frame is indexed by album id, sorted by
        ``cnt`` (descending) and carries the columns ``cnt``,
        ``count_scaled`` (min-max scaled ``cnt``) and ``rank`` (average rank,
        highest count first).
    """
    matrix = {}
    for album_id in albums_dict:
        # One count per track, attributed to that track's album, over every
        # playlist that features `album_id` (the album itself included).
        tally = collections.Counter(
            entry['album']['id']
            for playlist_id in albums_dict[album_id]['in_playlist']
            for entry in playlists_dict[playlist_id]['tracks'])

        ranking = pd.DataFrame.from_dict(
            tally, orient='index', columns=['cnt'])
        ranking = ranking.sort_values('cnt', ascending=False)

        scaler = MinMaxScaler()
        ranking['count_scaled'] = scaler.fit_transform(ranking[['cnt']])
        ranking['rank'] = ranking['count_scaled'].rank(
            method='average', ascending=False)

        matrix[album_id] = ranking
    return matrix

## Create all co-occurrence matrices

In [10]:
def co_ocurrence(playlist, df, data_path, txt):
    """Load a cached co-occurrence matrix, or compute and cache it.

    Parameters
    ----------
    playlist : mapping-like
        Playlists, forwarded unchanged to the matching
        ``co_occurrence_matrix_*`` function.
    df : mapping-like
        Entities (tracks, artists or albums) to build the matrix for,
        forwarded unchanged as well.
    data_path : str
        Pickle file used as cache. If it exists it is loaded and returned
        without any computation.
    txt : str
        Entity kind: ``'tracks'``, ``'artists'`` or ``'albums'``. Also used in
        the printed log line.

    Returns
    -------
    dict
        The co-occurrence matrix (loaded or freshly computed).

    Raises
    ------
    ValueError
        If no cache exists and ``txt`` is not one of the supported kinds.
        The check happens before the cache file is opened, so a bad value no
        longer leaves an empty, unreadable pickle behind.
    """
    if os.path.exists(data_path):
        print('Co-occurrence ' + txt + ' - Loaded: ' + data_path)
        # Only load caches you produced yourself: unpickling executes code.
        with open(data_path, 'rb') as f:
            return pickle.load(f)

    if txt not in ('tracks', 'artists', 'albums'):
        raise ValueError(
            "txt must be 'tracks', 'artists' or 'albums', got %r" % (txt,))

    if txt == 'tracks':
        co_occurrence = co_occurrence_matrix_tracks(playlist, df)
    elif txt == 'artists':
        co_occurrence = co_occurrence_matrix_artists(playlist, df)
    else:
        co_occurrence = co_occurrence_matrix_albums(playlist, df)

    # Write only after the computation succeeded, and report the save only
    # once the file is actually on disk.
    with open(data_path, 'wb') as f:
        pickle.dump(co_occurrence, f)
    print('Co-occurrence ' + txt + ' - Saved: ' + data_path)

    return co_occurrence

In [11]:
gc.collect()

# Cache location for the co-occurrence matrices, assembled with os.path.join
# so it is not tied to Windows path separators.
processed_dir = os.path.join(os.getcwd(), 'data', 'processed')

# `d` collects {'task', 'run_time'} records; later cells append to it and
# save it as the run-time table.
d = []
t_start = time.time()
co_occurrence_tracks_train = co_ocurrence(
    df_playlists_train, df_tracks_train,
    os.path.join(processed_dir, '7_co_occurrence_tracks_train.pkl'),
    'tracks')
t_end = time.time()
d.append({'task': 'co_ocurrence_tracks', 'run_time': t_end - t_start})

t_start = time.time()
co_occurrence_artists_train = co_ocurrence(
    df_playlists_train, df_artists_train,
    os.path.join(processed_dir, '7_co_occurrence_artists_train.pkl'),
    'artists')
t_end = time.time()
d.append({'task': 'co_ocurrence_artists', 'run_time': t_end - t_start})

t_start = time.time()
co_occurrence_albums_train = co_ocurrence(
    df_playlists_train, df_albums_train,
    os.path.join(processed_dir, '7_co_occurrence_albums_train.pkl'),
    'albums')
t_end = time.time()
d.append({'task': 'co_ocurrence_albums', 'run_time': t_end - t_start})

print("co_occurrence_tracks_train:", len(co_occurrence_tracks_train))
print()
print("co_occurrence_artists_train:", len(co_occurrence_artists_train))
print()
print("co_occurrence_albums_train:", len(co_occurrence_albums_train))

Co-occurrence tracks - Saved: C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\7_co_occurrence_tracks_train.pkl
Co-occurrence artists - Saved: C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\7_co_occurrence_artists_train.pkl
Co-occurrence albums - Saved: C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\7_co_occurrence_albums_train.pkl
co_occurrence_tracks_train: 63699

co_occurrence_artists_train: 22639

co_occurrence_albums_train: 46688


# Model

In [13]:
def prepare_inputs(playlist_trx, trx_train, trx_test, fraction):
    """Split one playlist into reference and expected tracks.

    A random share of the playlist's track ids is drawn as the "reference"
    (the tracks a recommender is allowed to see); the rest of the playlist is
    what the recommender is expected to find.

    Parameters
    ----------
    playlist_trx : object
        Playlist record whose ``tracks`` attribute is a list of track records
        exposing an ``'id'`` key.
    trx_train : pandas.DataFrame
        Training tracks, one column per track, with an ``id`` row.
    trx_test : pandas.DataFrame
        Test tracks in the same layout.
    fraction : float
        Share of the playlist used as reference. The number of reference ids
        is ``int((len(tracks) + 1) * fraction)``, capped at the playlist
        length so that ``fraction=1.0`` no longer makes ``random.sample``
        fail with "sample larger than population".

    Returns
    -------
    tuple of pandas.DataFrame
        ``(trx_available, reference_trx, expected_trx)``, one row per track:
        training tracks that are not reference tracks, test tracks drawn as
        reference, and test tracks of the playlist that were not drawn.
    """
    trx = playlist_trx.tracks

    ids = pd.DataFrame(trx).id  # ids of the original playlist
    id_pool = ids.values.tolist()
    n = min(int((len(trx) + 1) * fraction), len(id_pool))
    selected_ids = random.sample(id_pool, n)

    db = trx_train.T
    trx_available = db[~db.id.isin(selected_ids)]

    # Transpose the test frame once and reuse it (it used to be done twice).
    tst = trx_test.T
    aux = tst.id
    in_reference = aux.isin(selected_ids)
    reference_trx = tst[in_reference]
    expected_trx = tst[~in_reference & aux.isin(ids)]

    return trx_available, reference_trx, expected_trx

In [14]:
def get_tracks_from_playlist(pid):
    """Return the tracks of training playlist ``pid`` as a DataFrame.

    Reads the module-level ``playlists_train`` table, takes the ``tracks``
    entry of the row labelled ``pid`` and wraps it in a ``pandas.DataFrame``.
    """
    playlist_tracks = playlists_train.loc[pid].tracks
    return pd.DataFrame(playlist_tracks)

# a = get_tracks_from_playlist('5mveLg9twuCna0WwuzvXch')

## Random

In [15]:
def get_trk_rnd(playlist_trx, trx_train, trx_test, ref_fraction):
    """Baseline recommender: pick uniformly random training tracks.

    Splits the playlist with ``prepare_inputs`` and then draws as many ids
    from the available training tracks as there are expected tracks.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(reference_trx, expected_trx, selected_trx)``.
    """
    available, reference, expected = prepare_inputs(
        playlist_trx, trx_train, trx_test, ref_fraction)

    candidate_ids = available['id'].values.tolist()
    drawn_ids = random.sample(candidate_ids, len(expected))

    picked = available[available.id.isin(drawn_ids)]
    # keep=False removes every row whose id occurs more than once.
    picked = picked.drop_duplicates(subset=['id'], keep=False)

    return reference, expected, picked


# reference_trx, expected_trx, selected_trx = get_trk_rnd(
#     df_playlists_test['1h4bUPnrpgn8qXGJ5dvTsu'], df_tracks_train,
#     df_tracks_test, .5)
# print(len(reference_trx), len(expected_trx), len(selected_trx))
# reference_trx, expected_trx, selected_trx

## Stratified selection

In [16]:
# All tracks
# Path assembled with os.path.join so the cell is not tied to Windows path
# separators.
data_path = os.path.join(os.getcwd(), 'data', 'processed', '4_tracks_dict.pkl')
txt = 'All tracks as a dictionary - ' + data_path

if os.path.exists(data_path):
    print('Loaded: ' + txt)
    with open(data_path, 'rb') as f:
        tracks_dict = pickle.load(f)
else:
    # NOTE(review): dict_all_tracks and playlists_dict are not defined in the
    # visible part of this notebook - a missing pickle ends in a NameError on
    # a fresh kernel.
    tracks_dict = dict_all_tracks(data_path, playlists_dict)

print(len(tracks_dict))

# One row per track, plus the number of playlists it appears in (`cnt`).
df_tracks = pd.DataFrame.from_dict(tracks_dict)
df_tracks = df_tracks.T
df_tracks['cnt'] = df_tracks.in_playlist.str.len()
df_tracks = df_tracks.sort_values(by='cnt', ascending=False)
df_tracks['cum_sum'] = df_tracks['cnt'].cumsum()
# Repeat every track `cnt` times so that a uniform draw over the rows is a
# frequency-weighted draw over the tracks; transpose to the one-column-per-
# track layout used by the other track tables.
tracks_frequency = df_tracks.loc[df_tracks.index.repeat(df_tracks.cnt)]
tracks_frequency = tracks_frequency.T

Loaded: All tracks as a dictionary - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\4_tracks_dict.pkl
85458


In [17]:
def get_trk_strata(playlist_trx, trx_train, trx_test, ref_fraction,
                   n_candidates=5000):
    """Recommend tracks drawn with probability proportional to their frequency.

    ``get_trk`` passes ``tracks_frequency`` as ``trx_train``, a table in which
    each track is repeated once per playlist it appears in; a uniform draw
    over its rows is therefore a popularity-weighted draw over tracks.

    Parameters
    ----------
    playlist_trx, trx_train, trx_test, ref_fraction
        Forwarded to ``prepare_inputs``.
    n_candidates : int, optional
        Number of rows drawn before de-duplication (default 5000, the value
        that used to be hard-coded). It is capped at the number of available
        rows so that small tables no longer make ``random.sample`` fail.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(reference_trx, expected_trx, selected_trx)`` where ``selected_trx``
        holds at most ``len(expected_trx)`` distinct tracks.
    """
    trx_available, reference_trx, expected_trx = prepare_inputs(
        playlist_trx, trx_train, trx_test, ref_fraction)

    pool = trx_available['id'].values.tolist()
    draws = random.sample(pool, min(n_candidates, len(pool)))

    # Keep the first occurrence of every drawn id, in draw order, and stop at
    # the number of tracks we need. The previous
    # drop_duplicates(keep=False) discarded EVERY track that occurs more than
    # once in the frequency table, i.e. exactly the popular tracks the
    # weighting is meant to favour.
    wanted = list(dict.fromkeys(draws))[:len(expected_trx)]

    selected_trx = trx_available[trx_available.id.isin(wanted)]
    selected_trx = selected_trx.drop_duplicates(subset=['id'], keep='first')
    return reference_trx, expected_trx, selected_trx

## Album

In [18]:
def get_trk_album(playlist_trx, trx_train, trx_test, ref_fraction):
    """Recommend tracks through album co-occurrence.

    For every reference track, look up the second row of its album's
    co-occurrence table, pick a random training playlist containing that
    album and draw one random track from it. Albums unknown to the training
    set (or whose table has a single row) fall back to a random available
    track. Up to 10 passes are made until enough tracks are collected.

    Reads the module-level ``albums_dict_train``,
    ``co_occurrence_albums_train`` and (via ``get_tracks_from_playlist``)
    ``playlists_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(reference_trx, expected_trx, selected_trx)`` where ``selected_trx``
        holds at most ``len(expected_trx)`` available training tracks.
    """
    trx_available, reference_trx, expected_trx = prepare_inputs(
        playlist_trx, trx_train, trx_test, ref_fraction)

    select = []
    selected_trx = trx_available[trx_available.id.isin(select)]
    try_n = 0
    while len(selected_trx) < len(expected_trx) and try_n < 10:
        try_n = try_n + 1
        for cur_trk in reference_trx.index:
            aid = reference_trx.loc[cur_trk].album['id']
            neighbours = (co_occurrence_albums_train[aid]
                          if aid in albums_dict_train else None)
            # A one-row table has no second entry; .iloc[1] would raise.
            if neighbours is not None and len(neighbours) > 1:
                co_ocur_aid = neighbours.iloc[1].name
                playlist_id = random.choice(
                    list(albums_dict_train[co_ocur_aid]['in_playlist']))
                # random.sample() rejects sets on Python >= 3.11, so the
                # distinct ids are materialised as a list first. The playlist
                # is expanded once (it used to be expanded twice).
                aux = random.choice(
                    list(set(get_tracks_from_playlist(playlist_id).id)))
            else:
                aux = random.choice(trx_available['id'].values.tolist())
            select.append(aux)

        candidates = pd.Series(select, dtype=object)
        candidates = candidates[~candidates.isin(reference_trx['id'])]
        # Only ids that can actually be returned count towards the quota.
        candidates = candidates[candidates.isin(trx_available['id'])]
        # keep='first': a candidate suggested several times is kept once.
        # The previous keep=False threw such candidates away entirely.
        candidates = candidates.drop_duplicates(keep='first')
        wanted = candidates.iloc[:len(expected_trx)]

        selected_trx = trx_available[trx_available.id.isin(wanted)]
    return reference_trx, expected_trx, selected_trx

## Artist

In [19]:
def get_trk_artist(playlist_trx, trx_train, trx_test, ref_fraction):
    """Recommend tracks through artist co-occurrence.

    For every reference track, look up the second row of the co-occurrence
    table of its first listed artist, pick a random training playlist
    containing that artist and draw one random track from it. Artists unknown
    to the training set (or whose table has a single row) fall back to a
    random available track. Up to 10 passes are made until enough tracks are
    collected.

    Reads the module-level ``artists_dict_train``,
    ``co_occurrence_artists_train`` and (via ``get_tracks_from_playlist``)
    ``playlists_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(reference_trx, expected_trx, selected_trx)`` where ``selected_trx``
        holds at most ``len(expected_trx)`` available training tracks.
    """
    trx_available, reference_trx, expected_trx = prepare_inputs(
        playlist_trx, trx_train, trx_test, ref_fraction)

    select = []
    selected_trx = trx_available[trx_available.id.isin(select)]
    try_n = 0
    while len(selected_trx) < len(expected_trx) and try_n < 10:
        try_n = try_n + 1
        for cur_trk in reference_trx.index:
            aid = reference_trx.loc[cur_trk].artists[0]['id']
            neighbours = (co_occurrence_artists_train[aid]
                          if aid in artists_dict_train else None)
            # A one-row table has no second entry; .iloc[1] would raise.
            if neighbours is not None and len(neighbours) > 1:
                co_ocur_aid = neighbours.iloc[1].name
                playlist_id = random.choice(
                    list(artists_dict_train[co_ocur_aid]['in_playlist']))
                # random.sample() rejects sets on Python >= 3.11, so the
                # distinct ids are materialised as a list first. The playlist
                # is expanded once (it used to be expanded twice).
                aux = random.choice(
                    list(set(get_tracks_from_playlist(playlist_id).id)))
            else:
                aux = random.choice(trx_available['id'].values.tolist())
            select.append(aux)

        candidates = pd.Series(select, dtype=object)
        candidates = candidates[~candidates.isin(reference_trx['id'])]
        # Only ids that can actually be returned count towards the quota.
        candidates = candidates[candidates.isin(trx_available['id'])]
        # keep='first': a candidate suggested several times is kept once.
        # The previous keep=False threw such candidates away entirely.
        candidates = candidates.drop_duplicates(keep='first')
        wanted = candidates.iloc[:len(expected_trx)]

        selected_trx = trx_available[trx_available.id.isin(wanted)]
    return reference_trx, expected_trx, selected_trx

## Playlist

In [20]:
def get_trk_playlist(playlist_trx, trx_train, trx_test, ref_fraction):
    """Recommend tracks through track co-occurrence.

    For every reference track, look up the second row of its co-occurrence
    table, pick a random training playlist containing that track and draw one
    random track from it. Tracks unknown to the training set (or whose table
    has a single row) fall back to a random available track. Up to 10 passes
    are made until enough tracks are collected.

    Reads the module-level ``tracks_dict_train``,
    ``co_occurrence_tracks_train`` and (via ``get_tracks_from_playlist``)
    ``playlists_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(reference_trx, expected_trx, selected_trx)`` where ``selected_trx``
        holds at most ``len(expected_trx)`` available training tracks.
    """
    trx_available, reference_trx, expected_trx = prepare_inputs(
        playlist_trx, trx_train, trx_test, ref_fraction)

    select = []
    selected_trx = trx_available[trx_available.id.isin(select)]
    try_n = 0
    while len(selected_trx) < len(expected_trx) and try_n < 10:
        try_n = try_n + 1
        # The row labels of reference_trx are the track ids themselves.
        for aid in reference_trx.index:
            neighbours = (co_occurrence_tracks_train[aid]
                          if aid in tracks_dict_train else None)
            # A one-row table has no second entry; .iloc[1] would raise.
            if neighbours is not None and len(neighbours) > 1:
                co_ocur_aid = neighbours.iloc[1].name
                playlist_id = random.choice(
                    list(tracks_dict_train[co_ocur_aid]['in_playlist']))
                # random.sample() rejects sets on Python >= 3.11, so the
                # distinct ids are materialised as a list first. The playlist
                # is expanded once (it used to be expanded twice).
                aux = random.choice(
                    list(set(get_tracks_from_playlist(playlist_id).id)))
            else:
                aux = random.choice(trx_available['id'].values.tolist())
            select.append(aux)

        candidates = pd.Series(select, dtype=object)
        candidates = candidates[~candidates.isin(reference_trx['id'])]
        # Only ids that can actually be returned count towards the quota.
        candidates = candidates[candidates.isin(trx_available['id'])]
        # keep='first': a candidate suggested several times is kept once.
        # The previous keep=False threw such candidates away entirely.
        candidates = candidates.drop_duplicates(keep='first')
        wanted = candidates.iloc[:len(expected_trx)]

        selected_trx = trx_available[trx_available.id.isin(wanted)]
    return reference_trx, expected_trx, selected_trx

# Evaluate models

In [21]:
def get_trk(method, reference, ref_fraction):
    """Run one recommendation strategy on a test playlist.

    Parameters
    ----------
    method : str
        ``'random'``, ``'strata'``, ``'artist'``, ``'album'`` or
        ``'playlist'``.
    reference : hashable
        Column label of the playlist in the module-level
        ``df_playlists_test``.
    ref_fraction : float
        Share of the playlist handed to the strategy as reference tracks.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(reference_trx, expected_trx, selected_trx)`` as returned by the
        chosen strategy.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (this used to surface
        as an UnboundLocalError at the return statement).
    """
    known_methods = ('random', 'strata', 'artist', 'album', 'playlist')
    if method not in known_methods:
        raise ValueError("Unknown method %r; expected one of: %s" %
                         (method, ', '.join(known_methods)))

    playlist_trx = df_playlists_test[reference]

    if method == 'random':
        return get_trk_rnd(playlist_trx, df_tracks_train, df_tracks_test,
                           ref_fraction)
    if method == 'strata':
        # The stratified strategy samples from the frequency-expanded table.
        return get_trk_strata(playlist_trx, tracks_frequency, df_tracks_test,
                              ref_fraction)
    if method == 'artist':
        return get_trk_artist(playlist_trx, df_tracks_train, df_tracks_test,
                              ref_fraction)
    if method == 'album':
        return get_trk_album(playlist_trx, df_tracks_train, df_tracks_test,
                             ref_fraction)
    return get_trk_playlist(playlist_trx, df_tracks_train, df_tracks_test,
                            ref_fraction)

In [22]:
# Recommendation strategies to evaluate; each name is dispatched by get_trk.
methods = ['random', 'strata', 'artist', 'album', 'playlist']
# Values passed to get_trk as ref_fraction (share of each playlist used as
# reference tracks).
splits = [.25, .5, .75]

In [23]:
gc.collect()

result_columns = [
    'playlist', 'method', 'frac', 'total_tracks', 'selected_tracks', 'matches',
    'error', 'run_time'
]
# One dict per (method, fraction, playlist). The table is built once at the
# end: DataFrame.append was removed in pandas 2.0 and growing a frame row by
# row is quadratic.
records = []

a = df_playlists_test
n_playlists = a.shape[1]  # playlists are the columns of `a`
for method in methods:
    print()
    print(method)
    t_start = time.time()
    for frac in splits:
        print(frac)
        for p, playlist in enumerate(a):
            t_pl_start = time.time()
            if p % 100 == 0:
                print("%s - %d/%d - LEN: %d" %
                      (datetime.datetime.now().strftime(format), p,
                       n_playlists, len(records)))
            reference_trx, expected_trx, selected_trx = get_trk(
                method, playlist, frac)

            matches = len(selected_trx[selected_trx.id.isin(
                expected_trx.id)].id)
            total_trx = a[playlist].total_tracks
            # NOTE(review): the error is relative to the whole playlist, not
            # to len(expected_trx), so it cannot reach 0 even when every
            # expected track is found - confirm this is intended.
            error = 1 - matches / total_trx
            sel_trx = len(selected_trx)

            t_pl_end = time.time()
            records.append({
                'playlist': playlist,
                'method': method,
                'frac': frac,
                'total_tracks': total_trx,
                'selected_tracks': sel_trx,
                'matches': matches,
                'error': error,
                'run_time': t_pl_end - t_pl_start
            })
    t_end = time.time()
    # `d` is the run-time log created in the co-occurrence cell.
    d.append({'task': 'model_' + method, 'run_time': t_end - t_start})

results = pd.DataFrame(records, columns=result_columns)
results.set_index('playlist', inplace=True)
# print(len(reference_trx), len(expected_trx), len(selected_trx))


random
0.25
22/05/2022 09:59 - 0/533 - LEN: 0
22/05/2022 09:59 - 100/533 - LEN: 100
22/05/2022 10:00 - 200/533 - LEN: 200
22/05/2022 10:00 - 300/533 - LEN: 300
22/05/2022 10:00 - 400/533 - LEN: 400
22/05/2022 10:00 - 500/533 - LEN: 500
0.5
22/05/2022 10:00 - 0/533 - LEN: 533
22/05/2022 10:00 - 100/533 - LEN: 633
22/05/2022 10:01 - 200/533 - LEN: 733
22/05/2022 10:01 - 300/533 - LEN: 833
22/05/2022 10:01 - 400/533 - LEN: 933
22/05/2022 10:01 - 500/533 - LEN: 1033
0.75
22/05/2022 10:01 - 0/533 - LEN: 1066
22/05/2022 10:02 - 100/533 - LEN: 1166
22/05/2022 10:02 - 200/533 - LEN: 1266
22/05/2022 10:02 - 300/533 - LEN: 1366
22/05/2022 10:02 - 400/533 - LEN: 1466
22/05/2022 10:02 - 500/533 - LEN: 1566

strata
0.25
22/05/2022 10:02 - 0/533 - LEN: 1599
22/05/2022 10:03 - 100/533 - LEN: 1699
22/05/2022 10:03 - 200/533 - LEN: 1799
22/05/2022 10:03 - 300/533 - LEN: 1899
22/05/2022 10:04 - 400/533 - LEN: 1999
22/05/2022 10:04 - 500/533 - LEN: 2099
0.5
22/05/2022 10:04 - 0/533 - LEN: 2132
22/05/202

In [24]:
# Display the evaluation table (indexed by playlist id).
results

Unnamed: 0_level_0,method,frac,total_tracks,selected_tracks,matches,error,run_time
playlist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1h4bUPnrpgn8qXGJ5dvTsu,random,0.25,100,75,0,1.00,0.14
7Mht6GZuNHUvNwYMTDgB6b,random,0.25,25,19,0,1.00,0.12
6mmP0zJCnevFrXGWVSMjPP,random,0.25,100,74,0,1.00,0.13
4UJ9uTJatHD4isU6JjC9BB,random,0.25,100,75,0,1.00,0.15
0wL43aIFVaADL1Xbd6JTL9,random,0.25,100,75,0,1.00,0.14
...,...,...,...,...,...,...,...
7FJ4amm0nefcEIUysxLfuv,playlist,0.75,53,13,0,1.00,0.20
5Gr2UCda20e13ZPHZ1lHTd,playlist,0.75,100,25,0,1.00,0.30
3y2wvGfXaszvSI1fZWpllM,playlist,0.75,25,6,0,1.00,0.16
4CeyODedBm7b9bObDpkoGw,playlist,0.75,24,6,0,1.00,0.16


In [25]:
# Evaluation results
# Path assembled with os.path.join so the cell is not tied to Windows path
# separators.
data_path = os.path.join(os.getcwd(), 'data', 'processed', '10_results.pkl')
txt = 'Results - ' + data_path

# NOTE(review): when the cache file already exists, the freshly computed
# `results` is REPLACED by the cached copy; results are only written on the
# first run. Delete the file to persist a new evaluation.
if os.path.exists(data_path):
    print('Loaded: ' + txt)
    with open(data_path, 'rb') as f:
        results = pickle.load(f)
else:
    with open(data_path, 'wb') as f:
        pickle.dump(results, f)

print(len(results))

7995


# Save run time

In [26]:
# Persist the run-time log collected in `d` (one record per timed task).
# Path assembled with os.path.join so the cell is not tied to Windows path
# separators.
data_path = os.path.join(os.getcwd(), 'data', 'processed', 'run_times_2.pkl')
txt = 'Run times - ' + data_path

profiling_grid = pd.DataFrame(d)
print(len(profiling_grid))

# The file is overwritten unconditionally; report the save once it is done.
with open(data_path, 'wb') as f:
    pickle.dump(profiling_grid, f)
print('Saved: ' + txt)

8
Saved: Run times - C:\Users\map_f\OneDrive\Documents\GitHub\Follow_the_beat\data\processed\run_times_3.pkl


In [27]:
# Rebuild the run-time table from `d` (one row per timed task) and display it.
profiling_grid = pd.DataFrame(d)
profiling_grid

Unnamed: 0,task,run_time
0,co_ocurrence_tracks,287.2
1,co_ocurrence_artists,125.85
2,co_ocurrence_albums,241.4
3,model_random,201.51
4,model_strata,331.17
5,model_artist,456.2
6,model_album,476.48
7,model_playlist,444.29
