### Imports ###

In [1]:
import ast

import numpy as np
import pandas as pd

### Set filepaths ###

In [25]:
# Input CSV locations, given relative to the notebook's working directory.
filepath_artists = "./artists.csv"  # artist profiles, loaded into df_artists
filepath_genres = "./genre_translations.csv"  # genre names per locale, loaded into df_genres
filepath_musicmatch = "./datalake_musicmatch.csv"  # Musicmatch sessions, loaded into df_musicmatch
filepath_jive = "./datalake_jive.csv"  # JIVE sessions, loaded into df_jive

# Get artist dataframe #
### Read artist data ###

Load the data for all approved artists from the CSV file and drop the rows that have no value for the genre(s). Such rows should not be present in the first place, so they are removed defensively.

In [6]:
# Load the artist export and keep only the rows that carry a genre string.
artists_raw = pd.read_csv(filepath_artists)
df_artists = artists_raw.dropna(subset=["cached_genres"])

# Short sanity report: number of remaining rows and the dtype of every column.
artists_report = "The length of the dataframe:\t{}\n\nThe dtypes of all columns:\n\n{}\n"
print(artists_report.format(len(df_artists), df_artists.dtypes))

df_artists.head(10)

The length of the dataframe:	9391

The dtypes of all columns:

id                   int64
name                object
country             object
city                object
locale              object
cached_genres       object
act                  int64
num_members        float64
set_length         float64
cover                int64
ranking            float64
average_rating       int64
reviews_count      float64
reviews_average    float64
minimum_price      float64
price              float64
dtype: object



Unnamed: 0,id,name,country,city,locale,cached_genres,act,num_members,set_length,cover,ranking,average_rating,reviews_count,reviews_average,minimum_price,price
0,14401,JEZ GUITAR,es,Barcelona,es,"Acústico, Chill out, Flamenco",2,1.0,90.0,2,0.0,0,0.0,0.0,0.0,185.0
1,3796,James Partoir / Instrumental Fingerstyle Guitar,de,Marburg (an der Lahn),de,"Fingerstyle, Akustisch",2,1.0,120.0,1,80.0,195,0.0,0.0,100.0,225.0
2,11862,Sara Memphis,es,Madrid,es,"Electrónica, Dance, Dubstep",1,1.0,120.0,2,67.0,174,0.0,0.0,0.0,200.0
3,1212,Derick Saxman,nl,Amsterdam,nl,"Soul, Dance, Disco",2,1.0,120.0,0,67.0,168,0.0,0.0,0.0,425.0
4,2553,De Heidedoosjes,nl,Pieterburen,nl,"Kleinkunst, Muziektheater, Entertainment",3,2.0,90.0,2,75.0,178,0.0,0.0,0.0,850.0
5,1550,SPAWN ( ska ),nl,Aalten,nl,"Rock, Punk, Reggae",0,7.0,100.0,1,76.0,252,3.0,9.0,350.0,675.0
6,2032,SoulNL,nl,Amsterdam,nl,"Soul, Nederpop, Hip Hop",0,5.0,60.0,0,75.0,194,0.0,0.0,0.0,1500.0
7,216,Ivan Waters Band,nl,Amsterdam,nl,"Akoestisch, Alternatief, Pop",0,3.0,120.0,1,80.0,197,3.0,9.0,0.0,300.0
8,22547,Saxophonist Philipp Schoof,de,Leipzig,de,"Elektronisch, Chill out",2,1.0,480.0,0,70.0,246,0.0,0.0,0.0,500.0
9,19358,Off the Record Trio,nl,Delft,nl,"Coverband, Pop, Akoestisch",0,3.0,120.0,0,70.0,214,0.0,0.0,0.0,425.0


### Convert 'act' column to several columns for easier readability ###

In [7]:
# The 'act' code identifies the kind of act. Expand it into one 0/1 indicator
# column per kind: 0 = band, 1 = dj, 2 = solo, 3 = ensemble.
act_codes = [("band", 0), ("dj", 1), ("solo", 2), ("ensemble", 3)]

# The indicators go to column positions 8-11, in the order listed above.
for position, (act_name, act_code) in enumerate(act_codes, start=8):
    is_this_act = df_artists["act"] == act_code
    df_artists.insert(position, act_name, is_this_act.astype(int))

# The code column is redundant once the indicator columns exist.
df_artists = df_artists.drop(columns="act")

df_artists.head()

Unnamed: 0,id,name,country,city,locale,cached_genres,num_members,band,dj,solo,ensemble,set_length,cover,ranking,average_rating,reviews_count,reviews_average,minimum_price,price
0,14401,JEZ GUITAR,es,Barcelona,es,"Acústico, Chill out, Flamenco",1.0,0,0,1,0,90.0,2,0.0,0,0.0,0.0,0.0,185.0
1,3796,James Partoir / Instrumental Fingerstyle Guitar,de,Marburg (an der Lahn),de,"Fingerstyle, Akustisch",1.0,0,0,1,0,120.0,1,80.0,195,0.0,0.0,100.0,225.0
2,11862,Sara Memphis,es,Madrid,es,"Electrónica, Dance, Dubstep",1.0,0,1,0,0,120.0,2,67.0,174,0.0,0.0,0.0,200.0
3,1212,Derick Saxman,nl,Amsterdam,nl,"Soul, Dance, Disco",1.0,0,0,1,0,120.0,0,67.0,168,0.0,0.0,0.0,425.0
4,2553,De Heidedoosjes,nl,Pieterburen,nl,"Kleinkunst, Muziektheater, Entertainment",2.0,0,0,0,1,90.0,2,75.0,178,0.0,0.0,0.0,850.0


### Do the same for 'cover' column ###

In [8]:
# Translate the 'cover' code into two 0/1 indicator columns:
#   cover == 0 -> covers = 1, own_work = 0
#   cover == 1 -> covers = 0, own_work = 1
#   cover == 2 -> covers = 1, own_work = 1
cover_to_covers = {0: 1, 1: 0, 2: 1}
cover_to_own_work = {0: 0, 1: 1, 2: 1}

# Any other code maps to NaN, which makes the integer cast fail loudly.
covers_flags = df_artists["cover"].map(cover_to_covers).astype(int)
own_work_flags = df_artists["cover"].map(cover_to_own_work).astype(int)

df_artists.insert(7, "covers", covers_flags)
df_artists.insert(8, "own_work", own_work_flags)

# The code column is redundant once the indicator columns exist.
df_artists = df_artists.drop(columns="cover")

df_artists.head(10)

Unnamed: 0,id,name,country,city,locale,cached_genres,num_members,covers,own_work,band,dj,solo,ensemble,set_length,ranking,average_rating,reviews_count,reviews_average,minimum_price,price
0,14401,JEZ GUITAR,es,Barcelona,es,"Acústico, Chill out, Flamenco",1.0,1,1,0,0,1,0,90.0,0.0,0,0.0,0.0,0.0,185.0
1,3796,James Partoir / Instrumental Fingerstyle Guitar,de,Marburg (an der Lahn),de,"Fingerstyle, Akustisch",1.0,0,1,0,0,1,0,120.0,80.0,195,0.0,0.0,100.0,225.0
2,11862,Sara Memphis,es,Madrid,es,"Electrónica, Dance, Dubstep",1.0,1,1,0,1,0,0,120.0,67.0,174,0.0,0.0,0.0,200.0
3,1212,Derick Saxman,nl,Amsterdam,nl,"Soul, Dance, Disco",1.0,1,0,0,0,1,0,120.0,67.0,168,0.0,0.0,0.0,425.0
4,2553,De Heidedoosjes,nl,Pieterburen,nl,"Kleinkunst, Muziektheater, Entertainment",2.0,1,1,0,0,0,1,90.0,75.0,178,0.0,0.0,0.0,850.0
5,1550,SPAWN ( ska ),nl,Aalten,nl,"Rock, Punk, Reggae",7.0,0,1,1,0,0,0,100.0,76.0,252,3.0,9.0,350.0,675.0
6,2032,SoulNL,nl,Amsterdam,nl,"Soul, Nederpop, Hip Hop",5.0,1,0,1,0,0,0,60.0,75.0,194,0.0,0.0,0.0,1500.0
7,216,Ivan Waters Band,nl,Amsterdam,nl,"Akoestisch, Alternatief, Pop",3.0,0,1,1,0,0,0,120.0,80.0,197,3.0,9.0,0.0,300.0
8,22547,Saxophonist Philipp Schoof,de,Leipzig,de,"Elektronisch, Chill out",1.0,1,0,0,0,1,0,480.0,70.0,246,0.0,0.0,0.0,500.0
9,19358,Off the Record Trio,nl,Delft,nl,"Coverband, Pop, Akoestisch",3.0,1,0,1,0,0,0,120.0,70.0,214,0.0,0.0,0.0,425.0


### Get genre ids ###

In [9]:
# Load the genre translation table and discard the columns this notebook
# does not use.
unused_genre_columns = ["description", "slug", "created_at", "updated_at"]
df_genres = pd.read_csv(filepath_genres).drop(columns=unused_genre_columns)

# Short sanity report: number of rows and the dtype of every column.
genres_report = "The length of the dataframe:\t{}\n\nThe dtypes of all columns:\n\n{}\n"
print(genres_report.format(len(df_genres), df_genres.dtypes))

df_genres.head(10)

The length of the dataframe:	897

The dtypes of all columns:

id           int64
genre_id     int64
name        object
locale      object
dtype: object



Unnamed: 0,id,genre_id,name,locale
0,1,1,A capela,es
1,2,4,Afro,es
2,3,2,Acústico,es
3,4,3,Alternativa,es
4,5,95,Americana,es
5,6,5,Bachata,es
6,7,86,Balcánica,es
7,8,6,Barroca,es
8,9,7,Bebop,es
9,10,8,Big Band,es


In [10]:
# Lookup from a (translated) genre name to its genre_id.
# NOTE(review): names from all locales share this one dict; this assumes a
# name never maps to different genre_ids in different locales - confirm.
genres_to_index = dict(zip(df_genres["name"], df_genres["genre_id"]))
genre_names_strings = df_artists["cached_genres"].tolist()

# Each artist's genres are stored as one ", "-separated string. A name is
# looked up verbatim first; only if that fails is it retried with surrounding
# whitespace removed. A name that is still unknown raises a KeyError.
genre_ids_lists = [
    [
        genres_to_index[name if name in genres_to_index else name.strip()]
        for name in names_string.split(", ")
    ]
    for names_string in genre_names_strings
]

df_artists.insert(3, "genre_ids", genre_ids_lists)
df_artists.head(10)

Unnamed: 0,id,name,country,genre_ids,city,locale,cached_genres,num_members,covers,own_work,...,dj,solo,ensemble,set_length,ranking,average_rating,reviews_count,reviews_average,minimum_price,price
0,14401,JEZ GUITAR,es,"[2, 15, 29]",Barcelona,es,"Acústico, Chill out, Flamenco",1.0,1,1,...,0,1,0,90.0,0.0,0,0.0,0.0,0.0,185.0
1,3796,James Partoir / Instrumental Fingerstyle Guitar,de,"[28, 2]",Marburg (an der Lahn),de,"Fingerstyle, Akustisch",1.0,0,1,...,0,1,0,120.0,80.0,195,0.0,0.0,100.0,225.0
2,11862,Sara Memphis,es,"[25, 18, 23]",Madrid,es,"Electrónica, Dance, Dubstep",1.0,1,1,...,1,0,0,120.0,67.0,174,0.0,0.0,0.0,200.0
3,1212,Derick Saxman,nl,"[70, 18, 22]",Amsterdam,nl,"Soul, Dance, Disco",1.0,1,0,...,0,1,0,120.0,67.0,168,0.0,0.0,0.0,425.0
4,2553,De Heidedoosjes,nl,"[45, 80, 84]",Pieterburen,nl,"Kleinkunst, Muziektheater, Entertainment",2.0,1,1,...,0,0,1,90.0,75.0,178,0.0,0.0,0.0,850.0
5,1550,SPAWN ( ska ),nl,"[61, 56, 58]",Aalten,nl,"Rock, Punk, Reggae",7.0,0,1,...,0,0,0,100.0,76.0,252,3.0,9.0,350.0,675.0
6,2032,SoulNL,nl,"[70, 53, 37]",Amsterdam,nl,"Soul, Nederpop, Hip Hop",5.0,1,0,...,0,0,0,60.0,75.0,194,0.0,0.0,0.0,1500.0
7,216,Ivan Waters Band,nl,"[2, 3, 55]",Amsterdam,nl,"Akoestisch, Alternatief, Pop",3.0,0,1,...,0,0,0,120.0,80.0,197,3.0,9.0,0.0,300.0
8,22547,Saxophonist Philipp Schoof,de,"[25, 15]",Leipzig,de,"Elektronisch, Chill out",1.0,1,0,...,0,1,0,480.0,70.0,246,0.0,0.0,0.0,500.0
9,19358,Off the Record Trio,nl,"[91, 55, 2]",Delft,nl,"Coverband, Pop, Akoestisch",3.0,1,0,...,0,0,0,120.0,70.0,214,0.0,0.0,0.0,425.0


# Get Musicmatch and JIVE dataframe #
### Functions for both, find percentage of likes ###

In [19]:
def add_percentage_liked(df, column_number):
    """
    Takes data on either musicmatch or jive sessions and computes what
    percentage of artists was liked during that session.

    For every row the percentage is
    (len(liked_artists) / len(seen_artists)) * 100. A session without likes
    gets 0, and so does a session without any seen artists (which would
    otherwise raise a ZeroDivisionError).

    Parameters

        df:             the dataframe containing the sessions; it must have
                        the list-valued columns "seen_artists" and
                        "liked_artists"

        column_number:  the index at which the new column will be inserted

    Returns

        df:             the same dataframe object, with the float column
                        "percentage_liked" inserted in place
    """
    percentages = np.array(
        [
            (len(liked) / len(seen)) * 100 if liked and seen else 0
            for seen, liked in zip(df["seen_artists"], df["liked_artists"])
        ],
        dtype=float,  # keeps the column float64 even for an empty dataframe
    )
    df.insert(column_number, "percentage_liked", percentages)
    return df

def add_liked_genres(df, column_number, artists_df=None):
    """
    Takes data on either musicmatch or jive sessions and get the
    genres that were liked, adding them to new column.

    For every session the genre ids of all liked artists are concatenated in
    the order of "liked_artists"; a genre shared by several liked artists
    therefore appears several times. Liked artists that are not present in
    the artist table contribute no genres.

    Parameters

        df:             the dataframe containing the sessions; it must have
                        the list-valued column "liked_artists"

        column_number:  the index at which the new column will be inserted

        artists_df:     optional dataframe with the columns "id" and
                        "genre_ids" used to look up the genres of an artist;
                        defaults to the notebook-level df_artists

    Returns

        df:             the same dataframe object, with the column
                        "liked_genres" inserted in place
    """
    if artists_df is None:
        # Backward-compatible default: use the artist table built earlier in
        # the notebook.
        artists_df = df_artists
    artists_to_genres = dict(zip(artists_df["id"], artists_df["genre_ids"]))

    liked_genres_lists = [
        [
            genre
            for artist in liked_artists
            for genre in artists_to_genres.get(artist, [])
        ]
        for liked_artists in df["liked_artists"]
    ]
    df.insert(column_number, "liked_genres", liked_genres_lists)
    return df

### Get Musicmatch sessions ###
Read data from the CSV file and find out what percentage of seen artists was liked and which genres were liked.

In [21]:
# These columns are stored in the CSV as Python list literals. They are parsed
# with ast.literal_eval rather than eval, so a cell can only ever produce a
# literal value and can never execute code that ended up in the data file.
musicmatch_list_columns = [
    "liked_artists",
    "seen_artists",
    "disliked_artists",
    "timestamps",
    "likes_sequence",
]
df_musicmatch = pd.read_csv(
    filepath_musicmatch,
    converters={column: ast.literal_eval for column in musicmatch_list_columns},
)

# Derived columns: share of seen artists that was liked, and the genre ids of
# the liked artists.
df_musicmatch = add_percentage_liked(df_musicmatch, 2)
df_musicmatch = add_liked_genres(df_musicmatch, 7)

print("The length of the dataframe:\t{}\n\nThe dtypes of all columns:\n\n{}\n".format(len(df_musicmatch),df_musicmatch.dtypes))

df_musicmatch.head(10)

The length of the dataframe:	1280

The dtypes of all columns:

source_id             int64
current_url          object
percentage_liked    float64
seen_artists         object
timestamps           object
liked_artists        object
disliked_artists     object
liked_genres         object
likes_sequence       object
dtype: object



Unnamed: 0,source_id,current_url,percentage_liked,seen_artists,timestamps,liked_artists,disliked_artists,liked_genres,likes_sequence
0,1643,https://www.gigstarter.nl/calls/1643/musicmatc...,0.0,"[3805, 1146, 4309, 3868, 1164, 1407, 1504, 169...","[1522827445619, 1522827449510, 1522827452921, ...",[],"[3805, 1146, 4309, 3868, 1164, 1407, 1504, 169...",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1647,http://www.gigstarter.be/calls/1647/musicmatch...,13.235294,"[3060, 3435, 3540, 2977, 3743, 3789, 3175, 611...","[1522857520574, 1522857535728, 1522857559668, ...","[2977, 3175, 3499, 7894, 2973, 3343, 3580, 668...","[3060, 3435, 3540, 3743, 3789, 6110, 2866, 356...","[53, 45, 94, 30, 45, 2, 53, 67, 67, 1, 80, 61,...","[0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,1649,http://www.gigstarter.nl/calls/1649/musicmatch...,0.0,"[3155, 343, 4517, 1161, 2628, 102, 2411, 2759,...","[1522862834009, 1522862839576, 1522862842288, ...",[],"[3155, 343, 4517, 1161, 2628, 102, 2411, 2759,...",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1726,https://www.gigstarter.es/calls/1726/musicmatc...,27.272727,"[7952, 9274, 3318, 2695, 3035, 5793, 3870, 447...","[1524351227331, 1524351237388, 1524351244801, ...","[3870, 6958, 8504, 9274, 2691, 3878, 3901, 418...","[7952, 9274, 3318, 2695, 3035, 5793, 4476, 732...","[17, 30, 39, 2, 55, 61, 2, 61, 55, 61, 70, 31,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ..."
4,1675,https://www.gigstarter.be/calls/1675/musicmatc...,0.469484,"[2889, 4468, 3456, 5008, 1609, 3014, 3525, 352...","[1523356481043, 1523356533403, 1523356538042, ...",[2889],"[4468, 3456, 5008, 1609, 3014, 3525, 3527, 353...","[70, 10, 61]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,1676,http://www.gigstarter.nl/calls/1676/musicmatch...,9.52381,"[1212, 1428, 1161, 5865, 1212, 1428, 1161, 586...","[1523356879532, 1523356881322, 1523356883608, ...","[1212, 1161]","[1212, 1428, 1161, 5865, 1428, 5865, 2628, 551...","[70, 18, 22, 55, 53, 22]","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,1694,https://www.gigstarter.nl/calls/1694/musicmatc...,0.595238,"[1974, 1212, 1976, 1428, 2013, 2045, 2100, 214...","[1523619349736, 1523619358568, 1523619364200, ...","[1869, 5863, 6037, 3004, 889]","[1974, 1212, 1976, 1428, 2013, 2045, 2100, 214...","[2, 40, 24, 77, 76, 2, 67, 2, 55, 61, 10, 10, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,1693,https://www.gigstarter.es/calls/1693/musicmatc...,0.663717,"[8220, 2368, 2460, 3947, 2729, 2735, 3318, 378...","[1523638337695, 1523638343520, 1523638351871, ...","[4476, 5051, 6472]","[8220, 2368, 2460, 3947, 2729, 2735, 3318, 378...","[55, 40, 70, 11, 46, 65, 31, 58, 46]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,1721,https://www.gigstarter.es/calls/1721/musicmatc...,0.571429,"[7952, 9274, 3318, 2691, 2400, 8220, 3878, 312...","[1524427882308, 1524427883564, 1524427884174, ...",[3912],"[7952, 9274, 3318, 2691, 2400, 8220, 3878, 312...","[61, 91, 55]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,1759,https://www.gigstarter.es/calls/1759/musicmatc...,0.0,"[7327, 7952, 8295, 9274, 9292, 3318, 3870, 2691]","[1524844711026, 1524844714003, 1524844715853, ...",[],"[7327, 7952, 8295, 9274, 9292, 3318, 3870, 2691]",[],"[0, 0, 0, 0, 0, 0, 0, 0]"


### Get JIVE sessions ###
Here too, read data and find percentages of likes.

In [28]:
# These columns are stored in the CSV as Python list literals. They are parsed
# with ast.literal_eval rather than eval, so a cell can only ever produce a
# literal value and can never execute code that ended up in the data file.
jive_list_columns = [
    "liked_artists",
    "seen_artists",
    "disliked_artists",
    "timestamps",
    "likes_sequence",
]
df_jive = pd.read_csv(
    filepath_jive,
    converters={column: ast.literal_eval for column in jive_list_columns},
)

# Derived columns: share of seen artists that was liked, and the genre ids of
# the liked artists.
df_jive = add_percentage_liked(df_jive, 4)
df_jive = add_liked_genres(df_jive, 9)

print("The length of the dataframe:\t{}\n\nThe dtypes of all columns:\n\n{}\n".format(len(df_jive),df_jive.dtypes))

df_jive.head(10)

The length of the dataframe:	124

The dtypes of all columns:

current_url          object
user_id              object
user_id_type         object
filter_type          object
percentage_liked    float64
seen_artists         object
timestamps           object
liked_artists        object
disliked_artists     object
liked_genres         object
likes_sequence       object
dtype: object



Unnamed: 0,current_url,user_id,user_id_type,filter_type,percentage_liked,seen_artists,timestamps,liked_artists,disliked_artists,liked_genres,likes_sequence
0,https://www.gigstarter.nl/jive-beta,5ztsfdkb8ra3or42q5b7zf,session,band_proband,52.697095,"[4526, 14527, 327, 6900, 15994, 908, 4280, 157...","[1603316685330, 1603316690350, 1603316694508, ...","[7448, 1718, 220, 2799, 1529, 12031, 13063, 12...","[4526, 14527, 327, 6900, 15994, 908, 4280, 157...","[53, 91, 22, 55, 18, 31, 40, 70, 17, 55, 30, 6...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, ..."
1,https://www.gigstarter.nl/jive-beta,036yzmylcy37yrkfxbnv5y,session,band_proband,2.380952,"[6650, 18676, 2533, 2267, 19657, 3520, 1247, 2...","[1603293650894, 1603293653270, 1603293657599, ...",[1247],"[6650, 18676, 2533, 2267, 19657, 3520, 2438, 7...","[44, 40, 55]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,https://www.gigstarter.fr/jive-beta,td1ilfmnmted7ulkdwm9lj,session,band_type,40.0,"[21167, 16497, 19645, 20227, 10686]","[1603270086849, 1603270112087, 1603270127495, ...","[16497, 10686]","[21167, 19645, 20227]","[21, 38, 54, 38, 14, 25]","[0, 1, 0, 0, 1]"
3,https://www.gigstarter.nl/jive-beta,40559,user_account,band_proband,0.0,[3214],[1604049794708],[],[3214],[],[0]
4,https://www.gigstarter.nl/jive-beta,q7k9jbzc8if298q1azxetk,session,band_proband,66.37931,"[1797, 10242, 15130, 11921, 3488, 13191, 3020,...","[1604050041532, 1604050044082, 1604050048054, ...","[10242, 3488, 1909, 16081, 1343, 1587, 269, 53...","[1797, 15130, 11921, 13191, 3020, 651, 5355, 1...","[40, 10, 46, 77, 46, 29, 81, 55, 61, 53, 44, 2...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,https://www.gigstarter.nl/jive-beta,v4satelp99qam4rlqscs8,session,band_type,75.0,"[12030, 1329, 1074, 4348]","[1604050068095, 1604050069420, 1604050071533, ...","[12030, 1329, 4348]",[1074],"[40, 70, 71, 31, 70, 22, 53]","[1, 1, 0, 1]"
6,https://www.gigstarter.nl/jive-beta,35vin4gg6m6gema5n6d9p,session,band_proband,66.666667,"[14511, 3083, 16278]","[1604049327256, 1604049341900, 1604049343000]","[3083, 16278]",[14511],"[55, 63, 70, 37, 96, 67]","[0, 1, 1]"
7,https://www.gigstarter.nl/jive-beta,7r8swg1ht8m9wzmnhiyodj,session,band_type,33.333333,"[3514, 2097, 205, 39, 17741, 18293, 4104, 4104...","[1604067349071, 1604067350261, 1604067352222, ...","[205, 39, 651, 1545, 4861, 5706, 14511, 2041]","[3514, 2097, 17741, 18293, 4104, 4104, 12569, ...","[2, 40, 46, 69, 53, 76, 30, 60, 86, 76, 83, 91...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."
8,https://www.gigstarter.fr/jive-beta,8v5sckg1wiqrjv8zgmw4ab,session,band_type,100.0,"[20227, 15724]","[1603707418845, 1603707420316]","[20227, 15724]",[],"[40, 16, 75, 50, 81]","[1, 1]"
9,https://www.gigstarter.nl/jive-beta,hr126pdpzbm1w2xruxv0k5,session,band_type,0.0,"[2151, 4434, 17650, 16990, 16950]","[1603966082106, 1603966086420, 1603966088769, ...",[],"[2151, 4434, 17650, 16990, 16950]",[],"[0, 0, 0, 0, 0]"


# Make genre clusters #
Now, we'll investigate which genres are often liked by the same person. 

### Some functions ###

In [45]:
def fill_freq_matrix(freq_matrix, feature_lists):
    """
    Counts, for every feature list, each pair of features that occur together
    and adds the counts to the given frequency matrix.

    A pair is counted once for every two positions i < j in a list, as
    freq_matrix[list[i], list[j]] += 1. The count is therefore ordered
    ([a, b] and [b, a] are separate cells), and a value that is repeated
    within one list adds to the diagonal.

    Parameters

        freq_matrix:     numpy ndarray, indexed directly by the feature values;
                         it is updated in place.

        feature_lists:   list of preprocessed features (lists of numbers).

    Returns

        freq_matrix:     the same array, now holding the added pair counts.
    """
    for features in feature_lists:
        for position in range(len(features)):
            earlier = features[position]
            # Pair the current feature with every feature after it.
            for later in features[position + 1:]:
                freq_matrix[earlier, later] += 1
    return freq_matrix

def make_clusters(freq_matrix, idx_to_name, n):
    """
    Take a frequency matrix and make clusters showing the most common combinations.

    The n cells with the highest counts are selected; for each selected cell
    (row, column) the two names are added to each other's cluster. Cells with
    a count of 0 are never reported, even when fewer than n cells are
    non-zero. A cell on the diagonal puts a name into its own cluster.

    Parameters

        freq_matrix:    numpy ndarray show how often items occur together.

        idx_to_name:    dictionary that lets you get the name of each item given its index.
                        A selected index that is missing raises a KeyError.

        n:              int, n most common combinations are considered. Values
                        larger than the number of cells are clamped; n <= 0
                        yields an empty dictionary.

    Returns

        clusters:       dictionary showing the found clusters
                        (name -> list of names it was combined with)
    """
    # Clamp n: argpartition rejects n > size, and the slice [-0:] would
    # select every cell instead of none.
    n = min(int(n), freq_matrix.size)
    if n <= 0:
        return {}

    flat_counts = freq_matrix.ravel()
    flat_indices = np.argpartition(flat_counts, -n)[-n:]
    # Combinations that were never observed are not "common" combinations.
    flat_indices = flat_indices[flat_counts[flat_indices] > 0]
    row_indices, col_indices = np.unravel_index(flat_indices, freq_matrix.shape)

    clusters = {}
    for index1, index2 in zip(row_indices, col_indices):
        name1 = idx_to_name[index1]
        name2 = idx_to_name[index2]
        # Register the relation in both directions, without duplicates.
        for source, target in ((name1, name2), (name2, name1)):
            members = clusters.setdefault(source, [])
            if target not in members:
                members.append(target)
    return clusters

def write_clusters_to_file(file_name, clusters):
    """
    Takes a file name and a dictionary of clusters and writes clusters to txt file.

    Parameters

        file_name:  path of the output file without extension; ".txt" is appended.

        clusters:   dictionary mapping a name to the list of names in its cluster.

    Each key is written as "<key>:" followed by one "\t - <item>" line per
    cluster member and a blank line. The file is written as UTF-8 so that
    names with non-ASCII characters do not depend on the platform default
    encoding, and it is closed even when writing fails.
    """
    with open("{}.txt".format(file_name), "w", encoding="utf-8") as text_file:
        for key, items in clusters.items():
            text_file.write("{}:\n".format(key))
            for item in items:
                text_file.write("\t - {}\n".format(item))
            text_file.write("\n")

### Make frequency matrix denoting how common each combination of liked genres is ###

In [36]:
# The liked-genre lists contain genre_id values, and fill_freq_matrix uses
# those values directly as row/column indices. The matrix is therefore sized
# from the "genre_id" column (not from the translation-row "id" column), with
# one extra row/column so that the largest genre_id is a valid index.
genre_ids = df_genres["genre_id"].unique()
max_id = int(genre_ids.max())
freq_matrix = np.zeros((max_id + 1, max_id + 1))

lists_liked_genres_musicmatch = df_musicmatch["liked_genres"].tolist()
lists_liked_genres_jive = df_jive["liked_genres"].tolist()

# NOTE(review): fill_freq_matrix counts ordered pairs, so [a, b] and [b, a]
# are separate cells, and a genre repeated within one session adds to the
# diagonal - confirm this is what the clustering step should rank on.
freq_matrix = fill_freq_matrix(freq_matrix, lists_liked_genres_musicmatch)
freq_matrix = fill_freq_matrix(freq_matrix, lists_liked_genres_jive)

array([[    0.,     0.,     0., ...,     0.,     0.,     0.],
       [    0.,   119.,  3574., ...,     0.,     0.,     0.],
       [    0.,  2418., 73919., ...,     0.,     0.,     0.],
       ...,
       [    0.,     0.,     0., ...,     0.,     0.,     0.],
       [    0.,     0.,     0., ...,     0.,     0.,     0.],
       [    0.,     0.,     0., ...,     0.,     0.,     0.]])

### Make clusters given the frequency matrix ###

In [50]:
# Use the English translations as labels: map genre_id -> English genre name.
is_english = df_genres["locale"] == "en"
df_genres_english = df_genres[is_english]
index_to_genre = dict(zip(df_genres_english["genre_id"], df_genres_english["name"]))

# Cluster on the 130 most frequent cells of the frequency matrix and write
# the result to genre_clusters.txt.
genre_clusters = make_clusters(freq_matrix, index_to_genre, 130)
write_clusters_to_file("genre_clusters", genre_clusters)