In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Absolute path of the "interest" directory that sits next to the notebook's
# working directory (one level up, then into "interest").
folder = os.path.join(os.path.abspath(os.pardir), "interest")

In [3]:
# Song-level table, read relative to the current working directory.
music = pd.read_csv(r"rearrange_data/music_features.csv")
# Influencer/follower table, read from the sibling "interest" directory.
influence_csv = os.path.join(folder, "influence_data.csv")
influence = pd.read_csv(influence_csv)

#### Extract the artist list and each artist's songs

In [4]:
# Each "artists_id" cell is expected to hold a bracketed id list such as
# "[123, 456]". Capture the text between the brackets, then split it on the
# same separator the capture pattern accepts (a comma followed by optional
# whitespace) so that "[1,2]" and "[1, 2]" both yield two ids.
music_artists_lst = (
    music["artists_id"]
    .str.extract(r"\[(\d+(?:,\s*\d+)*)\]")[0]
    .str.split(r",\s*")
)
# Unique artist ids (kept as strings) across all songs; the order is arbitrary.
# Rows whose "artists_id" did not match the pattern contribute nothing here.
artists_lst = list(set(music_artists_lst.explode().dropna()))

In [5]:
# Map every artist id (string) to the list of music_ids the artist is credited on.
# `music_artists_lst` is positionally aligned with the rows of `music`, so the two
# can be walked in lockstep instead of materialising a full row per song.
musician_creation = {}
for music_id, artists_id in zip(music["music_id"], music_artists_lst):
    for artist in artists_id:
        musician_creation.setdefault(artist, []).append(music_id)

In [6]:
len(musician_creation)

5527

#### add year info

In [7]:
# Convert release dates to timestamps. The unit must be spelled out: pandas >= 2.0
# rejects the unit-less "datetime64" alias with a TypeError.
music["release_date"] = music["release_date"].astype("datetime64[ns]")
# Calendar year of release, used below to group and order songs.
music["year"] = music["release_date"].dt.year

#### Compute per-year feature averages for each genre

In [8]:
music.columns

Index(['danceability', 'energy', 'valence', 'tempo', 'loudness', 'mode', 'key',
       'acousticness', 'instrumentalness', 'liveness', 'speechiness',
       'explicit', 'duration_ms', 'popularity', 'musician_cnt', 'niche1',
       'niche2', 'pop_rock', 'r_b', 'artists_id', 'release_date', 'music_id',
       'genre', 'year'],
      dtype='object')

In [9]:
# Columns of `music` used as model features: every column listed above except
# artists_id, release_date, music_id, genre and year.
feature_columns = [
    'danceability',
    'energy',
    'valence',
    'tempo',
    'loudness',
    'mode',
    'key',
    'acousticness',
    'instrumentalness',
    'liveness',
    'speechiness',
    'explicit',
    'duration_ms',
    'popularity',
    'musician_cnt',
    'niche1',
    'niche2',
    'pop_rock',
    'r_b',
]

In [10]:
# One row per (year, genre) pair holding the mean of every feature column.
genre = (
    music
    .groupby(["year", "genre"])[feature_columns]
    .mean()
    .reset_index()
)

In [11]:
# Order the table genre by genre, chronologically within each genre, and renumber
# the index so that labels match row positions.
genre = genre.sort_values(by=["genre", "year"]).reset_index(drop=True)

In [12]:
# Copy of the genre table with every feature column scaled by 0.8; "year" and
# "genre" are carried over unchanged and the index stays aligned with `genre`.
genre_development_diminish = genre.assign(
    **{column: genre[column] * 0.8 for column in feature_columns}
)

In [13]:
genre_development_diminish.head()

Unnamed: 0,year,genre,danceability,energy,valence,tempo,loudness,mode,key,acousticness,...,liveness,speechiness,explicit,duration_ms,popularity,musician_cnt,niche1,niche2,pop_rock,r_b
0,1944,niche1,-0.425773,-0.751501,-0.139096,-0.079124,-0.309791,0.21362,-0.025846,1.231592,...,0.189502,0.010919,-0.156847,-0.157783,-1.392744,-0.119718,1.540793,-0.342352,-0.836825,-0.281806
1,1945,niche1,-0.086779,-0.848578,-0.019196,-0.241941,-0.608203,0.113533,-0.064668,1.138727,...,-0.045011,0.006573,-0.156847,-0.384733,-1.421234,-0.053981,1.540793,-0.342352,-0.836825,-0.281806
2,1946,niche1,-0.657405,-0.898225,-0.419601,-0.226451,-0.713062,-0.05285,-0.035378,1.153888,...,0.135937,0.063853,-0.156847,-0.172136,-1.460655,0.074085,1.540793,-0.342352,-0.836825,-0.281806
3,1947,niche1,-0.684399,-1.068329,-0.530062,-0.21814,-1.141873,0.056511,-0.010904,1.136917,...,-0.148331,-0.13466,-0.156847,0.262906,-1.640082,-0.062897,1.540793,-0.342352,-0.836825,-0.281806
4,1948,niche1,-0.60961,-0.98941,-0.540387,-0.258269,-0.764132,0.01737,-0.03229,1.195543,...,0.115637,0.091434,-0.156847,0.020253,-1.531979,-0.103407,1.540793,-0.342352,-0.836825,-0.281806


In [14]:
def niche_time_gen(artist_id, seq_len=20):
    """Build a ``seq_len``-row feature sequence for one artist.

    Reads the notebook-level objects ``musician_creation``, ``music``, ``genre``,
    ``genre_development_diminish`` and ``feature_columns``.

    Parameters
    ----------
    artist_id : key of ``musician_creation``
        Artist whose songs are looked up in ``music``.
    seq_len : int, default 20
        Number of rows in the returned sequence.

    Returns
    -------
    (features, padded) : tuple
        * More than ``seq_len`` songs: a randomly positioned window of ``seq_len``
          consecutive rows of the artist's songs, returned as a DataFrame
          restricted to ``feature_columns``, and ``False``.
        * Otherwise: a NumPy array built by stacking, top to bottom,
          (1) decayed copies of a genre row when the genre history is too short,
          (2) genre rows of the artist's genre from years before the artist's
          first song, and (3) the artist's own songs; and ``True``.

    Notes
    -----
    The two branches return different container types (DataFrame vs ndarray).
    The random window is drawn from NumPy's global random state, so results are
    reproducible only if the caller seeds it.
    """
    music_id = musician_creation[artist_id]
    artist_pieces = music.loc[music["music_id"].isin(music_id), :]  # the artist's rows in `music`
    max_idx = len(artist_pieces)  # number of songs available
    n_features = len(feature_columns)

    if max_idx > seq_len:
        # Enough songs: pick the start of a contiguous window uniformly at random.
        selection = np.random.choice(range(0, max_idx - seq_len + 1), 1, replace=False)[0]
        return artist_pieces.iloc[selection:(selection + seq_len), :][feature_columns], False

    start_year = min(artist_pieces["year"])  # first year the artist has a song
    musician_genre = artist_pieces["genre"].values[0]  # genre of the artist's first listed song
    same_genre = genre["genre"] == musician_genre
    # Genre history strictly before the artist's first year.
    related_genre_development = genre.loc[same_genre & (genre["year"] < start_year), :]
    if len(related_genre_development) == 0:
        # No earlier history: fall back to the earliest row of that genre.
        related_genre_development = genre.loc[same_genre, :].head(1)

    miss = seq_len - max_idx  # rows still to be filled
    related_genre_development = related_genre_development.tail(miss)  # most recent genre rows

    layer1 = np.zeros((0, n_features))
    if len(related_genre_development) < miss:
        # Genre history is still too short: fill the remaining rows with
        # progressively decayed copies of the earliest genre row in use.
        miss = miss - len(related_genre_development)
        baseline_idx = related_genre_development.index[0]
        # NOTE(review): the baseline comes from genre_development_diminish, which is
        # already scaled, and is scaled again before the first write below, so the
        # padding row nearest the genre rows carries the factor twice. Confirm intended.
        baseline = genre_development_diminish.loc[baseline_idx, :][feature_columns].copy()
        diminish_rate = 0.8
        layer1 = np.zeros((miss, n_features))
        # Fill from the bottom up so values shrink further the earlier the row.
        for i in range(miss - 1, -1, -1):
            baseline *= diminish_rate
            layer1[i, :] = baseline.copy()

    layer2 = related_genre_development[feature_columns]
    layer3 = artist_pieces.loc[:, feature_columns]
    res = np.concatenate((layer1, layer2, layer3), axis=0)
    return res, True

#### Build feature sequences for each influencer–follower pair

In [15]:
# One [influencer_id, follower_id] pair per row of `influence`, both as strings so
# they can be compared with the string ids parsed from `music`. Converting the two
# columns at once avoids a Python-level call per row.
influence_zip = (
    influence[["influencer_id", "follower_id"]]
    .astype(str)
    .to_numpy()
    .tolist()
)

In [16]:
len(artists_lst) ** 2

30547729

In [17]:
len(influence_zip) * 2

85540

In [25]:
# Build aligned lists of feature sequences: entry i of both lists belongs to the
# same influencer/follower pair. Pairs are kept only when both artists have songs
# in `music`.
influencer_feature1 = []
follower_feature1 = []
niche_cnt = 0  # pairs where at least one side came back with flag True
cnt = 0        # pairs visited so far; drives the progress print only
known_artists = set(artists_lst)  # hashed lookup instead of scanning the list twice per pair
for pair in influence_zip:
    influencer_id = str(pair[0])
    follower_id = str(pair[1])
    if influencer_id in known_artists and follower_id in known_artists:
        # On any failure, report the offending id and stop; nothing is appended for
        # the failing pair, so the two lists stay the same length.
        try:
            influencer_input, flag1 = niche_time_gen(influencer_id)
        except Exception as e:
            print("influencer", influencer_id, e)
            break
        try:
            follow_input, flag2 = niche_time_gen(follower_id)
        except Exception as e:
            print("follower", follower_id, e)
            break
        influencer_feature1.append(influencer_input)
        follower_feature1.append(follow_input)
        if flag1 or flag2:
            niche_cnt += 1
    if cnt % 100 == 0:
        print(cnt, influencer_id, follower_id)
    cnt += 1

0 759491 74
200 7689 5307
300 562304 5953
400 64145 7448
500 65590 8549
600 211758 9680
800 617196 12972
900 815537 13508
1000 7689 15158
1100 359966 16281
1200 81796 18137
1300 3595 19392
1500 592954 21008
1600 98465 22124
1700 213212 24944
1800 193320 26367
1900 769444 27221
2000 307461 29791
2100 840402 30431
2200 384920 31606
2300 582313 32971
2400 774588 34249
2500 888730 35464
2600 267354 36934
2700 344275 38466
2800 422947 39013
2900 42234 40501
3000 326249 42589
3100 5882 44764
3200 152312 45475
3300 180228 46699
3400 809266 47033
3500 894465 48566
3600 59156 50034
3700 843607 51376
3800 528221 52900
3900 316834 57280
4000 61172 58934
4100 492037 61041
4200 57209 61637
4300 307461 64149
4400 848784 66429
4500 798248 68551
4600 198783 69986
4700 293490 71636
4800 369270 73644
4900 596126 75100
5000 404434 76057
5100 246611 78191
5200 574772 80340
5300 245489 81017
5400 224186 81739
5500 607448 83095
5600 696015 84446
5700 939567 85776
5800 793988 87597
5900 894465 89021
6000 772

In [26]:
len(influencer_feature1)

41214

In [27]:
len(follower_feature1)

41214

In [28]:
# Turn both lists of per-pair sequences into NumPy arrays.
influencer_feature1, follower_feature1 = (
    np.array(influencer_feature1),
    np.array(follower_feature1),
)

In [29]:
import pickle

In [None]:
# Persist both aligned arrays together as a two-element list:
# [influencer sequences, follower sequences].
musician_sequential_1 = [influencer_feature1, follower_feature1]
with open(r'variables/musician_sequential_1.pkl', 'wb') as pkl_file:
    pickle.dump(musician_sequential_1, pkl_file)

#### generate genre features

In [31]:
music["genre"].unique()

array(['niche1', 'niche2', 'pop_rock', 'r_b'], dtype=object)

In [32]:
genre.head()

Unnamed: 0,year,genre,danceability,energy,valence,tempo,loudness,mode,key,acousticness,...,liveness,speechiness,explicit,duration_ms,popularity,musician_cnt,niche1,niche2,pop_rock,r_b
0,1944,niche1,-0.532216,-0.939376,-0.17387,-0.098906,-0.387239,0.267026,-0.032307,1.53949,...,0.236877,0.013649,-0.196059,-0.197228,-1.74093,-0.149647,1.925991,-0.42794,-1.046032,-0.352258
1,1945,niche1,-0.108474,-1.060723,-0.023994,-0.302426,-0.760254,0.141916,-0.080835,1.423408,...,-0.056263,0.008216,-0.196059,-0.480916,-1.776543,-0.067477,1.925991,-0.42794,-1.046032,-0.352258
2,1946,niche1,-0.821757,-1.122781,-0.524502,-0.283064,-0.891328,-0.066062,-0.044222,1.44236,...,0.169921,0.079817,-0.196059,-0.21517,-1.825818,0.092606,1.925991,-0.42794,-1.046032,-0.352258
3,1947,niche1,-0.855498,-1.335411,-0.662578,-0.272675,-1.427341,0.070639,-0.01363,1.421146,...,-0.185413,-0.168325,-0.196059,0.328632,-2.050102,-0.078622,1.925991,-0.42794,-1.046032,-0.352258
4,1948,niche1,-0.762013,-1.236762,-0.675484,-0.322836,-0.955165,0.021713,-0.040363,1.494429,...,0.144546,0.114292,-0.196059,0.025316,-1.914973,-0.129259,1.925991,-0.42794,-1.046032,-0.352258


In [33]:
# Collect one (20 x n_features) matrix per genre from the last 20 rows of `music`
# that carry that genre.
# The loop variable is deliberately NOT called `genre`: that name is bound to the
# per-year genre table read by niche_time_gen, and rebinding it to a string here
# would break any later re-run of the cells that use that table.
# NOTE(review): rows are taken from the song-level `music` table, not from the
# per-year `genre` table displayed just above. Confirm this is intended.
genre_features = []
for genre_name in music["genre"].unique():
    genre_feature_df = music.loc[music["genre"] == genre_name, :].tail(20)
    if len(genre_feature_df) < 20:
        # Too few rows for this genre: report it and stop collecting.
        print(genre_name)
        break
    genre_features.append(genre_feature_df[feature_columns].values)

In [38]:
music["genre"].unique()

array(['niche1', 'niche2', 'pop_rock', 'r_b'], dtype=object)

In [35]:
# Stack the per-genre matrices into one array; the next cell reports its shape
# as (4, 20, 19).
genre_features = np.array(genre_features)

In [36]:
genre_features.shape

(4, 20, 19)

In [37]:
# Save the stacked per-genre feature array under the name used for the pickle file.
musician_genre_prediction_genre = genre_features
with open(r'variables/musician_genre_prediction_genre.pkl', 'wb') as pkl_file:
    pickle.dump(musician_genre_prediction_genre, pkl_file)