In [71]:
import numpy as np
import pandas as pd
import os

In [72]:
# Absolute path of the sibling "interest" directory, resolved from the parent
# of the current working directory.
folder = os.path.join(os.path.abspath(os.pardir), "interest")

In [73]:
# `music` is the already rearranged/cleaned music table, not the original raw
# music-feature file.
music = pd.read_csv(r"rearrange_data/music_features.csv")
# Influence relations are read from the "interest" folder resolved above.
influence = pd.read_csv(os.path.join(folder, "influence_data.csv"))

#### extract musician and music list

In [74]:
# Parse the musician ids credited on every track.
# `artists_id` is matched against a bracketed id list such as "[123, 456]".
# The ids are pulled out with `findall` rather than split on the literal ", ",
# so the parsing agrees with the `,\s*` separator the pattern itself accepts.
# Rows that do not match the pattern stay NaN in `music_artists_lst`.
music_artists_lst = (
    music["artists_id"]
    .str.extract(r"\[(\d+(?:,\s*\d+)*)\]")[0]
    .str.findall(r"\d+")
)

# Every musician that appears in the data set, as id strings. NaN rows are
# skipped instead of crashing the flattening, and the result is sorted so the
# order (which feeds the random pair sampling) does not change between runs.
artists_lst = sorted({
    artist
    for track_artists in music_artists_lst
    if isinstance(track_artists, list)
    for artist in track_artists
})

In [75]:
# Build the dictionary {musician_id: [music_id, ...]}.
# The parsed artist lists are paired positionally with the `music_id` column,
# which avoids materialising a whole row of `music` for every track.
musician_creation = {}
for artists_id, music_id in zip(music_artists_lst, music["music_id"]):
    if not isinstance(artists_id, list):
        # Track whose artist list could not be parsed (NaN): nothing to record.
        continue
    for artist in artists_id:
        musician_creation.setdefault(artist, []).append(music_id)

In [76]:
len(musician_creation)  # number of distinct musicians with at least one track (the recorded output is 5527, not 5000)

5527

#### add year info

In [77]:
# Parse the release dates and derive the release year of every track.
# `pd.to_datetime` is used because the unit-less `astype("datetime64")` cast is
# rejected by pandas >= 2.0.
music["release_date"] = pd.to_datetime(music["release_date"])
music["year"] = music["release_date"].dt.year

#### get the genre information

In [78]:
# Inspect the available columns (now including the derived `year`).
music.columns

Index(['danceability', 'energy', 'valence', 'tempo', 'loudness', 'mode', 'key',
       'acousticness', 'instrumentalness', 'liveness', 'speechiness',
       'explicit', 'duration_ms', 'popularity', 'musician_cnt', 'niche1',
       'niche2', 'pop_rock', 'r_b', 'artists_id', 'release_date', 'music_id',
       'genre', 'year'],
      dtype='object')

In [81]:
# Columns used as model features: every column of `music` except
# artists_id, release_date, music_id, genre and year.
feature_columns = [
    'danceability',
    'energy',
    'valence',
    'tempo',
    'loudness',
    'mode',
    'key',
    'acousticness',
    'instrumentalness',
    'liveness',
    'speechiness',
    'explicit',
    'duration_ms',
    'popularity',
    'musician_cnt',
    'niche1',
    'niche2',
    'pop_rock',
    'r_b',
]

In [82]:
# Mean of every feature per (year, genre); used later to fill in the missing
# history of musicians with few tracks.
genre = (
    music.groupby(by=["year", "genre"])[feature_columns]
    .mean()
    .reset_index()
)

In [83]:
# Order the yearly genre profiles by genre, then chronologically, and renumber
# the rows so that positional order and index labels agree.
genre = (
    genre.sort_values(by=["genre", "year"], ascending=True)
    .reset_index(drop=True)
)

In [87]:
# Separate copy of the yearly genre table whose feature values are scaled to
# 80%; it serves as the starting point when missing history has to be filled.
genre_development_diminish = genre.copy()
genre_development_diminish[feature_columns] = genre[feature_columns].mul(0.8)

In [88]:
# Preview the first rows of the scaled genre table.
genre_development_diminish.head()

Unnamed: 0,year,genre,danceability,energy,valence,tempo,loudness,mode,key,acousticness,...,liveness,speechiness,explicit,duration_ms,popularity,musician_cnt,niche1,niche2,pop_rock,r_b
0,1944,niche1,-0.425773,-0.751501,-0.139096,-0.079124,-0.309791,0.21362,-0.025846,1.231592,...,0.189502,0.010919,-0.156847,-0.157783,-1.392744,-0.119718,1.540793,-0.342352,-0.836825,-0.281806
1,1945,niche1,-0.086779,-0.848578,-0.019196,-0.241941,-0.608203,0.113533,-0.064668,1.138727,...,-0.045011,0.006573,-0.156847,-0.384733,-1.421234,-0.053981,1.540793,-0.342352,-0.836825,-0.281806
2,1946,niche1,-0.657405,-0.898225,-0.419601,-0.226451,-0.713062,-0.05285,-0.035378,1.153888,...,0.135937,0.063853,-0.156847,-0.172136,-1.460655,0.074085,1.540793,-0.342352,-0.836825,-0.281806
3,1947,niche1,-0.684399,-1.068329,-0.530062,-0.21814,-1.141873,0.056511,-0.010904,1.136917,...,-0.148331,-0.13466,-0.156847,0.262906,-1.640082,-0.062897,1.540793,-0.342352,-0.836825,-0.281806
4,1948,niche1,-0.60961,-0.98941,-0.540387,-0.258269,-0.764132,0.01737,-0.03229,1.195543,...,0.115637,0.091434,-0.156847,0.020253,-1.531979,-0.103407,1.540793,-0.342352,-0.836825,-0.281806


In [152]:
# Builds the fixed-length model input for one musician id.
def niche_time_gen(artist_id, seq_len=20):
    """Return a ``(seq_len, len(feature_columns))`` feature matrix for a musician.

    Reads the notebook-level objects ``musician_creation``, ``music``, ``genre``,
    ``genre_development_diminish`` and ``feature_columns``.

    Parameters
    ----------
    artist_id : str
        Key of ``musician_creation``.
    seq_len : int, default 20
        Number of rows of the returned matrix.

    Returns
    -------
    (numpy.ndarray, bool)
        The feature matrix and a flag. The flag is ``False`` when the musician
        has more than ``seq_len`` tracks (a random run of ``seq_len``
        consecutive rows is returned) and ``True`` otherwise (all tracks are
        returned, preceded by genre-level rows when fewer than ``seq_len``).
        Both branches return a plain ndarray so callers can stack the results.

    Raises
    ------
    KeyError
        If ``artist_id`` is not a key of ``musician_creation``.
    ValueError
        If padding is required but ``genre`` has no row for the musician's genre.
    """
    feature_cnt = len(feature_columns)

    # All rows of `music` that belong to this musician.
    music_id = musician_creation[artist_id]
    artist_pieces = music.loc[music["music_id"].isin(music_id), :]
    max_idx = len(artist_pieces)

    if max_idx > seq_len:
        # More tracks than needed: take a random window of consecutive rows
        # (in the row order of `music`).
        selection = np.random.randint(0, max_idx - seq_len + 1)
        window = artist_pieces.iloc[selection:selection + seq_len][feature_columns]
        return window.to_numpy(), False

    # Not enough tracks: pad in front with the yearly profile of the musician's
    # genre for the years before the musician's first release.
    start_year = artist_pieces["year"].min()
    musician_genre = artist_pieces["genre"].values[0]
    same_genre = genre.loc[genre["genre"] == musician_genre, :]
    related_genre_development = same_genre.loc[same_genre["year"] < start_year, :]
    if len(related_genre_development) == 0:
        # The musician predates the genre records: fall back to the first
        # year of that genre present in `genre`.
        related_genre_development = same_genre.head(1)

    miss = seq_len - max_idx  # rows still to be filled
    # Keep the most recent `miss` rows (relies on `genre` being sorted by year).
    related_genre_development = related_genre_development.tail(miss)

    layers = []
    shortfall = miss - len(related_genre_development)
    if shortfall > 0:
        # The genre history is too short as well: extend it backwards from its
        # earliest available row, shrinking the values by 80% per step.
        if len(related_genre_development) == 0:
            raise ValueError(
                "no genre history available to pad musician %r (genre %r)"
                % (artist_id, musician_genre)
            )
        baseline_idx = related_genre_development.index[0]
        baseline = genre_development_diminish.loc[baseline_idx, feature_columns].to_numpy(dtype=float)
        diminish_rate = 0.8
        empty = np.zeros((shortfall, feature_cnt))
        # NOTE(review): `genre_development_diminish` is built elsewhere in this
        # notebook as `genre * 0.8`, and the rate is applied again before the
        # first row is stored, so the row closest to the real data equals
        # 0.64 x the genre baseline. Kept as is; confirm this is intended.
        for i in range(shortfall - 1, -1, -1):
            baseline = baseline * diminish_rate
            empty[i, :] = baseline
        layers.append(empty)

    layers.append(related_genre_development[feature_columns].to_numpy())
    layers.append(artist_pieces[feature_columns].to_numpy())
    # Oldest (synthetic) rows first, then genre history, then the musician's tracks.
    return np.concatenate(layers, axis=0), True

#### find influence relation

In [153]:
# Every explicitly recorded influence relation, as [influencer_id, follower_id]
# lists of id strings. The two id columns are read directly (instead of a
# row-wise `apply`), so each id keeps its own column dtype when converted to
# text and is not upcast together with the other columns of the row.
influence_zip = [
    [str(influencer), str(follower)]
    for influencer, follower in zip(influence["influencer_id"], influence["follower_id"])
]

In [154]:
# Number of ordered musician pairs that can be formed, self-pairs included.
len(artists_lst) ** 2

30547729

In [155]:
# Twice the number of recorded influence pairs.
# NOTE(review): presumably compared with the pair-space size to judge how many
# negative samples to draw — confirm.
len(influence_zip) * 2

85540

#### generate the input data with target variable 0 first (50,000 candidate pairs are sampled)

In [156]:
# Draw random candidate pairs from all musicians (sampling with replacement);
# each row of the result is one two-element pair.
sample_lst = np.random.choice(artists_lst, size=(50000, 2), replace=True).tolist()

In [158]:
influencer_feature0 = []
follower_feature0 = []
niche_cnt = 0
cnt = 0

# Hash-based lookups built once, so that each candidate is checked in constant
# time instead of scanning the full artist list and the full pair list.
known_artists = set(artists_lst)
known_influences = {tuple(known_pair) for known_pair in influence_zip}

# Build the input data whose target variable is 0: sampled pairs that are not
# self-pairs and have no recorded influence relation.
for pair in sample_lst:
    influencer_id = str(pair[0])
    follower_id = str(pair[1])
    is_negative_candidate = (
        influencer_id in known_artists
        and follower_id in known_artists
        and influencer_id != follower_id
        and (influencer_id, follower_id) not in known_influences
    )
    if is_negative_candidate:
        current_id = influencer_id  # id reported if feature generation fails
        try:
            influencer_input, flag1 = niche_time_gen(influencer_id)
            current_id = follower_id
            follow_input, flag2 = niche_time_gen(follower_id)
        except Exception as e:
            # The loop stops at the first failure: the lists then hold only
            # the pairs processed so far.
            print(current_id, e)
            break
        influencer_feature0.append(influencer_input)
        follower_feature0.append(follow_input)
        if flag1 or flag2:
            # At least one side of the pair has too few tracks for a random window.
            niche_cnt += 1
    if cnt % 100 == 0:
        print(cnt, influencer_id, follower_id)
    cnt += 1

0 28619 840402
100 862855 108265
200 37197 607202
300 144349 91650
400 150530 80649
500 11210 799081
600 186440 44703
700 753094 372986
800 891907 1352264
900 572105 193160
1000 227487 96613
1100 44764 169124
1200 954964 791480
1300 26023 926085
1400 633079 47789
1500 286572 542549
1600 818627 110454
1700 817011 567769
1800 26520 51376
1900 92107 451296
2000 517074 36312
2100 244073 143781
2200 306851 318085
2300 564333 836103
2400 403184 149515
2500 686577 222974
2600 509223 942529
2700 996127 951374
2800 48061 338593
2900 796045 36468
3000 207360 20319
3100 835085 500992
3200 918140 7031
3300 823482 165830
3400 933809 147245
3500 843607 543169
3600 308742 593129
3700 91312 726942
3800 2368302 308762
3900 818593 42993
4000 833782 179191
4100 866417 391933
4200 141838 988655
4300 2795896 466759
4400 390743 747445
4500 30925 585210
4600 757342 812741
4700 127949 11664
4800 101776 753104
4900 184451 295415
5000 815236 762885
5100 196690 163420
5200 45941 524634
5300 403784 274321
5400 74

42000 2128228 118493
42100 486775 669015
42200 201667 208997
42300 231645 752770
42400 204968 749743
42500 6860 814871
42600 32071 351733
42700 57636 75308
42800 161351 620759
42900 105354 27102
43000 622277 934350
43100 40922 349180
43200 149840 300024
43300 825883 500889
43400 290775 694325
43500 206658 166025
43600 647670 903609
43700 142652 147468
43800 164093 756291
43900 627250 944105
44000 40922 3475903
44100 108124 37730
44200 723868 223054
44300 757251 729857
44400 614455 14651
44500 176360 3276902
44600 216586 755745
44700 3045620 841881
44800 591048 274637
44900 117001 359062
45000 36327 219203
45100 923705 21492
45200 251210 592954
45300 495861 189848
45400 48061 493848
45500 765302 247824
45600 657041 665824
45700 448207 639621
45800 102750 7731
45900 38321 104217
46000 204714 3044549
46100 486395 147771
46200 939187 673883
46300 792348 933809
46400 61480 768737
46500 586319 419102
46600 569914 389213
46700 933898 197532
46800 631078 117490
46900 127337 31765
47000 489876 

In [172]:
import pickle

In [169]:
# Convert the collected per-pair inputs to numpy arrays and save them.
# Assumes every collected element has the same shape — TODO confirm.
influencer_feature0 = np.array(influencer_feature0)
follower_feature0 = np.array(follower_feature0)

musician_sequential_0 = [influencer_feature0, follower_feature0]
# Create the output folder first; open() fails when it does not exist.
os.makedirs("variables", exist_ok=True)
with open(r'variables/musician_sequential_0.pkl', 'wb') as save_data:
    pickle.dump(musician_sequential_0, save_data)

#### generate the input data with target variable 1

In [None]:
influencer_feature1 = []
follower_feature1 = []
niche_cnt = 0
cnt = 0

# Set built once so that each membership test is constant time instead of a
# scan of the full artist list.
known_artists = set(artists_lst)

# Build the input data whose target variable is 1: every recorded influence
# pair whose two musicians both appear in the music data.
for pair in influence_zip:
    influencer_id = str(pair[0])
    follower_id = str(pair[1])
    if influencer_id in known_artists and follower_id in known_artists:
        # Role and id reported if feature generation fails.
        role, current_id = "influencer", influencer_id
        try:
            influencer_input, flag1 = niche_time_gen(influencer_id)
            role, current_id = "follower", follower_id
            follow_input, flag2 = niche_time_gen(follower_id)
        except Exception as e:
            # The loop stops at the first failure: the lists then hold only
            # the pairs processed so far.
            print(role, current_id, e)
            break
        influencer_feature1.append(influencer_input)
        follower_feature1.append(follow_input)
        if flag1 or flag2:
            # At least one side of the pair has too few tracks for a random window.
            niche_cnt += 1
    if cnt % 100 == 0:
        print(cnt, influencer_id, follower_id)
    cnt += 1

In [None]:
# Convert the collected per-pair inputs to numpy arrays and save them.
# Assumes every collected element has the same shape — TODO confirm.
influencer_feature1 = np.array(influencer_feature1)
follower_feature1 = np.array(follower_feature1)

musician_sequential_1 = [influencer_feature1, follower_feature1]
# Create the output folder first; open() fails when it does not exist.
os.makedirs("variables", exist_ok=True)
with open(r'variables/musician_sequential_1.pkl', 'wb') as save_data:
    pickle.dump(musician_sequential_1, save_data)