In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Input CSVs live in "<parent of the current working directory>/interest".
parent_dir = os.path.abspath("..")
folder = os.path.join(parent_dir, "interest")

In [3]:
# Read the four source tables from `folder`, in the same order as the names on the left.
artist, year, music, influence = (
    pd.read_csv(os.path.join(folder, csv_name))
    for csv_name in (
        "data_by_artist.csv",
        "data_by_year.csv",
        "full_music_data.csv",
        "influence_data.csv",
    )
)

#### arrange music data

In [4]:
# preprocessing
# Fix data types.
# Tag every song with its row position in the raw file before sorting, so a row
# stays identifiable after reordering.
music["music_id"] = range(len(music))
# A unit-less "datetime64" dtype is rejected by pandas >= 2.0 (TypeError); spell out
# the nanosecond unit, which is what older versions used implicitly.
music["release_date"] = music["release_date"].astype("datetime64[ns]")
# Order chronologically and renumber the rows 0..n-1.
music = music.sort_values(by="release_date", ascending=True)
music = music.reset_index(drop=True)

In [5]:
# Not needed for the presentation or the written report.
# The raw music data carries no genre column; only the influence table records each
# musician's main genre, so every song has to be matched to a genre through its artists.
# mapping music with genre
lookup1 = influence[["influencer_id", "influencer_main_genre"]].rename(
    columns={"influencer_id": "artists_id", "influencer_main_genre": "genre"}
)
lookup2 = influence[["follower_id", "follower_main_genre"]].rename(
    columns={"follower_id": "artists_id", "follower_main_genre": "genre"}
)

# One row per artist: first() keeps the first non-null genre seen for each id, and
# influencer rows come before follower rows in the concatenation.
lookup = pd.concat([lookup1, lookup2])
lookup = lookup.groupby("artists_id").first().reset_index()

# artist id -> main genre
lookup_dict = lookup.set_index('artists_id')['genre'].to_dict()

# Parse "[id1, id2, ...]" strings into lists of id strings. The capture pattern
# tolerates any whitespace after a comma, so split on that same delimiter instead
# of the literal ", " (which would leave "1,2" unsplit).
music_artists_lst = (
    music["artists_id"]
    .str.extract(r"\[(\d+(?:,\s*\d+)*)\]")[0]
    .str.split(r",\s*")
)

def artists_to_genre(lst, genre_by_artist=None):
    """Return the most common genre among the artists of one song.

    Parameters
    ----------
    lst : list of str
        Artist ids of a single song, each convertible with ``int``.
    genre_by_artist : dict, optional
        Mapping of integer artist id to genre. When omitted, the notebook-level
        ``lookup_dict`` is used.

    Returns
    -------
    str
        The genre with the most votes. Artists missing from the mapping vote
        "Unknown". Ties go to the genre that appears first in ``lst``, so the
        result is the same on every run. "Unknown" is also returned when ``lst``
        is not a non-empty list (e.g. NaN from a failed id parse).
    """
    if not isinstance(lst, (list, tuple)) or not lst:
        return "Unknown"

    mapping = lookup_dict if genre_by_artist is None else genre_by_artist

    # dicts keep insertion order and max() returns the first maximal key, which
    # makes the tie-break deterministic (iterating a set of strings is not).
    votes = {}
    for artist_id in lst:
        genre = mapping.get(int(artist_id), "Unknown")
        votes[genre] = votes.get(genre, 0) + 1
    return max(votes, key=votes.get)

# Label each song with the majority genre of its artists.
music["genre"] = music_artists_lst.apply(artists_to_genre)

In [6]:
# Keep only songs whose genre could be resolved (drop the "Unknown" label).
music = music[music["genre"].ne("Unknown")]

In [7]:
# Drop music from 1943 and earlier (too few records in those years).
# The printout shows the five years with the fewest songs before filtering.
print(music["year"].value_counts().tail())
music = music[music["year"].gt(1943)]

1925    3
1921    2
1924    2
1926    2
1929    1
Name: year, dtype: int64


In [8]:
# Renumber the rows 0..n-1 and discard the previous index.
music = music.reset_index(drop = True)

#### group by niche categories
#### 按照correlation coefficient（相关系数）来合并genre
##### 最终，将"Comedy/Spoken", "Children's"去掉，'Avant-Garde',  'Classical',  'Jazz',  'Stage & Screen',  'Folk',  'Easy Listening',  'New Age',  'Vocal',  'International'合并为niche1，"Religious", "Electronic", "Country", "Latin", "New Age"合并为niche2

In [9]:
# Show the column names currently in `music`.
music.columns

Index(['artist_names', 'artists_id', 'danceability', 'energy', 'valence',
       'tempo', 'loudness', 'mode', 'key', 'acousticness', 'instrumentalness',
       'liveness', 'speechiness', 'explicit', 'duration_ms', 'popularity',
       'year', 'release_date', 'song_title (censored)', 'music_id', 'genre'],
      dtype='object')

In [10]:
# Columns of `music` used as features when comparing genres.
feature_columns = [
    'danceability', 'energy', 'valence', 'tempo', 'loudness', 'mode', 'key',
    'acousticness', 'instrumentalness', 'liveness', 'speechiness', 'explicit',
    'duration_ms', 'popularity',
]

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Standardize the features, then compare genres through the correlation between
# their mean standardized feature profiles.
tmp_scaler = StandardScaler()
music_tmp = pd.DataFrame(
    tmp_scaler.fit_transform(music[feature_columns]),
    columns=feature_columns,
    # Reuse music's index: the column copies below align on index labels, so a
    # default 0..n-1 index would mismatch rows whenever `music` is not numbered that way.
    index=music.index,
)
music_tmp["music_id"] = music["music_id"]
music_tmp["genre"] = music["genre"]

# Rows are features and columns are genres, so corr() gives a genre-by-genre matrix.
genre_attributes = music_tmp.groupby(["genre"])[feature_columns].mean().T
genre_corr = genre_attributes.corr()

# Number of songs per genre.
music["genre"].value_counts()

Pop/Rock          47350
R&B;              10567
Country            7566
Jazz               6840
Vocal              5379
Latin              4569
Classical          2090
International      1423
Folk               1397
Reggae             1299
Electronic         1223
Blues              1109
Stage & Screen      993
Easy Listening      847
Religious           716
Comedy/Spoken       330
New Age             253
Avant-Garde         214
Children's           59
Name: genre, dtype: int64

In [13]:
# Five highest correlations in the Children's row of genre_corr (the genre itself included).
genre_corr.loc["Children's", :].sort_values(ascending = False).head()

genre
Children's       1.000000
Blues            0.824063
Country          0.704647
Folk             0.596422
International    0.582325
Name: Children's, dtype: float64

In [14]:
# Ten highest correlations in the Avant-Garde row of genre_corr (the genre itself included).
genre_corr.loc["Avant-Garde", :].sort_values(ascending = False).head(10)

genre
Avant-Garde       1.000000
Classical         0.920723
Jazz              0.879857
Stage & Screen    0.844490
Folk              0.844168
Easy Listening    0.829934
New Age           0.829531
Vocal             0.777213
International     0.667144
Blues             0.497891
Name: Avant-Garde, dtype: float64

In [15]:
# First merged group: the nine genres ranking highest in Avant-Garde's correlation
# row (Avant-Garde itself included).
avant_garde_corr = genre_corr.loc["Avant-Garde", :].sort_values(ascending=False)
niche_genre1 = avant_garde_corr.index[:9].tolist()

In [16]:
# Display the genres selected for the first merged group.
niche_genre1

['Avant-Garde',
 'Classical',
 'Jazz',
 'Stage & Screen',
 'Folk',
 'Easy Listening',
 'New Age',
 'Vocal',
 'International']

In [103]:
# Relabel every genre listed in niche_genre1 as "niche1"; other labels are kept.
music["genre"] = music["genre"].mask(music["genre"].isin(niche_genre1), "niche1")

In [104]:
# Re-standardize the features and recompute the genre-to-genre correlation of
# mean standardized feature profiles for the current genre labels.
tmp_scaler = StandardScaler()
music_tmp = pd.DataFrame(
    tmp_scaler.fit_transform(music[feature_columns]),
    columns=feature_columns,
    # Reuse music's index: the column copies below align on index labels, so a
    # default 0..n-1 index would mismatch rows whenever `music` is not numbered that way.
    index=music.index,
)
music_tmp["music_id"] = music["music_id"]
music_tmp["genre"] = music["genre"]

# Rows are features and columns are genres, so corr() gives a genre-by-genre matrix.
genre_attributes = music_tmp.groupby(["genre"])[feature_columns].mean().T
genre_corr = genre_attributes.corr()

# Number of songs per genre.
music["genre"].value_counts()

Pop/Rock         47721
niche1           19934
R&B;             10363
Country           7584
Latin             4744
Reggae            1329
Electronic        1236
Religious          715
Comedy/Spoken      326
New Age            252
Children's          56
Name: genre, dtype: int64

In [105]:
# All correlations in the Religious row of genre_corr, highest first.
genre_corr.loc["Religious", :].sort_values(ascending = False)

genre
Religious        1.000000
Pop/Rock         0.629851
Latin            0.174193
R&B;             0.128398
Electronic       0.066644
Country          0.037776
Reggae          -0.052267
Comedy/Spoken   -0.120106
Children's      -0.220047
New Age         -0.497711
niche1          -0.627856
Name: Religious, dtype: float64

In [106]:
# Genres merged into the second group, "niche2".
# NOTE(review): "New Age" also appears in the niche_genre1 listing displayed earlier
# in this notebook; if it was relabelled there, this entry has no effect. Confirm
# which group it is meant to join.
niche_genre2 = [
    "Religious",
    "Electronic",
    "Country",
    "Latin",
    "New Age",
]

In [107]:
# Relabel every genre listed in niche_genre2 as "niche2"; other labels are kept.
music["genre"] = music["genre"].mask(music["genre"].isin(niche_genre2), "niche2")

In [108]:
# Re-standardize the features and recompute the genre-to-genre correlation of
# mean standardized feature profiles for the current genre labels.
tmp_scaler = StandardScaler()
music_tmp = pd.DataFrame(
    tmp_scaler.fit_transform(music[feature_columns]),
    columns=feature_columns,
    # Reuse music's index: the column copies below align on index labels, so a
    # default 0..n-1 index would mismatch rows whenever `music` is not numbered that way.
    index=music.index,
)
music_tmp["music_id"] = music["music_id"]
music_tmp["genre"] = music["genre"]

# Rows are features and columns are genres, so corr() gives a genre-by-genre matrix.
genre_attributes = music_tmp.groupby(["genre"])[feature_columns].mean().T
genre_corr = genre_attributes.corr()

# Number of songs per genre.
music["genre"].value_counts()

Pop/Rock         47721
niche1           19934
niche2           14531
R&B;             10363
Reggae            1329
Comedy/Spoken      326
Children's          56
Name: genre, dtype: int64

In [109]:
# All correlations in the Reggae row of genre_corr, highest first.
genre_corr.loc["Reggae", :].sort_values(ascending = False)

genre
Reggae           1.000000
R&B;             0.762506
Children's       0.407293
Comedy/Spoken    0.369596
niche2           0.354136
Pop/Rock        -0.027313
niche1          -0.320169
Name: Reggae, dtype: float64

In [110]:
# Fold Reggae into Pop/Rock.
# NOTE(review): the correlation listing displayed for Reggae ranks R&B highest and
# Pop/Rock near zero; confirm Pop/Rock is the intended target.
music["genre"] = music["genre"].replace("Reggae", "Pop/Rock")

In [111]:
# Re-standardize the features and recompute the genre-to-genre correlation of
# mean standardized feature profiles for the current genre labels.
tmp_scaler = StandardScaler()
music_tmp = pd.DataFrame(
    tmp_scaler.fit_transform(music[feature_columns]),
    columns=feature_columns,
    # Reuse music's index: the column copies below align on index labels, so a
    # default 0..n-1 index would mismatch rows whenever `music` is not numbered that way.
    index=music.index,
)
music_tmp["music_id"] = music["music_id"]
music_tmp["genre"] = music["genre"]

# Rows are features and columns are genres, so corr() gives a genre-by-genre matrix.
genre_attributes = music_tmp.groupby(["genre"])[feature_columns].mean().T
genre_corr = genre_attributes.corr()

# Number of songs per genre.
music["genre"].value_counts()

Pop/Rock         49050
niche1           19934
niche2           14531
R&B;             10363
Comedy/Spoken      326
Children's          56
Name: genre, dtype: int64

In [112]:
# Remove the Comedy/Spoken and Children's songs entirely, then renumber the rows.
music = music[~music["genre"].isin(["Comedy/Spoken", "Children's"])].reset_index(drop=True)

In [113]:
# Number of songs per remaining genre.
music["genre"].value_counts()

Pop/Rock    49050
niche1      19934
niche2      14531
R&B;        10363
Name: genre, dtype: int64

#### add # of musician's information，增加每首歌有几位音乐家创作的feature

In [114]:
# Parse "[id1, id2, ...]" strings into lists of id strings. The capture pattern
# tolerates any whitespace after a comma, so split on that same delimiter instead
# of the literal ", " (which would leave "1,2" as a single entry and undercount artists).
music_artists_lst = (
    music["artists_id"]
    .str.extract(r"\[(\d+(?:,\s*\d+)*)\]")[0]
    .str.split(r",\s*")
)
music_artists_lst

0         [792507]
1         [259529]
2         [792507]
3         [259529]
4         [792507]
           ...    
93873    [3006966]
93874     [312004]
93875     [312004]
93876     [994823]
93877     [303506]
Name: 0, Length: 93878, dtype: object

In [115]:
# Number of credited artists per song = length of its parsed id list.
music["musician_cnt"] = music_artists_lst.apply(len)

#### arrange the data

In [116]:
# Preview the first five rows of `music`.
music.head()

Unnamed: 0,artist_names,artists_id,danceability,energy,valence,tempo,loudness,mode,key,acousticness,...,speechiness,explicit,duration_ms,popularity,year,release_date,song_title (censored),music_id,genre,musician_cnt
0,['Frank Sinatra'],[792507],0.197,0.0546,0.1,90.15,-22.411,1,1,0.95,...,0.0346,0,186173,53,1944,1944-01-01,Put Your Dreams Away (For Another Day),34075,niche1,1
1,['Lester Young'],[259529],0.392,0.279,0.65,170.639,-13.507,1,8,0.884,...,0.0597,0,143173,3,1944,1944-01-01,***** 'N' Bell's - Take 3,53744,niche1,1
2,['Frank Sinatra'],[792507],0.561,0.335,0.59,126.974,-11.093,0,9,0.84,...,0.0499,0,163000,55,1944,1944-01-01,******** ***** (Is The Loneliest ***** In The ...,34074,niche1,1
3,['Lester Young'],[259529],0.488,0.308,0.798,185.988,-14.119,1,8,0.946,...,0.0674,0,174427,2,1944,1944-01-01,Indiana - Take *,53742,niche1,1
4,['Frank Sinatra'],[792507],0.169,0.147,0.152,174.301,-16.491,1,2,0.906,...,0.0336,0,218093,34,1944,1944-01-01,***** and Day,34076,niche1,1


In [117]:
# Show the column names currently in `music`.
music.columns

Index(['artist_names', 'artists_id', 'danceability', 'energy', 'valence',
       'tempo', 'loudness', 'mode', 'key', 'acousticness', 'instrumentalness',
       'liveness', 'speechiness', 'explicit', 'duration_ms', 'popularity',
       'year', 'release_date', 'song_title (censored)', 'music_id', 'genre',
       'musician_cnt'],
      dtype='object')

In [118]:
# Discard the artist-name, year and song-title columns, then renumber the rows.
music = (
    music
    .drop(columns=['artist_names', 'year', "song_title (censored)"])
    .reset_index(drop=True)
)

#### Standardize music features，更改字符，通过方差均值进行标准化

In [119]:
# Show the column names currently in `music`.
music.columns

Index(['artists_id', 'danceability', 'energy', 'valence', 'tempo', 'loudness',
       'mode', 'key', 'acousticness', 'instrumentalness', 'liveness',
       'speechiness', 'explicit', 'duration_ms', 'popularity', 'release_date',
       'music_id', 'genre', 'musician_cnt'],
      dtype='object')

In [120]:
# Distinct genre labels present in `music`.
music["genre"].unique()

array(['niche1', 'niche2', 'Pop/Rock', 'R&B;'], dtype=object)

In [121]:
# Genre labels to rename; the new names are plain lowercase identifiers.
category_replace = {
    "Pop/Rock": "pop_rock",
    "R&B;": "r_b",
}

In [122]:
# Apply the renames; labels without an entry in category_replace stay as they are.
music["genre"] = music["genre"].map(lambda label: category_replace.get(label, label))

In [123]:
# add dummy variables
# One-hot encode the genre label of `music`. `music_feature` is only created in a
# later cell, so reading it here raises NameError on a top-to-bottom run.
genre_dummy = pd.get_dummies(music["genre"], drop_first=False)
# Append one indicator column per genre next to the existing columns.
music = pd.concat([music, genre_dummy], axis=1)

In [124]:
# Show the column names currently in `music`.
music.columns

Index(['artists_id', 'danceability', 'energy', 'valence', 'tempo', 'loudness',
       'mode', 'key', 'acousticness', 'instrumentalness', 'liveness',
       'speechiness', 'explicit', 'duration_ms', 'popularity', 'release_date',
       'music_id', 'genre', 'musician_cnt', 'niche1', 'niche2', 'pop_rock',
       'r_b'],
      dtype='object')

In [125]:
# Final feature set: the audio/metadata columns, the artist count and the genre indicators.
feature_columns = [
    'danceability', 'energy', 'valence', 'tempo', 'loudness',
    'mode', 'key', 'acousticness', 'instrumentalness', 'liveness',
    'speechiness', 'explicit', 'duration_ms', 'popularity',
    'musician_cnt',
    'niche1', 'niche2', 'pop_rock', 'r_b',
]

In [126]:
# Renumber rows, then standardize every feature column with StandardScaler.
# fit_transform returns a plain array; it is wrapped into a DataFrame afterwards.
music = music.reset_index(drop=True)
scalar = StandardScaler()
music_feature = scalar.fit_transform(music[feature_columns])

In [127]:
# Wrap the scaled array in a DataFrame labelled with the feature names.
music_feature = pd.DataFrame(music_feature, columns=feature_columns)

In [128]:
# Carry the unscaled identifier columns over from `music` (aligned on the row index).
music_feature = music_feature.assign(
    artists_id=music["artists_id"],
    release_date=music["release_date"],
    music_id=music["music_id"],
    genre=music["genre"],
)

In [134]:
# Order by release date, using music_id to break ties, then renumber the rows.
music_feature = (
    music_feature
    .sort_values(by=["release_date", "music_id"], ascending=True)
    .reset_index(drop=True)
)

In [135]:
# Preview the first five rows of the standardized feature table.
music_feature.head()

Unnamed: 0,danceability,energy,valence,tempo,loudness,mode,key,acousticness,instrumentalness,liveness,...,popularity,musician_cnt,niche1,niche2,pop_rock,r_b,artists_id,release_date,music_id,genre
0,-1.417448,-1.625664,-1.143229,0.588683,-0.929159,0.618804,0.516467,1.422265,1.328603,1.238703,...,-2.189136,-0.181069,1.925991,-0.42794,-1.046032,-0.352258,[26350],1944-01-01,3206,niche1
1,-0.233427,-1.082774,0.714122,-0.172814,-1.121847,0.618804,0.516467,1.528727,1.355216,0.313645,...,-2.129227,-0.181069,1.925991,-0.42794,-1.046032,-0.352258,[26350],1944-01-01,3227,niche1
2,-1.104572,-1.752975,-0.091898,0.290306,-1.246168,0.618804,-0.908235,1.476934,-0.445792,1.10887,...,-2.129227,-0.181069,1.925991,-0.42794,-1.046032,-0.352258,[26350],1944-01-01,3258,niche1
3,-2.184302,-1.914313,-1.927444,-0.167752,-2.948208,0.618804,-0.053414,1.560378,-0.220303,-0.67092,...,-2.129227,-0.181069,1.925991,-0.42794,-1.046032,-0.352258,[26350],1944-01-01,3259,niche1
4,-1.19046,-1.266286,-0.823936,-1.210862,-0.991625,0.618804,0.516467,1.465425,1.38183,-0.221915,...,-2.129227,-0.181069,1.925991,-0.42794,-1.046032,-0.352258,[26350],1944-01-01,3260,niche1


In [136]:
# Persist the standardized feature table. to_csv does not create missing parent
# folders, so make sure the output directory exists first.
os.makedirs("rearrange_data", exist_ok=True)
music_feature.to_csv(r"rearrange_data/music_features.csv", index=False)