# 推荐系统实现

## 根据用户数据提取歌曲特征

In [12]:
import pandas as pd
import seaborn as sns
import scipy.sparse as sp
from sklearn import ensemble
from sklearn.decomposition import TruncatedSVD
from matplotlib import pyplot as plt
import matplotlib
# Font properties pointing at the Source Han Sans SC font file under data/.
# NOTE(review): `matplotlib.font_manager` is reached as an attribute of the bare
# `matplotlib` package; this presumably works only because the pyplot import
# above loads that submodule as a side effect — confirm, or import it explicitly.
ZHFONT = matplotlib.font_manager.FontProperties(fname="data/SourceHanSansSC-Regular.otf")

# Read the two song metadata tables; they are kept as separate frames as well.
df_songs = pd.read_csv("data/songs.csv")
df_songs_extra = pd.read_csv("data/song_extra_info.csv")

# Inner-join train with songs and then with song_extra_info on song_id, so only
# rows whose song_id appears in all three tables survive.
df_train = (
    pd.read_csv("data/train.csv")
    .merge(df_songs, on="song_id", how="inner")
    .merge(df_songs_extra, on="song_id", how="inner")
)

In [13]:
# Replace every text (object-dtype) column with its integer category codes so
# that the column can take part in the correlation computation below.
text_columns = df_train.select_dtypes(include=['object']).columns
for column_name in text_columns:
    df_train[column_name] = df_train[column_name].astype('category').cat.codes

# Pairwise correlation between all columns of the encoded frame.
corr = df_train.corr()

In [14]:
# Build the sparse user x song matrix.
# One row per (msno, song_id) pair; play_count is the number of rows in
# df_train that share that pair.
df_train_grouped = (
    df_train
    .groupby(['msno', 'song_id'])
    .size()
    .reset_index(name='play_count')
)

# Category codes serve as the row (user) and column (song) coordinates.
user_codes = df_train_grouped['msno'].astype('category').cat.codes
song_codes = df_train_grouped['song_id'].astype('category').cat.codes
user_song_sparse_matrix = sp.coo_matrix(
    (df_train_grouped['play_count'], (user_codes, song_codes))
)

In [15]:
# Compute an SVD vector representation for every song.
K = 64  # number of SVD components kept per song
# Fix the seed: TruncatedSVD's default solver is randomized, so without a
# random_state the extracted features differ from run to run.
SVD_SEED = 42
svd = TruncatedSVD(n_components=K, random_state=SVD_SEED)

# Encode each id column once and reuse the result for both the matrix
# coordinates and the row labels of the feature table.
song_categorical = df_train_grouped['song_id'].astype('category')
user_categorical = df_train_grouped['msno'].astype('category')

# Sparse song x user matrix: rows are song codes, columns are user codes,
# values are play counts.
song_user_sparse_matrix = sp.coo_matrix(
    (df_train_grouped['play_count'],
     (song_categorical.cat.codes, user_categorical.cat.codes))
)

# Run the SVD; each row of the result is the K-dimensional vector of the song
# whose category code equals the row number.
song_user_svd = svd.fit_transform(song_user_sparse_matrix)
song_user_features = pd.DataFrame(
    song_user_svd,
    columns=[f'特征{i+1}' for i in range(K)],
    index=song_categorical.cat.categories,
)

In [16]:
# Attach song names and other metadata to the SVD features.
# Assign the integer song code positionally: row i of the feature table is the
# song with category code i. (Wrapping the range in a pd.Series would instead
# be aligned on the frame's index, which is only correct while that index
# happens to be exactly 0..n-1.)
song_user_features['song_id'] = range(len(song_user_features))

# Re-read the raw tables so the text columns are un-encoded again, and repeat
# the same two inner joins on song_id.
df_songs = pd.read_csv("data/songs.csv")
df_songs_extra = pd.read_csv("data/song_extra_info.csv")
df_train = (
    pd.read_csv("data/train.csv")
    .merge(df_songs, on="song_id", how="inner")
    .merge(df_songs_extra, on="song_id", how="inner")
)

# Encode only song_id so that it lines up with the codes in song_user_features.
# NOTE(review): this assumes the merged table holds the same set of song ids as
# the table the features were computed from — confirm if the inputs change.
df_train['song_id'] = df_train['song_id'].astype('category').cat.codes

# Keep one row per song (the first occurrence), then join the feature columns.
df_train = df_train.drop_duplicates(subset=['song_id'], keep='first')
df_train = df_train.merge(song_user_features, how='inner', on='song_id')
display(df_train.shape)
display(df_train.head(15))

(359755, 78)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,特征55,特征56,特征57,特征58,特征59,特征60,特征61,特征62,特征63,特征64
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,74631,explore,Explore,online-playlist,1,206471,359,Bastille,Dan Smith| Mark Crew,...,-0.669902,-0.814714,0.365887,0.625832,-0.414132,0.44932,0.09419,-0.198572,0.45848,0.207223
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,223347,my library,Local playlist more,local-playlist,1,284584,1259,Various Artists,,...,0.011652,0.003097,0.00318,-0.008135,-0.003253,-0.005914,-0.00894,-0.003045,-0.008815,0.000713
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,120686,my library,Local playlist more,local-playlist,1,225396,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,...,0.004484,-0.041385,0.011043,0.038386,0.028808,0.080269,0.02381,-0.040498,0.036063,0.043778
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,23688,my library,Local playlist more,local-playlist,1,255512,1019,Soundway,Kwadwo Donkoh,...,0.011652,0.003097,0.00318,-0.008135,-0.003253,-0.005914,-0.00894,-0.003045,-0.008815,0.000713
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,33284,explore,Explore,online-playlist,1,187802,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,...,-1.488168,-1.639779,0.392439,-0.101235,-1.530603,-0.733189,0.995997,1.201326,1.17895,-0.927256
5,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,30167,explore,Explore,online-playlist,1,247803,1259,Desiigner,Sidney Selby| Adnan Khan,...,0.17926,-0.061225,-0.984969,-0.833324,1.912671,1.751444,-1.647275,-0.325898,0.598098,0.804848
6,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,189910,my library,Local playlist more,local-playlist,1,229982,465,BIGBANG TAEYANG,TEDDY| DEE.P| Rebecca Johnson,...,-1.695626,2.074209,-1.928169,1.301798,-0.16856,-0.928602,0.61962,-1.626899,-1.952161,0.658397
7,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,221696,explore,Explore,online-playlist,1,181115,1011,Thomas Rhett,Thomas Rhett| Rhett Akins| Ben Hayslip,...,-0.881164,-1.28191,0.488264,-0.140746,-1.270513,-0.497853,0.805853,0.915915,1.048601,-0.717854
8,uHqAtShXTRXju5GE8ri3ITsVFepPf8jUoCF7ffNOuqE=,9173,my library,Local playlist more,local-library,1,278964,2022,OneRepublic,Ryan Tedder,...,0.08439,-0.061995,0.063706,0.125521,0.016351,0.019265,-0.004544,-0.111708,-0.098042,-0.018869
9,uHqAtShXTRXju5GE8ri3ITsVFepPf8jUoCF7ffNOuqE=,93766,my library,Local playlist more,local-library,1,257369,465,OneRepublic,Ryan Tedder,...,0.116966,0.299618,-0.085313,0.349382,-1.017769,0.148962,-0.386848,1.92102,-0.766945,-0.314502


In [17]:
# Keep only Chinese songs (language == 3).
# The drop is not in-place and uses errors='ignore' so the cell can be
# re-executed without a KeyError once the columns are already gone.
df_train = df_train.drop(
    columns=['msno', 'source_system_tab', 'source_screen_name', 'source_type'],
    errors='ignore',
)
# .copy() detaches the filtered rows from the parent frame, so later edits to
# df_train do not trigger SettingWithCopyWarning.
df_train = df_train.loc[df_train['language'] == 3].copy()
display(df_train.head(15))

Unnamed: 0,song_id,target,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,...,特征55,特征56,特征57,特征58,特征59,特征60,特征61,特征62,特征63,特征64
11,58069,1,224597,465,嚴爵 (Yen-j),嚴爵,嚴爵,3.0,輕輕 (Lightly),TWK231507907,...,0.045498,-0.837263,-0.724027,-0.471066,-0.747993,0.097226,-0.059129,0.314495,-0.029684,0.262232
12,326924,0,280084,465,林俊傑 (JJ Lin),林俊傑,林怡鳳,3.0,手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋,TWA531480006,...,-0.138259,-1.029645,2.250884,1.466172,0.088254,0.893221,1.043854,1.721537,-0.735036,-1.704976
13,177564,1,253492,458,周杰倫 (Jay Chou),周杰倫,周杰倫,3.0,給我一首歌的時間,TWK970801502,...,-2.752389,-1.09404,3.560038,-2.916467,4.571066,-2.103715,1.330434,-3.190201,3.036527,1.426327
14,116704,1,210364,465|458,林俊傑 (JJ Lin),JJ Lin,Lin Yi Feng,3.0,關鍵詞 (The Key),TWA531576504,...,-0.359743,0.855385,0.396391,0.848224,0.992473,-0.492213,3.211751,-1.888377,-1.527182,-0.217943
16,214152,1,248790,465,吳汶芳 (Fang Wu),吳汶芳,吳汶芳,3.0,孤獨的總和 (Accumulated Loneliness),TWA211328806,...,-1.390997,-1.829017,-3.270219,-0.912881,0.859131,0.205735,-1.217894,-1.219546,-0.006928,0.226888
17,321525,1,259877,458,aMEI (張惠妹),Wang Feng,,3.0,春天裡,TWUM71600101,...,1.573981,-0.967788,0.269891,-1.185992,0.183924,-0.884661,0.255855,-0.112575,0.436283,-1.05846
18,37437,0,265743,465,林俊傑 (JJ Lin),JJ Lin,Wang Ya Jun,3.0,因你而在 (You N Me),TWA531398014,...,-0.134129,-1.06024,-1.272638,1.289265,0.766787,1.523809,1.033244,0.80009,2.49418,0.90294
19,67806,1,241975,465,孫燕姿 (Yanzi Sun),Li Wei Song,Xiao Han,3.0,雨天 (雨天),TWA530617601,...,-1.211062,0.54752,0.082267,0.329495,-0.625898,-0.198381,-0.108539,-0.802532,0.838897,-1.555693
20,190425,1,189846,458,陳星翰 (Starr Chen),Starr Chen,,3.0,EGO-HOLIC戀我癖 (EGO-HOLIC),TWUM71600076,...,-1.329635,0.403698,3.941835,0.461805,-3.731353,0.322436,-1.190308,-0.032522,0.149215,2.531194
21,107310,1,247911,465,莫文蔚 (Karen Mok),Skot Suyama,李焯雄,3.0,愛死你,TWA470326002,...,-0.030639,0.516861,0.235568,-0.04496,0.388393,0.044496,-0.333293,0.364095,0.225322,-0.151769


In [20]:
# Save the per-song feature table to local disk.
# The drop is not in-place and uses errors='ignore': it does not mutate a
# filtered slice, and the cell can be re-run after 'target' is already removed.
df_train = df_train.drop(columns=['target'], errors='ignore')
display(df_train.shape)
# index=False: the row index is not written to the CSV.
df_train.to_csv('data/songs_features.csv', index=False)

(60866, 73)