В этом файле формируется матрица, содержащая информацию по тегам, лайкам и т. д.

In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, hstack, save_npz, load_npz
import re
import sys

# Directory prefix for every CSV this notebook reads and writes. It is joined
# to file names by plain string concatenation, so the trailing slash is required.
DATA_DIR = './drive/My Drive/Colab Notebooks/VK_internship/'

In [0]:
# Listening data; the columns used below are `user_id`, `artists` and `playtime`.
users_amount_df = pd.read_csv(DATA_DIR + "users_artists.csv").drop_duplicates()

# Hold out 10% of the artist ids. Only the train part is kept: later cells use
# it to decide which likes / playlists / tags are selected as features.
artists_id = users_amount_df["artists"].unique()
artists_id.sort()
# Seeded so that `train` is identical on every run of the notebook.
# NOTE(review): the seed has to match whatever split the downstream
# evaluation uses - confirm against that code.
train, test = train_test_split(artists_id, test_size=0.1, random_state=42)
del artists_id, test

users_amount_df_grpby = users_amount_df.groupby("artists")
# Artist ids are used directly as row indices, so every per-artist matrix
# needs `max_artist_idx + 1` rows.
max_artist_idx = np.max(users_amount_df["artists"])

# Total playtime per artist, stored as a single sparse column.
# (The original built this matrix twice; the first, shape-less construction
# was immediately overwritten and has been dropped.)
playtime_df = users_amount_df_grpby["playtime"].sum().reset_index()
playtime_col = csr_matrix(
    (playtime_df["playtime"], (playtime_df["artists"], [0] * playtime_df.shape[0])),
    shape=(max_artist_idx + 1, 1),
)
del playtime_df

# Number of distinct users listening to each artist.
# Fix: the original used `.sum()`, which added up the user ids themselves
# instead of counting the listeners.
users_amount_df = users_amount_df_grpby["user_id"].nunique().reset_index()
users_amount_col = csr_matrix(
    (users_amount_df["user_id"], (users_amount_df["artists"], [0] * users_amount_df.shape[0])),
    shape=(max_artist_idx + 1, 1),
)
del users_amount_df

In [0]:
# Adjacency matrix for likes: entry (a, b) is the number of users who liked
# both artist a and artist b, with b restricted to the most-liked train artists.
likes_df = pd.read_csv(DATA_DIR+"artists_user_likes.csv").drop_duplicates()
likes_df = likes_df[likes_df["artists"] <= max_artist_idx]

# Total number of likes collected by every artist.
likes_amount_for_artist_df = (
    likes_df.groupby("artists")["likes_amount"].sum().reset_index()
)

# Only the 5000 most-liked artists from the train subset become columns.
artists_col_idxs = (
    likes_amount_for_artist_df[likes_amount_for_artist_df["artists"].isin(train)]
    .sort_values("likes_amount", ascending=False)
    .iloc[:5000]["artists"]
    .values
)

# One row per distinct (user, artist) like.
likes_df = likes_df[["user_id", "artists"]].drop_duplicates()
# Likes that point at one of the selected column artists.
artists_col_idxs = likes_df[likes_df["artists"].isin(artists_col_idxs)]
# Self-join on the user: every pair (artist liked, selected artist liked) by
# the same user, then count the users behind each pair.
likes_df = (
    likes_df.merge(artists_col_idxs, on="user_id")
    .groupby(["artists_x", "artists_y"])
    .count()
    .reset_index()
)

likes_matrix = csr_matrix(
    (likes_df["user_id"], (likes_df["artists_x"], likes_df["artists_y"])),
    shape=(max_artist_idx + 1, max_artist_idx + 1),
)
del likes_df


In [4]:
# Reduce the likes adjacency matrix to 200 components per artist row.
svd = TruncatedSVD(random_state=42, n_components=200)
likes_matrix = svd.fit_transform(likes_matrix)

print("get likes")
# Persist the reduced matrix as text and release it from memory.
np.savetxt(f"{DATA_DIR}likes_matrix.csv", likes_matrix)
del likes_matrix

get likes


In [0]:
# Adjacency matrix for playlists: entry (a, b) is the number of selected
# playlists that contain both artist a and artist b.
playlists_df = pd.read_csv(DATA_DIR + "artists_in_playlists.csv").drop_duplicates()
playlists_df = playlists_df.loc[playlists_df["artists"] <= max_artist_idx]

# Number of train-subset artists in every playlist.
playlists_amount_for_artist_df = (
    playlists_df.loc[playlists_df["artists"].isin(train)]
    .groupby("playlist_id")["artists"]
    .count()
    .reset_index()
)

# Only the 5000 playlists with the most train artists are considered.
artists_col_idxs = playlists_amount_for_artist_df.sort_values("artists", ascending=False).iloc[:5000]
artists_col_idxs = artists_col_idxs["playlist_id"].values

# Fix: `artists_col_idxs` holds playlist ids, so it must be matched against
# the `playlist_id` column. The original compared it with `artists`, i.e.
# looked up artist ids in a list of playlist ids.
# NOTE(review): this follows the "top playlists" intent stated above; if the
# intent was instead "top artists by playlist count", the grouping is what
# needs to change - confirm.
playlists_df = playlists_df.loc[playlists_df["playlist_id"].isin(artists_col_idxs)]
# Self-join on the playlist: every ordered pair of artists sharing a selected
# playlist. Rows of other playlists cannot match, so filtering before the
# join gives the same pairs with a much smaller intermediate frame.
playlists_df = playlists_df.merge(playlists_df, on="playlist_id")
playlists_df = playlists_df.groupby(["artists_x", "artists_y"]).count().reset_index()

playlists_matrix = csr_matrix(
    (playlists_df["playlist_id"], (playlists_df["artists_x"], playlists_df["artists_y"])),
    shape=(max_artist_idx + 1, max_artist_idx + 1),
)
del playlists_df

In [6]:
# Reduce the playlist adjacency matrix to 200 components per artist row.
svd = TruncatedSVD(random_state=42, n_components=200)
playlists_matrix = svd.fit_transform(playlists_matrix)

print("get playlists")
# Persist the reduced matrix as text and release it from memory.
np.savetxt(f"{DATA_DIR}playlists_matrix.csv", playlists_matrix)
del playlists_matrix

get playlists


In [0]:
# One-hot matrix for tags: rows are artist ids, columns are encoded tags.
users_tags_df = pd.read_csv(DATA_DIR + "artists_tags.csv")
users_tags_df = users_tags_df.loc[users_tags_df["artists"] <= max_artist_idx]
# Keep one row per (artist, tag) pair. csr_matrix sums duplicate entries, so
# repeated pairs would otherwise produce values above 1 in the "one hot"
# matrix and inflate the tag popularity below.
users_tags_df = users_tags_df[["artists", "tags"]].drop_duplicates()

# Only the 5000 tags attached to the most train-subset artists are considered.
tags_popularity = users_tags_df.loc[users_tags_df["artists"].isin(train)]
tags_popularity = tags_popularity.groupby("tags")["artists"].count().reset_index()
artists_col_idxs = tags_popularity.sort_values("artists", ascending=False).iloc[:5000]
artists_col_idxs = artists_col_idxs["tags"].values

# Fix: the selection above was computed but never applied, so every tag ended
# up as a column. Restrict the data to the selected tags before encoding.
# `.copy()` makes the frame independent so the column assignment below does
# not write into a slice of the original frame.
users_tags_df = users_tags_df.loc[users_tags_df["tags"].isin(artists_col_idxs)].copy()

le = LabelEncoder()
users_tags_df["tags"] = le.fit_transform(users_tags_df["tags"])
user_tags_matrix = csr_matrix(
    ([1] * users_tags_df.shape[0], (users_tags_df["artists"], users_tags_df["tags"])),
    # One column per encoded tag; equals max code + 1 and is also valid when
    # no rows are left.
    shape=(max_artist_idx + 1, len(le.classes_)),
)
del users_tags_df

In [8]:
# Reduce the artist-by-tag matrix to 200 components per artist row.
svd = TruncatedSVD(random_state=42, n_components=200)
user_tags_matrix = svd.fit_transform(user_tags_matrix)

print("get tags")
# Persist the reduced matrix as text and release it from memory.
np.savetxt(f"{DATA_DIR}user_tags_matrix.csv", user_tags_matrix)
del user_tags_matrix

get tags


In [0]:
# Disabled final assembly step. It is wrapped in a bare string literal, so
# none of it is executed. The code inside would reload the three SVD outputs
# saved by the cells above, stack them horizontally, and append the dense
# playtime and user-count columns to form `full_matrix`.
"""
likes_matrix = np.loadtxt(DATA_DIR+"likes_matrix.csv")
playlists_matrix = np.loadtxt(DATA_DIR+"playlists_matrix.csv")
full_matrix = np.hstack((likes_matrix, playlists_matrix))
del likes_matrix, playlists_matrix
user_tags_matrix = np.loadtxt(DATA_DIR+"user_tags_matrix.csv")
full_matrix = np.hstack((full_matrix, user_tags_matrix))
del user_tags_matrix
playtime_col = playtime_col.toarray()
users_amount_col = users_amount_col.toarray()
full_matrix = np.hstack((full_matrix, playtime_col, users_amount_col))
"""

In [0]:
#np.savetxt(DATA_DIR+"svd_features_tags_etc.csv", full_matrix)