In [17]:
import numpy as np
import json
import random
import time
import pickle

In [18]:
def random_sample_playlist(in_file, out_file, max_sample=10, seed=None, sample_rate=0.8):
    """
    Randomly copy at most ``max_sample`` lines (one playlist per line) from
    ``in_file`` into ``out_file``.

    Every input line is kept with probability ``sample_rate``; reading stops as
    soon as ``max_sample`` lines have been written. ``out_file`` is always
    created/truncated, even when nothing is sampled.
    :param in_file: path of the UTF-8 text file to sample from
    :param out_file: path of the UTF-8 text file that receives the sampled lines
    :param max_sample: upper bound on the number of lines written; values <= 0 write nothing
    :param seed: value passed to ``random.seed``; ``None`` leaves the global generator untouched
    :param sample_rate: probability of keeping each line (0.8 by default)
    :return: None
    """
    # Compare against None so that seed=0 is honoured as a real seed.
    if seed is not None:
        random.seed(seed)

    # The output file is opened first, exactly like before, so it is truncated
    # even if the input file turns out to be unreadable.
    with open(out_file, 'w', encoding='utf-8') as writer, \
            open(in_file, 'r', encoding='utf-8') as reader:
        count = 0
        for line in reader:
            # Check the quota before drawing, so max_sample <= 0 never writes a line.
            if count >= max_sample:
                break
            if random.random() < sample_rate:
                writer.write(line)
                count += 1
                if count % 100 == 0:
                    print("已经抽取%d个歌单数据!!" % count)
        print("实际抽取的总歌单数据为:%d条" % count)

In [19]:
def parse_playlist_2_song(playlist_json):
    """
    Convert one playlist JSON document into a single-line feature record.

    The record has the form
    ``userId##playlistId##name##updateTime##subscribedCount##playCount``
    followed by one ``\\t<songId>::::<songName>::::<popularity>`` entry per track.
    :param playlist_json: JSON text with a top-level ``result`` object holding
        ``userId``, ``id``, ``name``, ``subscribedCount``, ``playCount``,
        ``updateTime`` and ``tracks`` (each track has ``id``, ``name``, ``popularity``)
    :return: the record string, or ``False`` when the text cannot be parsed or a
        required field is missing
    """

    def _clean(text):
        # Tabs separate songs and line breaks separate records in the output,
        # so none of them may survive inside a name.
        return text.replace('\t', '').replace('\r', '').replace('\n', '')

    try:
        result_data = json.loads(playlist_json)['result']

        # 1. Playlist-level fields (user id, playlist id, name, update time,
        #    subscription count, play count).
        playlist_fields = [
            str(result_data['userId']),
            str(result_data['id']),
            _clean(result_data['name']),
            str(result_data['updateTime']),
            str(result_data['subscribedCount']),
            str(result_data['playCount']),
        ]

        # 2. One "id::::name::::popularity" entry per track.
        song_fields = [
            '::::'.join([str(song['id']), _clean(song['name']), str(song['popularity'])])
            for song in result_data['tracks']
        ]

        return '\t'.join(['##'.join(playlist_fields)] + song_fields)
    except Exception:
        # Best effort: the caller reports the offending line and moves on.
        return False

In [20]:
def parse_playlist_file(in_file, out_file):
    """
    Turn a file of raw playlist JSON lines into a file of feature records.

    Each input line is handed to ``parse_playlist_2_song``; successful records
    are written one per line, failures are reported on stdout and skipped.
    :param in_file: path of the UTF-8 file with one raw playlist JSON per line
    :param out_file: path of the UTF-8 file that receives the feature records
    :return: None
    """
    with open(out_file, 'w', encoding='utf-8') as sink, \
            open(in_file, 'r', encoding='utf-8') as source:
        for raw_playlist in source:
            record = parse_playlist_2_song(raw_playlist)

            # A falsy record means parsing failed (or produced nothing): report and skip.
            if not record:
                print("提取歌单主要特征数据失败:{}".format(raw_playlist))
                continue

            sink.write(record)
            sink.write('\n')

In [21]:
def is_last_time(update_time):
    """
    Tell whether ``update_time`` is less than 365 days older than the current time.
    :param update_time: timestamp in the same unit as ``time.time()`` scaled by 1000
        (i.e. milliseconds since the epoch)
    :return: True when ``now - update_time`` is below 31536000000 ms
    """
    one_year_ms = 31536000000  # 365 days * 24 h * 3600 s * 1000 ms
    # The clock is truncated to whole seconds before being scaled to milliseconds.
    now_ms = int(time.time()) * 1000
    elapsed_ms = now_ms - update_time
    return elapsed_ms < one_year_ms

In [22]:
def parse_user_song_rating(user_id, update_time, subscribed_count, play_count, song_info):
    """
    Build one ``user_id,song_id,rating`` line from a playlist's statistics and one song entry.
    :param user_id: id of the playlist owner (converted with ``str``)
    :param update_time: playlist update timestamp, forwarded to ``is_last_time``
    :param subscribed_count: number of subscriptions of the playlist
    :param play_count: number of plays of the playlist
    :param song_info: ``<songId>::::<songName>::::<popularity>``; the name itself
        may contain ``::::``
    :return: the CSV line, or ``''`` when ``song_info`` is malformed
    """
    try:
        # The id is the first field and the popularity the last one; everything
        # in between is the name, which is allowed to contain the separator.
        song_id, remainder = song_info.split('::::', 1)
        _, song_popularity = remainder.rsplit('::::', 1)

        # The data holds no explicit user->song rating, so the song popularity is
        # used instead, weighted by the playlist's play count, subscription count
        # and update time. A song being in the playlist at all is treated as a
        # positive signal, hence on a [1, 10] scale the rating always ends up >= 5.
        w = 1.0
        if play_count > 10000 and subscribed_count > 1000 and is_last_time(update_time):
            w = 1.1
        elif play_count <= 10000 and subscribed_count <= 1000 and (not is_last_time(update_time)):
            w = 0.9
        # Weighted popularity; at most 110 assuming popularity lies in [0, 100] -- TODO confirm.
        rating = float(song_popularity) * w
        # Scale down by 11 and clamp into [1, 10].
        rating = np.clip(rating / 11, 1.0, 10.0)
        # Map [1, 10] linearly onto [5, 10].
        rating = ((rating - 1.0) / 9.0) * 5.0 + 5.0

        return ','.join([str(user_id), song_id, str(rating)])
    except Exception:
        # Best effort: a malformed entry yields an empty string that the caller
        # filters out. KeyboardInterrupt/SystemExit are no longer swallowed.
        return ''

In [23]:
def parse_user_song_rating_file(in_file, out_file):
    """
    Build the user-song rating file from the playlist feature file.

    Every input line is ``<playlist header>\\t<song>\\t<song>...`` where the header
    is ``userId##playlistId##name##updateTime##subscribedCount##playCount``.
    For each song one ``user_id,song_id,rating`` line is written. Lines whose
    header cannot be parsed are reported on stdout and skipped instead of
    aborting the whole conversion.
    :param in_file: path of the UTF-8 feature file
    :param out_file: path of the UTF-8 file that receives the rating lines
    :return: None
    """
    with open(out_file, 'w', encoding='utf-8') as writer, \
            open(in_file, 'r', encoding='utf-8') as reader:
        for line in reader:
            # First field is the playlist header, the rest are the song entries.
            playlist_content, *song_contents = line.rstrip('\n').split('\t')

            try:
                # Split from both ends so that a playlist name containing '##'
                # does not shift the numeric fields.
                user_id, _, remainder = playlist_content.split('##', 2)
                _, update_time, subscribed_count, play_count = remainder.rsplit('##', 3)
                update_time = float(update_time)
                subscribed_count = float(subscribed_count)
                play_count = float(play_count)
            except ValueError:
                print("Skip malformed playlist record:{}".format(playlist_content))
                continue

            # One rating line per song; malformed songs come back as '' and are dropped.
            ratings = [
                parse_user_song_rating(user_id, update_time, subscribed_count, play_count, song)
                for song in song_contents
            ]
            ratings = [rating for rating in ratings if len(rating.split(',')) == 3]

            if ratings:
                writer.write('\n'.join(ratings))
                writer.write('\n')

In [24]:
def parse_playlist_song_rating(playlist_id, song_info):
    """
    Build one ``playlist_id,song_id,rating`` line for a song contained in a playlist.

    The matrix built from these lines is meant for playlist-to-playlist
    similarity: two playlists are considered similar when they share songs
    (Jaccard-style), so only membership matters and the rating is always 1.0.
    :param playlist_id: id of the playlist (converted with ``str``)
    :param song_info: ``<songId>::::<songName>::::<popularity>``; the name itself
        may contain ``::::``
    :return: the CSV line, or ``''`` when ``song_info`` is malformed
    """
    try:
        # The id is the first field; the remainder must still hold a name and a
        # popularity, otherwise the entry is malformed (unpacking raises ValueError).
        song_id, remainder = song_info.split('::::', 1)
        _, _ = remainder.rsplit('::::', 1)

        # Membership only: every (playlist, song) pair gets the same score.
        rating = 1.0

        return ','.join([str(playlist_id), song_id, str(rating)])
    except Exception:
        # Best effort: a malformed entry yields an empty string that the caller
        # filters out. KeyboardInterrupt/SystemExit are no longer swallowed.
        return ''

In [25]:
def parse_playlist_song_rating_file(in_file, out_file):
    """
    Build the playlist-song rating file from the playlist feature file.

    Every input line is ``<playlist header>\\t<song>\\t<song>...`` where the header
    is ``userId##playlistId##name##updateTime##subscribedCount##playCount``.
    For each song one ``playlist_id,song_id,rating`` line is written. Lines whose
    header cannot be parsed are reported on stdout and skipped instead of
    aborting the whole conversion.
    :param in_file: path of the UTF-8 feature file
    :param out_file: path of the UTF-8 file that receives the rating lines
    :return: None
    """
    with open(out_file, 'w', encoding='utf-8') as writer, \
            open(in_file, 'r', encoding='utf-8') as reader:
        for line in reader:
            # First field is the playlist header, the rest are the song entries.
            playlist_content, *song_contents = line.rstrip('\n').split('\t')

            try:
                # Split from both ends so that a playlist name containing '##'
                # is tolerated; the header must still carry all six fields.
                _, playlist_id, remainder = playlist_content.split('##', 2)
                _, _, _, _ = remainder.rsplit('##', 3)
            except ValueError:
                print("Skip malformed playlist record:{}".format(playlist_content))
                continue

            # One rating line per song; malformed songs come back as '' and are dropped.
            ratings = [parse_playlist_song_rating(playlist_id, song) for song in song_contents]
            ratings = [rating for rating in ratings if len(rating.split(',')) == 3]

            if ratings:
                writer.write('\n'.join(ratings))
                writer.write('\n')

In [26]:
def parse_playlist_song_id_2_name(in_file, out_playlist, out_song):
    """
    Extract the id -> name mappings of playlists and songs and pickle them.

    Every input line is ``<playlist header>\\t<song>\\t<song>...`` where the header
    is ``userId##playlistId##name##updateTime##subscribedCount##playCount`` and
    each song is ``<songId>::::<songName>::::<popularity>``. Malformed headers
    (together with their songs) and malformed songs are reported on stdout and
    skipped.
    :param in_file: path of the UTF-8 feature file
    :param out_playlist: path of the pickle file receiving ``{playlist_id: playlist_name}``
    :param out_song: path of the pickle file receiving ``{song_id: song_name}``
    :return: None
    """
    # Playlist id -> playlist name.
    playlist_id_2_name = {}
    # Song id -> song name.
    song_id_2_name = {}

    with open(in_file, 'r', encoding='utf-8') as reader:
        for line in reader:
            contents = line.split("\t")

            try:
                # Split from both ends so a playlist name containing '##' stays intact.
                _, playlist_id, remainder = contents[0].split('##', 2)
                playlist_name, _, _, _ = remainder.rsplit('##', 3)
            except ValueError:
                print("Fetch the playlist id throw error:{}".format(contents[0]))
                continue
            playlist_id_2_name[playlist_id] = playlist_name

            for song in contents[1:]:
                try:
                    # Same idea for songs: id first, popularity last, name in between.
                    song_id, remainder = song.split("::::", 1)
                    song_name, _ = remainder.rsplit("::::", 1)
                except ValueError:
                    print("Fetch the song id throw error:{}".format(song))
                    continue
                song_id_2_name[song_id] = song_name

    # Persist both mappings.
    with open(out_playlist, 'wb') as playlist_writer:
        pickle.dump(playlist_id_2_name, playlist_writer)
    with open(out_song, 'wb') as song_writer:
        pickle.dump(song_id_2_name, song_writer)

In [27]:
# Every input and output file of the pipeline lives in this directory.
_music_dir = 'D:/music/'

# Raw playlist dump and the random sample drawn from it.
all_music_playlist_file_path = _music_dir + 'playlist_detail_music_all.json'
sample_playlist_file_pth = _music_dir + 'playlist_detail_music_500.json'
# Feature records extracted from the sample (one playlist per line).
music_playlist_song_file_path = _music_dir + '163_music_playlist_song.txt'
# Rating matrices derived from the feature records.
music_user_song_rating_file_path = _music_dir + '163_music_user_song_rating.txt'
music_playlist_song_rating_file_path = _music_dir + '163_music_playlist_song_rating.txt'
# Pickled id -> name lookup tables.
playlist_id_2_name_file_path = _music_dir + '163_playlist_id_2_name.pkl'
song_id_2_name_file_path = _music_dir + '163_song_id_2_name.pkl'

In [28]:
# 1. Draw the sample: copy up to 500 playlists from the full dump into the sample file.
#    A fixed seed keeps the sample identical across "Restart & Run All".
random_sample_playlist(all_music_playlist_file_path, sample_playlist_file_pth, 500, seed=42)

已经抽取100个歌单数据!!
已经抽取200个歌单数据!!
已经抽取300个歌单数据!!
已经抽取400个歌单数据!!
已经抽取500个歌单数据!!
实际抽取的总歌单数据为:500条


In [29]:
# 2. Extract the feature attributes: parse the sampled playlist JSON lines and
#    write the resulting records to the playlist-song file.
parse_playlist_file(sample_playlist_file_pth, music_playlist_song_file_path)

In [30]:
# 3. Build the user-song rating matrix from the playlist-song file.
parse_user_song_rating_file(music_playlist_song_file_path, music_user_song_rating_file_path)

In [31]:
# 4. Build the playlist-song rating matrix from the playlist-song file.
parse_playlist_song_rating_file(music_playlist_song_file_path, music_playlist_song_rating_file_path)

In [32]:
# 5. Extract the id -> name mappings for playlists and songs and pickle them.
parse_playlist_song_id_2_name(music_playlist_song_file_path, playlist_id_2_name_file_path, song_id_2_name_file_path)