# 慕课网-个性化入门实战课

In [1]:
# Import packages
from __future__ import division
import os 
import math
import operator

## 公共信息抽取函数

In [2]:

def get_user_click(rating_file):
    """
    get user click list
    Args:
        rating_fiel: input file
    Return:
        dict, key: userid, value:[itemid1, itemid2]
    """
    if not os.path.exists(rating_file):
        return {},{}
    fp = open(rating_file)
    num = 0
    user_click = {}
    user_click_time = {}
    for line in fp:
        if num == 0:
            num += 1
            continue
        item = line.strip().split(',')
        if len(item) < 4:
            continue
        [userid, itemid, rating, timestamp] = item 
        if userid + '_' + itemid not in user_click_time:
            user_click_time[userid + '_' + itemid] = int(timestamp)
        if float(rating) < 3.0:
            continue
        if userid not in user_click:
            user_click[userid] = []
        user_click[userid].append(itemid)
    fp.close()
    return user_click, user_click_time

In [3]:
def get_item_info(item_file):
    """
    get item info[title, genres]
    Args:
        item_file:input iteminfo file
    return:
        a dict, key itemid, value: [title, genres]
    """
    if not os.path.exists(item_file):
        return {}
    num = 0
    fp = open(item_file)
    item_info ={}
    for line in fp:
        if num == 0:
            num += 1
            continue
        item = line.strip().split(',')
        if len(item) < 3:
            continue
        if len(item) == 3:
            [itemid, title, genres] = item
        elif len(item) > 3:
            itemid = item[0]
            genres = item[-1]
            title = ','.join(item[1:-1])
        if itemid not in item_info:
            item_info[itemid] = [title, genres]
    fp.close()
    return item_info

In [4]:
user_click,user_click_time = get_user_click("./tmp/dataset/ml-latest-small/ratings.csv")

In [5]:
print(len(user_click))
print(user_click['1'])

609
['1', '3', '6', '47', '50', '70', '101', '110', '151', '157', '163', '216', '223', '231', '235', '260', '296', '316', '333', '349', '356', '362', '367', '423', '441', '457', '480', '500', '527', '543', '552', '553', '590', '592', '593', '596', '608', '648', '661', '673', '733', '736', '780', '804', '919', '923', '940', '943', '954', '1009', '1023', '1024', '1025', '1029', '1030', '1031', '1032', '1042', '1049', '1060', '1073', '1080', '1089', '1090', '1092', '1097', '1127', '1136', '1196', '1197', '1198', '1206', '1208', '1210', '1213', '1214', '1220', '1222', '1224', '1226', '1240', '1256', '1258', '1265', '1270', '1275', '1278', '1282', '1291', '1298', '1348', '1377', '1396', '1408', '1445', '1473', '1500', '1517', '1552', '1573', '1580', '1587', '1617', '1620', '1625', '1644', '1676', '1732', '1777', '1793', '1804', '1805', '1920', '1927', '1954', '1967', '2000', '2005', '2012', '2018', '2028', '2033', '2046', '2048', '2054', '2058', '2078', '2090', '2093', '2094', '2096', '2099

In [6]:
item_info = get_item_info("./tmp/dataset/ml-latest-small/movies.csv")

In [7]:
print(len(item_info))
print(item_info['1'])

9742
['Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']


## 基于物品协同过滤实战

步骤：

1. 计算物品相似度矩阵
2. 根据物品相似度矩阵推荐相似物品

### 计算物品和用户相似度函数

In [8]:
def base_contribution_score():
    """
    item cf base sim contribution score by user，基础权重计算公式
    """
    return 1

In [9]:
def update_one_contribution_score(user_total_click_num):
    """
    升级版本权重计算公式1，惩罚点击数目过多的用户的权重
    """
    return 1/math.log(1+user_total_click_num)

In [10]:
alpha = 1.0 # scale rate
def update_two_contribution_score(click_time_one, click_time_two):
    """
    升级版本权重计算公式2，惩罚时间差越大的用户的权重
    """
    
    delta_time = abs(click_time_one - click_time_two)
    total_sec = 60*60*24
    delta_time = delta_time/total_sec
    return 1/(1+alpha * delta_time)

In [11]:
# choose the function to calculate the contribution score
contribution_score_type = 2 # 0: base,  1: update_one, 2: update_two

In [12]:
def cal_item_sim(user_click, user_click_time):
    """
    Args:
        user_click: key userid,  value: [itemid1, itemid2]
    Return:
        dict, key:itemid_i, value dict, value_key itemid_j, value_value simscore
    """
    # 计算两个物品之间的贡献矩阵
    co_appear = {}
    item_user_click_time = {}
    for user, itemlist in user_click.items():
        for index_i in range(0, len(itemlist)):
            itemid_i = itemlist[index_i]
            item_user_click_time.setdefault(itemid_i, 0)
            item_user_click_time[itemid_i] += 1
            for index_j in range(index_i + 1, len(itemlist)):
                itemid_j = itemlist[index_j]
                co_appear.setdefault(itemid_i, {})
                co_appear[itemid_i].setdefault(itemid_j, 0)
                co_appear.setdefault(itemid_j, {})
                co_appear[itemid_j].setdefault(itemid_i, 0)

                if contribution_score_type == 0:
                    # 基础贡献公式
                    co_appear[itemid_i][itemid_j] += base_contribution_score()
                    co_appear[itemid_j][itemid_i] += base_contribution_score()
                elif contribution_score_type == 1:
                    # 升级版本公式1
                    co_appear[itemid_i][
                        itemid_j] += update_one_contribution_score(len(itemlist))

                    co_appear[itemid_j][itemid_i] += update_one_contribution_score(len(itemlist))
                else:
                    # 升级版本公式2
                    if user + '_' + itemid_i not in user_click_time:
                        click_time_one = 0
                    else:
                        click_time_one = user_click_time[user + '_' + itemid_i]
                    if user + '_' + itemid_j not in user_click_time:
                        click_time_two = 0
                    else:
                        click_time_two = user_click_time[user + '_' + itemid_j]
                    co_appear[itemid_i][
                        itemid_j] += update_two_contribution_score(
                            click_time_one, click_time_two)
                    co_appear[itemid_j][itemid_i] += update_two_contribution_score(click_time_one, click_time_two)

    item_sim_score = {}
    item_sim_score_sorted = {}
    for itemid_i, relate_item in co_appear.items():
        for itemid_j, co_time in relate_item.items():
            sim_score = co_time / math.sqrt(item_user_click_time[itemid_i] *
                                            item_user_click_time[itemid_j])
            item_sim_score.setdefault(itemid_i, {})
            item_sim_score[itemid_i].setdefault(itemid_j, 0)
            item_sim_score[itemid_i][itemid_j] = sim_score

    for itemid in item_sim_score:
        item_sim_score_sorted[itemid] = sorted(item_sim_score[itemid].items(),
                                               key=operator.itemgetter(1),
                                               reverse=True)

    return item_sim_score_sorted

In [13]:
def cal_recom_result(sim_info, user_click):
    """
    recom by itemcf
    Args:
        sim_info: item sim dict
        user_click: user click dict
    Return:
        dict, key:userid value dict, value_key itemid, value_value recom_score
    """
    recent_click_num = 3
    topk = 5
    recom_info = {}
    for user in user_click:
        click_list = user_click[user]
        recom_info.setdefault(user, {})
        for itemid in click_list[:recent_click_num]:
            if itemid not in sim_info:
                continue
            for itemidsim_tuple in sim_info[itemid][:topk]:
                itemsimid = itemidsim_tuple[0]
                itemsimscore = itemidsim_tuple[1]
                recom_info[user][itemsimid] = itemsimscore
    return recom_info

In [14]:
sim_info = cal_item_sim(user_click, user_click_time)

In [15]:
recom_result = cal_recom_result(sim_info, user_click)

In [16]:
print(recom_result["1"])

{'780': 0.46475374613998177, '3114': 0.43256226390108665, '356': 0.42982096383633256, '588': 0.4272105446504961, '260': 0.4217381348100523, '5': 0.3706893668636461, '7': 0.36722778250755456, '736': 0.355248820396473, '788': 0.35453382503684483, '784': 0.33956136189636077, '733': 0.4166114199492754, '16': 0.39177042306799037, '32': 0.3834927290133812, '112': 0.3691490738192548, '628': 0.34343752127181054}


In [17]:
recom_result_sorted = sorted(recom_result["1"].items(), key=operator.itemgetter(1),reverse=True)

In [18]:
print(recom_result_sorted)

[('780', 0.46475374613998177), ('3114', 0.43256226390108665), ('356', 0.42982096383633256), ('588', 0.4272105446504961), ('260', 0.4217381348100523), ('733', 0.4166114199492754), ('16', 0.39177042306799037), ('32', 0.3834927290133812), ('5', 0.3706893668636461), ('112', 0.3691490738192548), ('7', 0.36722778250755456), ('736', 0.355248820396473), ('788', 0.35453382503684483), ('628', 0.34343752127181054), ('784', 0.33956136189636077)]


### 打印输出查看推荐的物品相似度情况

In [19]:
def debug_itemsim(item_info, sim_info):
    """
    Show itemsim info，强调的是输出计算出来的物品之间的相似度
    Args:
        item_info: dict, key itemid, value:[title, genres]
        sim_info: dict, key itemid, value dict, key [(itemid1, simscore), (itemid2, simscore)]
    """
    fixed_itemid = "1"
    if fixed_itemid not in item_info:
        print('Invalid itemid.')
        return
    [title_fix, genres_fix] = item_info[fixed_itemid]
    print(title_fix + '\t' + genres_fix)
    for sim_info_tuple in sim_info[fixed_itemid][:10]:
        itemid_sim = sim_info_tuple[0]
        sim_score = sim_info_tuple[1]
        if itemid_sim not in item_info:
            continue
        [title, genres] = item_info[itemid_sim]
        print('sim:' + title + '\t'+genres+'\t'+str(sim_score))

In [20]:
debug_itemsim(item_info, sim_info)

Toy Story (1995)	Adventure|Animation|Children|Comedy|Fantasy
sim:Independence Day (a.k.a. ID4) (1996)	Action|Adventure|Sci-Fi|Thriller	0.46475374613998177
sim:Toy Story 2 (1999)	Adventure|Animation|Children|Comedy|Fantasy	0.43256226390108665
sim:Forrest Gump (1994)	Comedy|Drama|Romance|War	0.42982096383633256
sim:Aladdin (1992)	Adventure|Animation|Children|Comedy|Musical	0.4272105446504961
sim:Star Wars: Episode IV - A New Hope (1977)	Action|Adventure|Sci-Fi	0.4217381348100523
sim:Mission: Impossible (1996)	Action|Adventure|Mystery|Thriller	0.42005115492687634
sim:Jurassic Park (1993)	Action|Adventure|Sci-Fi|Thriller	0.41903077289856033
sim:Star Wars: Episode VI - Return of the Jedi (1983)	Action|Adventure|Sci-Fi	0.41391971218503226
sim:"Lion King, The (1994)"	Adventure|Animation|Children|Drama|Musical|IMAX	0.40592414467745963
sim:Willy Wonka & the Chocolate Factory (1971)	Children|Comedy|Fantasy|Musical	0.39680151931956986


In [21]:
def debug_recommendation_result(recom_result, item_info):
    """
    Debug recommendation result，强调的是根据用户的偏好，输出来的推荐结果
    Args:
        recom_result: key userid value: dict, value_key:itemid, value_value:recom_score
        item_info: dict, key itemid value: [title, genre]
    """
    user_id = '1'
    if user_id not in recom_result:
        print("Invalid userid")
        return
    for recom_result_info in sorted(recom_result[user_id].items(), key=operator.itemgetter(1), reverse=True):
        itemid, score = recom_result_info
        if itemid not in item_info:
            continue
        print(",".join(item_info[itemid])+'\t' + str(score))

In [22]:
debug_recommendation_result(recom_result, item_info)

Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller	0.46475374613998177
Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy	0.43256226390108665
Forrest Gump (1994),Comedy|Drama|Romance|War	0.42982096383633256
Aladdin (1992),Adventure|Animation|Children|Comedy|Musical	0.4272105446504961
Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi	0.4217381348100523
"Rock, The (1996)",Action|Adventure|Thriller	0.4166114199492754
Casino (1995),Crime|Drama	0.39177042306799037
Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller	0.3834927290133812
Father of the Bride Part II (1995),Comedy	0.3706893668636461
Rumble in the Bronx (Hont faan kui) (1995),Action|Adventure|Comedy|Crime	0.3691490738192548
Sabrina (1995),Comedy|Romance	0.36722778250755456
Twister (1996),Action|Adventure|Romance|Thriller	0.355248820396473
"Nutty Professor, The (1996)",Comedy|Fantasy|Romance|Sci-Fi	0.35453382503684483
Primal Fear (1996),Crime|Drama|Mystery|Thriller	0.3434

## 基于用户的协同过滤实战

步骤：

1. 计算用户相似度矩阵
2. 根据相似度矩阵推荐物品

In [23]:
user_click,user_click_time = get_user_click("./tmp/dataset/ml-latest-small/ratings.csv")

In [24]:
def transfer_user_click(user):
    """
    将用户的点击转化成item被用户点击
    Args:
        user_click: key userid, value: [itemid1, itemid2]
    Return:
        dict, key itemid, value:[userid1,userid2]
    """
    item_click_by_user = {}
    for user in user_click:
        item_list = user_click[user]
        for itemid in item_list:
            item_click_by_user.setdefault(itemid, [])
            item_click_by_user[itemid].append(user)
    return item_click_by_user

In [25]:
def usercf_base_contribution_score():
    """
    基础的用户贡献权重
    """
    return 1

In [26]:
def usercf_update_contribution_score(item_user_click_count):
    """
    usercf contribution score update V1
    Args:
        item_user_click_count: how many user have clicked this item
    Return：
        contribution score
    """
    return 1/math.log10(1 + item_user_click_count)
        

In [28]:
def usercf_update_two_contribution_score(click_time_one,click_time_two):
    """
    usercf contribution score update V2
    Args:
        different user action time to the same item, click_time_one, click_time_two
    return:
        Contribution score
    """
    delta_time = abs(click_time_two - click_time_one)
    # 将时间转化为以天为单位
    norm_num = 60*60*24
    delta_time = delta_time/norm_num
    return 1/(1+delta_time)

In [29]:
usercf_contribution_type = 2

In [30]:
def cal_user_sim(item_click_by_user,user_click_time):
    """
    计算用户相似度
    Args:
        item_click_by_user: dict, key:itemid, value:[userid1, userid2]
    Return:
        dict, key itemid, value:dict, value_key:itemid_j, value_value:sim_score
    """
    co_appear = {}
    user_click_count = {}
    for itemid, user_list in item_click_by_user.items():
        for index_i in range(0, len(user_list)):
            user_i = user_list[index_i]
            user_click_count.setdefault(user_i, 0)
            user_click_count[user_i] += 1
            if user_i + "_" + itemid not in user_click_time:
                click_time_one = 0
            else:
                click_time_one = user_click_time[user_i + "_" + itemid]
            for index_j in range(index_i+1, len(user_list)):
                user_j = user_list[index_j]
                if user_j + "_" +itemid not in user_click_time:
                    click_time_two = 0
                else:
                    click_time_two = user_click_time[user_j+'_'+itemid]
                co_appear.setdefault(user_i, {})
                co_appear[user_i].setdefault(user_j, 0)
                co_appear.setdefault(user_j, {})
                co_appear[user_j].setdefault(user_i, 0)
                
                # 基础用户贡献权重
                if usercf_contribution_type == 0: 
                    co_appear[user_i][user_j] += usercf_base_contribution_score()
                    co_appear[user_j][user_i] += usercf_base_contribution_score()
                # 惩罚行为频繁用户的贡献权重
                elif usercf_contribution_type == 1:
                    co_appear[user_i][user_j] += usercf_update_contribution_score(len(user_list))
                    co_appear[user_j][user_i] += usercf_update_contribution_score(len(user_list))
                # 惩罚时间间隔太长的用户贡献权重
                else usercf_contribution_type == 2:
                    co_appear[user_i][user_j] += usercf_update_two_contribution_score(click_time_one,click_time_two)
                    co_appear[user_j][user_i] += usercf_update_two_contribution_score(click_time_one,click_time_two)
    user_sim_info = {}
    user_sim_info_sorted = {}
    for user_i, relate_user in co_appear.items():
        user_sim_info.setdefault(user_i, {})
        for user_j, cotime in relate_user.items():
            user_sim_info[user_i].setdefault(user_j, 0)
            user_sim_info[user_i][user_j] = cotime/math.sqrt(user_click_count[user_i]*user_click_count[user_j])
    # 将相似度用户排序
    for user in user_sim_info:
        user_sim_info_sorted[user] = sorted(user_sim_info[user].items(), key=operator.itemgetter(1), reverse=True)
    return user_sim_info_sorted

In [31]:
def cal_recom_result_using_usercf(user_click, user_sim):
    """
    利用usercf产生推荐结果
    Args:
        user_clik: dict, key userid, value: [itemid1, itemid2]
        user_sim: key: userid value: [(useridj, score1),(useridk, score2)]
    Return:
        dict, key userid, value: dict value_key:itemid, value_value:recom_score
    """
    
    recom_result = {}
    topk_user = 3
    item_num = 5
    for user, item_list in user_click.items():
        tmp_dict = {}
        for itemid in item_list:
            tmp_dict.setdefault(itemid, 1)
        recom_result.setdefault(user, {})
        for zuhe in user_sim[user][:topk_user]:
            userid_j, sim_score = zuhe
            if userid_j not in user_click:
                continue
            
            for itemid_j in user_click[userid_j][:item_num]:
                recom_result[user].setdefault(itemid_j, sim_score)
    return recom_result

In [32]:
item_click_by_user = transfer_user_click(user_click)

In [33]:
user_sim = cal_user_sim(item_click_by_user,user_click_time)

In [34]:
recom_result = cal_recom_result_using_usercf(user_click, user_sim)

In [35]:
print(recom_result['1'])

{'6': 0.18803307299446, '10': 0.18803307299446, '29': 0.18803307299446, '32': 0.18803307299446, '39': 0.18803307299446, '1': 0.1879004753996153, '11': 0.1879004753996153, '21': 0.1879004753996153, '3': 0.18756012667350208, '17': 0.18756012667350208}


In [36]:
def debug_user_sim(user_sim):
    """
    测试用户间相似度
    Args：
        user_sim: key: userid vale:[(userid1, score1), (userid2,score2)]
    """
    fixed_user = 120
    topk = 5
    if fixed_user not in user_sim:
        print("Invalid user")
        return    
    for zuhe in user_sim[fixed_user][:topk]:
        userid, score = zuhe
        print("fix_user" + "\t sim_user" + userid + "\t" + str(score))

In [37]:
debug_user_sim(user_sim)

Invalid user


In [38]:
item_info = get_item_info("./tmp/dataset/ml-latest-small/movies.csv")

In [39]:
def debug_recom_result_using_usercf(item_info, recom_result):
    """
    Args:
        item_info: key itemid value:[title, genres]
        recom_result: key userid, value dict, value key: itemid value_value: recom_score
    """
    fix_user = '1'
    if fix_user not in recom_result:
        print("invalid user for recoming result.")
        return
    for itemid in recom_result["1"]:
        if itemid not in item_info:
            continue
        recom_score = recom_result["1"][itemid]
        print("recom result: " + ",".join(item_info[itemid]) + "\t" + str(recom_score))

In [40]:
debug_recom_result_using_usercf(item_info, recom_result)

recom result: Heat (1995),Action|Crime|Thriller	0.18803307299446
recom result: GoldenEye (1995),Action|Adventure|Thriller	0.18803307299446
recom result: "City of Lost Children, The (Cité des enfants perdus, La) (1995)",Adventure|Drama|Fantasy|Mystery|Sci-Fi	0.18803307299446
recom result: Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller	0.18803307299446
recom result: Clueless (1995),Comedy|Romance	0.18803307299446
recom result: Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy	0.1879004753996153
recom result: "American President, The (1995)",Comedy|Drama|Romance	0.1879004753996153
recom result: Get Shorty (1995),Comedy|Crime|Thriller	0.1879004753996153
recom result: Grumpier Old Men (1995),Comedy|Romance	0.18756012667350208
recom result: Sense and Sensibility (1995),Drama|Romance	0.18756012667350208
