In [1]:
# 代码说明：
# 基于内容的推荐算法的具体实现

import math
import numpy as np
import pandas as pd

In [56]:
# 创建用户画像
# 参数说明：
# data_array: 所有用户对于其所看过的节目的评分矩阵 data_array = [[2, 0, 0, 1.1, ...], [0, 0, 1.1, ...], ...]
# users_profiles = {user1:{'label1':1.1, 'label2': 0.5, 'label3': 0.0, ...}, user2:{...}...}
def createUsersProfiles(data_array, users_names, items_names, labels_names, items_profiles):

    users_profiles = {}

    # 计算每个用户对所看过的所有节目的平均隐性评分
    # users_average_scores_list = [1.2, 2.2, 4.3,...]
    users_average_scores_list = []

    # 统计每个用户所看过的节目（不加入隐性评分信息）
    # items_users_saw = {user1:[item1, item3, item5], user2:[...],...}
    items_users_saw = {}

    # 统计每个用户所看过的节目及评分
    # items_users_saw_scores = {user1:[[item1, 1.1], [item2, 4.1]], user2:...}
    items_users_saw_scores = {}
    for i in range(len(users_names)):

        items_users_saw_scores[users_names[i]] = []
        items_users_saw[users_names[i]] = []
        count = 0
        sum1 = 0.0
        for j in range(len(items_names)):

            # 用户对该节目隐性评分为正，表示真正看过该节目
            if data_array[i][j] > 0:
                items_users_saw[users_names[i]].append(items_names[j])
                items_users_saw_scores[users_names[i]].append([items_names[j], data_array[i][j]])
                count += 1
                sum1 += data_array[i][j]
        if count == 0:
            users_average_scores_list.append(0)
        else:
            users_average_scores_list.append(sum1 / count)

    for i in range(len(users_names)):

        users_profiles[users_names[i]] = {}

        for j in range(len(labels_names)):
            count = 0
            score = 0.0
            
            print(items_users_saw_scores)
            for item in items_users_saw_scores[users_names[i]]:

                # 参数：
                # 用户user1对于类型label1的隐性评分: user1_score_to_label1
                # 用户user1对于其看过的含有类型label1的节目item i 的评分: score_to_item i
                # 用户user1对其所看过的所有节目的平均评分: user1_average_score
                # 用户user1看过的节目总数: items_count

                # 公式： user1_score_to_label1 = Sigma(score_to_item i - user1_average_score)/items_count

                # 该节目含有特定标签labels_names[j]
                if items_profiles[item[0]][labels_names[j]] > 0:
                    print(score,'123',item[1])
                    score += (item[1] - users_average_scores_list[i])
                    count += 1
            # 如果求出的值太小，直接置0
            if abs(score) < 1e-6:
                score = 0.0
            if count == 0:
                result = 0.0
            else:
                result = score / count

            users_profiles[users_names[i]][labels_names[j]] = result

    return (users_profiles, items_users_saw)

In [3]:
# 创建节目画像
# 参数说明：
# items_profiles = {item1:{'label1':1, 'label2': 0, 'label3': 0, ...}, item2:{...}...}
def createItemsProfiles(data_array, labels_names, items_names):

    items_profiles = {}

    for i in range(len(items_names)):

        items_profiles[items_names[i]] = {}

        for j in range(len(labels_names)):
            items_profiles[items_names[i]][labels_names[j]] = data_array[i][j]

    return items_profiles

In [4]:
# 计算用户画像向量与节目画像向量的距离（相似度）
# 向量相似度计算公式：
# cos(user, item) = sigma_ui/sqrt(sigma_u * sigma_i)

# 参数说明：
# user_profile: 某一用户user的画像 user = {'label1':1.1, 'label2': 0.5, 'label3': 0.0, ...}
# item: 某一节目item的画像 item = {'label1':1, 'label2': 0, 'label3': 0, ...}
# labels_names: 所有类型名
def calCosDistance(user, item, labels_names):

    sigma_ui = 0.0
    sigma_u = 0.0
    sigma_i = 0.0

    for label in labels_names:
        sigma_ui += user[label] * item[label]
        sigma_u += (user[label] * user[label])
        sigma_i += (item[label] * item[label])

    if sigma_u == 0.0 or sigma_i == 0.0:  # 若分母为0，相似度为0
        return 0

    return sigma_ui/math.sqrt(sigma_u * sigma_i)

In [5]:
# 基于内容的推荐算法：
# 借助特定某个用户user的画像user_profile和备选推荐节目集的画像items_profiles，通过计算向量之间的相似度得出推荐节目集

# 参数说明：
# user_profile: 某一用户user的画像 user_profile = {'label1':1.1, 'label2': 0.5, 'label3': 0.0, ...}
# items_profiles: 备选推荐节目集的节目画像: items_profiles = {item1:{'label1':1, 'label2': 0, 'label3': 0}, item2:{...}...}
# items_names: 备选推荐节目集中的所有节目名
# labels_names: 所有类型名
# items_user_saw: 用户user看过的节目

def contentBased(user_profile, items_profiles, items_names, labels_names, items_user_saw):

    # 对于用户user的推荐节目集为 recommend_items = [[节目名, 该节目画像与该用户画像的相似度], ...]
    recommend_items = []

    for i in range(len(items_names)):
        # 从备选推荐节目集中的选择用户user没有看过的节目
        if items_names[i] not in items_user_saw:
            recommend_items.append([items_names[i], calCosDistance(user_profile, items_profiles[items_names[i]], labels_names)])

    # 将推荐节目集按相似度降序排列
    recommend_items.sort(key=lambda item: item[1], reverse=True)

    return recommend_items

In [6]:
# 输出推荐给该用户的节目列表
# max_num:最多输出的推荐节目数
def printRecommendedItems(recommend_items_sorted, max_num):
    count = 0
    for item, degree in recommend_items_sorted:
        print("节目名：%s， 推荐指数：%f" % (item, degree))
        count += 1
        if count == max_num:
            break

In [10]:
#一些超参数
all_labels = ['sports','video','kids','middleeast','travel','finance','games','news', 'health', 'weather', 'foodanddrink', 'movies', 'autos', 'lifestyle', 'music', 'tv', 'entertainment']

In [15]:
df2 = pd.read_excel("news01.xlsx")
(m2, n2) = df2.shape
data_array2 = np.array(df2.iloc[:m2 + 1, 1:])
# 按照"所有用户看过的节目及所属类型的01矩阵"的列序排列的所有用户观看过的节目名称
items_users_saw_names2 = np.array(df2.iloc[:m2 + 1, 0]).tolist()

# 为用户看过的节目建立节目画像
items_users_saw_profiles = createItemsProfiles(data_array2, all_labels, items_users_saw_names2)

['N18955', 'N61837', 'N53526', 'N38324', 'N2073', 'N11429', 'N49186', 'N2131', 'N59295', 'N24510', 'N59883', 'N9721', 'N60905', 'N16587', 'N28361', 'N18680', 'N55610', 'N35621', 'N22850', 'N58173', 'N29120', 'N9786', 'N46481', 'N47705', 'N1834', 'N3574', 'N42474', 'N64498', 'N59538', 'N7517', 'N63665', 'N26094', 'N59469', 'N11649', 'N15926', 'N43620', 'N60603', 'N40078', 'N61409', 'N22486', 'N46013', 'N50643', 'N60723', 'N63175', 'N282', 'N33434', 'N8795', 'N43647', 'N40690', 'N41835', 'N8071', 'N49265', 'N5124', 'N42777', 'N41387', 'N41106', 'N37243', 'N20139', 'N9680', 'N35518', 'N60434', 'N48239', 'N49389', 'N40045', 'N43946', 'N14555', 'N10836', 'N24798', 'N64668', 'N27435', 'N30756', 'N3395', 'N21802', 'N53133', 'N30702', 'N14538', 'N37188', 'N30953', 'N10886', 'N52386', 'N30389', 'N25540', 'N18066', 'N51343', 'N6982', 'N60774', 'N37129', 'N64723', 'N8105', 'N40494', 'N15839', 'N19163', 'N60852', 'N61864', 'N36064', 'N35788', 'N22993', 'N61167', 'N27190', 'N9035', 'N19651', 'N3028

In [12]:
items_users_saw_profiles

{'N18955': {'sports': 0,
  'video': 0,
  'kids': 0,
  'middleeast': 0,
  'travel': 0,
  'finance': 0,
  'games': 0,
  'news': 0,
  'health': 1,
  'weather': 0,
  'foodanddrink': 0,
  'movies': 0,
  'autos': 0,
  'lifestyle': 0,
  'music': 0,
  'tv': 0,
  'entertainment': 0},
 'N61837': {'sports': 0,
  'video': 0,
  'kids': 0,
  'middleeast': 0,
  'travel': 0,
  'finance': 0,
  'games': 0,
  'news': 1,
  'health': 0,
  'weather': 0,
  'foodanddrink': 0,
  'movies': 0,
  'autos': 0,
  'lifestyle': 0,
  'music': 0,
  'tv': 0,
  'entertainment': 0},
 'N53526': {'sports': 0,
  'video': 0,
  'kids': 0,
  'middleeast': 0,
  'travel': 0,
  'finance': 0,
  'games': 0,
  'news': 0,
  'health': 1,
  'weather': 0,
  'foodanddrink': 0,
  'movies': 0,
  'autos': 0,
  'lifestyle': 0,
  'music': 0,
  'tv': 0,
  'entertainment': 0},
 'N38324': {'sports': 0,
  'video': 0,
  'kids': 0,
  'middleeast': 0,
  'travel': 0,
  'finance': 0,
  'games': 0,
  'news': 0,
  'health': 1,
  'weather': 0,
  'foodanddr

In [13]:
df = pd.read_csv(r'test\test.tsv',sep = '\t')
(m, n) = df.shape
df

Unnamed: 0,Uid,Date,History,Impression
0,U21693,11/15/2019 6:52:40 AM,N770 N50047 N4020 N306 N64467 N4607 N41449,N11930 N7419 N60747 N41946 N7342 N23513 N31958...
1,U25377,11/15/2019 5:44:39 AM,N24302 N6385 N49997 N33371 N47121 N11917 N5209...,N34633 N60658 N41946 N23767 N5051 N16854 N5732...
2,U30939,11/15/2019 6:54:55 AM,N55743 N47175 N5445 N18285 N54454 N54960 N138 ...,N42844 N20187 N35216 N6638 N13408 N20036 N3678...
3,U26167,11/15/2019 8:58:13 PM,N55189 N23653 N43142 N42620,N56080 N54562 N52492 N12409 N14802
4,U77000,11/15/2019 8:16:06 AM,N23249 N18845 N32742 N48755 N52551 N47508 N122...,N36779 N5051 N36786 N65145 N58264 N46976 N3195...
...,...,...,...,...
1995,U27889,11/15/2019 5:58:17 PM,N38939 N55189 N848 N64554 N26288 N34419 N38390...,N53572 N62949 N31910 N56080 N46749 N11930 N512...
1996,U58461,11/15/2019 9:12:18 AM,N53883 N8143,N20036 N36779
1997,U83646,11/15/2019 5:35:07 AM,N55189 N45729 N43369 N24127 N39041 N61471 N566...,N20036 N23513 N32536 N36779
1998,U23840,11/15/2019 6:32:45 AM,N995 N61319 N8845 N32203 N18030 N18094 N22420 ...,N20036 N35216 N31958 N6638 N36779 N32536 N5940...


In [14]:
#函数说明：输入data_array，行(m),获取该行的新闻标题列表，p = 3的时候获取history，p = 4的时候获取impression
def title(data_array,m,p):
    s = str(data_array[m][p])
    s1 = s.split(' ')
    return s1

In [19]:
# 按顺序提取所有用户名
all_user_names = np.array(df.iloc[:m+1, 0])[1:]

In [18]:
df2 = pd.read_csv(r'test\test_news.tsv',sep = '\t')
(m1,n1) = df2.shape
# 按顺序提取所有新闻代号
all_items_names = np.array(df2.iloc[:m1+1, 0])[1:]
items_num = len(all_items_names)
all_items_names1 = all_items_names.tolist()

In [20]:
data_array = np.array(df.iloc[:m+1,:])
data_array

array([['U21693', '11/15/2019 6:52:40 AM',
        'N770 N50047 N4020 N306 N64467 N4607 N41449',
        'N11930 N7419 N60747 N41946 N7342 N23513 N31958 N62318 N5940 N36940 N36779 N43646 N51793 N6916 N53242 N30290 N23767 N42844 N42233 N35216 N48487 N13408 N10423 N46976 N42767 N13865 N46162 N32536 N20187 N36786 N27738 N27057 N24109 N6638 N40656 N20036 N13556 N42670'],
       ['U25377', '11/15/2019 5:44:39 AM',
        'N24302 N6385 N49997 N33371 N47121 N11917 N52096 N10646 N60595 N10059 N54932 N37669 N35637 N28467 N4830 N11701 N11701 N4830 N35637 N59231 N17176 N14340 N54099 N21260 N53520 N13669 N2939 N54959 N58715 N29249 N58091 N21851 N19638 N48492 N13605 N47525 N55310 N61681 N39235 N59496 N63239 N49159 N61681 N64049 N44598 N61018 N4060 N63633 N38865 N36691 N7158 N43132 N27424 N40704 N43132 N26045 N8031 N28144 N49728 N43903 N61681 N23157 N3057 N17165 N23157 N28550 N61388 N27830 N49325 N59359 N40692 N33073 N32836 N59691 N16480 N26758 N5344 N43380 N30681 N38961 N5345 N6778 N48356 N38256 N

In [None]:
for i in range(len(all_user_names)):

    # 每个新闻的01向量
    vector = [0] * items_num
    #news_names_list = str(data_array[i][0]).split(" ")
    news_names_list = title(data_array,i,3)
    for j in range(len(news_names_list)):
        if (news_names_list[j] in all_items_names1):
            location = all_items_names1.index(news_names_list[j])
            vector[location] += 1

    data_to_be_written.append(vector)

data_to_be_written = np.transpose(data_to_be_written)
# 将01矩阵写入“news01.xlsx”
df = pd.DataFrame(data_to_be_written, index=all_items_names, columns=all_users_names)
df.to_excel("news_user_saw.xlsx")


In [23]:
'''
vector = [0] * items_num
news_names_list = title(data_array,0,3)
for j in range(len(news_names_list)):
        if (news_names_list[j] in all_items_names1):
            location = all_items_names1.index(news_names_list[j])
            vector[location] += 1
data_array1 = np.array([vector])
'''

In [41]:
items_users_saw_names1 = title(data_array,0,3)
items_saw_number = len(items_users_saw_names1)
items_users_saw_names1

['N11930',
 'N7419',
 'N60747',
 'N41946',
 'N7342',
 'N23513',
 'N31958',
 'N62318',
 'N5940',
 'N36940',
 'N36779',
 'N43646',
 'N51793',
 'N6916',
 'N53242',
 'N30290',
 'N23767',
 'N42844',
 'N42233',
 'N35216',
 'N48487',
 'N13408',
 'N10423',
 'N46976',
 'N42767',
 'N13865',
 'N46162',
 'N32536',
 'N20187',
 'N36786',
 'N27738',
 'N27057',
 'N24109',
 'N6638',
 'N40656',
 'N20036',
 'N13556',
 'N42670']

In [42]:
vector = [1] * items_saw_number
data_array1 = np.array([vector])

In [57]:
# 建立用户画像users_profiles和用户看过的节目集items_users_saw
(users_profiles, items_users_saw) = createUsersProfiles(data_array1, ['U21693'], items_users_saw_names1, all_labels, items_users_saw_profiles)

{'U21693': [['N11930', 1], ['N7419', 1], ['N60747', 1], ['N41946', 1], ['N7342', 1], ['N23513', 1], ['N31958', 1], ['N62318', 1], ['N5940', 1], ['N36940', 1], ['N36779', 1], ['N43646', 1], ['N51793', 1], ['N6916', 1], ['N53242', 1], ['N30290', 1], ['N23767', 1], ['N42844', 1], ['N42233', 1], ['N35216', 1], ['N48487', 1], ['N13408', 1], ['N10423', 1], ['N46976', 1], ['N42767', 1], ['N13865', 1], ['N46162', 1], ['N32536', 1], ['N20187', 1], ['N36786', 1], ['N27738', 1], ['N27057', 1], ['N24109', 1], ['N6638', 1], ['N40656', 1], ['N20036', 1], ['N13556', 1], ['N42670', 1]]}
0.0 123 1
0.0 123 1
0.0 123 1
0.0 123 1
0.0 123 1
0.0 123 1
{'U21693': [['N11930', 1], ['N7419', 1], ['N60747', 1], ['N41946', 1], ['N7342', 1], ['N23513', 1], ['N31958', 1], ['N62318', 1], ['N5940', 1], ['N36940', 1], ['N36779', 1], ['N43646', 1], ['N51793', 1], ['N6916', 1], ['N53242', 1], ['N30290', 1], ['N23767', 1], ['N42844', 1], ['N42233', 1], ['N35216', 1], ['N48487', 1], ['N13408', 1], ['N10423', 1], ['N46976'

In [48]:
users_profiles

{'U21693': {'sports': 0.0,
  'video': 0.0,
  'kids': 0.0,
  'middleeast': 0.0,
  'travel': 0.0,
  'finance': 0.0,
  'games': 0.0,
  'news': 0.0,
  'health': 0.0,
  'weather': 0.0,
  'foodanddrink': 0.0,
  'movies': 0.0,
  'autos': 0.0,
  'lifestyle': 0.0,
  'music': 0.0,
  'tv': 0.0,
  'entertainment': 0.0}}

In [45]:
items_users_saw

{'U21693': ['N11930',
  'N7419',
  'N60747',
  'N41946',
  'N7342',
  'N23513',
  'N31958',
  'N62318',
  'N5940',
  'N36940',
  'N36779',
  'N43646',
  'N51793',
  'N6916',
  'N53242',
  'N30290',
  'N23767',
  'N42844',
  'N42233',
  'N35216',
  'N48487',
  'N13408',
  'N10423',
  'N46976',
  'N42767',
  'N13865',
  'N46162',
  'N32536',
  'N20187',
  'N36786',
  'N27738',
  'N27057',
  'N24109',
  'N6638',
  'N40656',
  'N20036',
  'N13556',
  'N42670']}