In [1]:
from itertools import combinations
import pandas as pd

# Additive term in the denominator of every user-pair weight computed by swing_model.
alpha = 0.5
# Number of most-similar items to keep per item.
# NOTE(review): this value is reassigned to 10 in a later cell before save_item_sims runs.
top_k = 20

In [2]:
def load_data(train_path):
    """Read the user-item interaction records into a DataFrame.

    Args:
        train_path: Path of a header-less, tab-separated text file.

    Returns:
        pandas.DataFrame whose columns are named ``userid``, ``itemid`` and
        ``rate``, in that order.
    """
    # The file carries no header row, so the column names are supplied here.
    column_names = ["userid", "itemid", "rate"]
    return pd.read_csv(train_path, sep="\t", engine="python", names=column_names)

# Interaction log read by load_data (tab-separated userid / itemid / rate), relative to the working directory.
train_data_path = "./ratings_final.txt"
train_data = load_data(train_data_path)
# Bare last expression: shows the first rows as the cell output.
train_data.head()

Unnamed: 0,userid,itemid,rate
0,1,101,5
1,1,102,3
2,1,103,4
3,1,104,2
4,2,105,4


In [3]:
def get_uitems_iusers(train):
    """Build the user->items and item->users interaction lookups.

    Args:
        train: DataFrame with at least the columns ``userid`` and ``itemid``;
            one row per interaction.

    Returns:
        Tuple ``(u_items, i_users)`` where ``u_items[user]`` is the set of
        items that user interacted with and ``i_users[item]`` is the set of
        users that interacted with that item.

    Side effects:
        Prints the number of distinct users and items.
    """
    u_items = dict()
    i_users = dict()
    # Walk the two id columns directly instead of DataFrame.iterrows():
    # iterrows builds a Series per row (slow) and upcasts the whole row to a
    # single dtype, so a float ``rate`` column would turn the ids into floats.
    # tolist() yields native Python scalars.
    for user, item in zip(train["userid"].tolist(), train["itemid"].tolist()):
        u_items.setdefault(user, set()).add(item)  # every item this user touched
        i_users.setdefault(item, set()).add(user)  # every user that touched this item
    print("使用的用户个数为：{}".format(len(u_items)))
    print("使用的item个数为：{}".format(len(i_users)))
    return u_items, i_users

# Build both interaction lookups from the loaded records.
u_items, i_users = get_uitems_iusers(train_data)
# Bare last expression: shows both dictionaries as the cell output.
u_items, i_users

使用的用户个数为：20
使用的item个数为：20


({1: {101, 102, 103, 104},
  2: {105, 106, 107, 108},
  3: {109, 110, 111, 112},
  4: {113, 114, 115, 116},
  5: {117, 118, 119, 120},
  6: {101, 102, 103, 104},
  7: {105, 106, 107, 108},
  8: {109, 110, 111, 112},
  9: {113, 114, 115, 116},
  10: {117, 118, 119, 120},
  11: {101, 102, 103, 104},
  12: {105, 106, 107, 108},
  13: {109, 110, 111, 112},
  14: {113, 114, 115, 116},
  15: {117, 118, 119, 120},
  16: {101, 102, 103, 104},
  17: {105, 106, 107, 108},
  18: {109, 110, 111, 112},
  19: {113, 114, 115, 116},
  20: {117, 118, 119, 120}},
 {101: {1, 6, 11, 16},
  102: {1, 6, 11, 16},
  103: {1, 6, 11, 16},
  104: {1, 6, 11, 16},
  105: {2, 7, 12, 17},
  106: {2, 7, 12, 17},
  107: {2, 7, 12, 17},
  108: {2, 7, 12, 17},
  109: {3, 8, 13, 18},
  110: {3, 8, 13, 18},
  111: {3, 8, 13, 18},
  112: {3, 8, 13, 18},
  113: {4, 9, 14, 19},
  114: {4, 9, 14, 19},
  115: {4, 9, 14, 19},
  116: {4, 9, 14, 19},
  117: {5, 10, 15, 20},
  118: {5, 10, 15, 20},
  119: {5, 10, 15, 20},
  120: {

In [4]:
def swing_model(u_items, i_users, smoothing=None):
    """Compute Swing-style item-item similarity scores.

    For every unordered item pair (i, j), each unordered pair of users (u, v)
    that both interacted with i and j contributes
    ``1 / (smoothing + |items(u) & items(v)|)`` to the pair's score.

    Args:
        u_items: Mapping user -> set of items the user interacted with.
        i_users: Mapping item -> set of users that interacted with the item.
        smoothing: Additive term in the denominator. When ``None`` (default)
            the module-level ``alpha`` is used, which is the value the
            function always used before this parameter existed.

    Returns:
        Nested dict ``{i: {j: score}}`` holding only pairs with a non-zero
        score. Each pair is stored once, under whichever item comes first in
        ``i_users`` iteration order (the reverse entry ``[j][i]`` is not
        added). Scores are strings formatted with six decimals; convert with
        ``float()`` before comparing them numerically.

    Side effects:
        Prints the number of candidate item pairs.
    """
    if smoothing is None:
        smoothing = alpha  # fall back to the notebook-level configuration

    items = list(i_users.keys())
    n_items = len(items)
    # n choose 2, computed arithmetically so the pair list is never materialized.
    print("item pairs length：{}".format(n_items * (n_items - 1) // 2))

    item_sim_dict = dict()
    for i, j in combinations(items, 2):
        # Users that interacted with both items; only pairs of them contribute.
        shared_users = i_users[i] & i_users[j]
        result = 0
        for u, v in combinations(shared_users, 2):
            # The more items two users share overall, the less their
            # co-occurrence on (i, j) counts.
            result += 1 / (smoothing + len(u_items[u] & u_items[v]))
        if result != 0:
            item_sim_dict.setdefault(i, dict())[j] = format(result, '.6f')
    return item_sim_dict

# Score every item pair from the interaction lookups built above.
item_sim_dict = swing_model(u_items, i_users)
# Bare last expression: shows the similarity table as the cell output.
item_sim_dict

item pairs length：190


{101: {102: '1.333333', 103: '1.333333', 104: '1.333333'},
 102: {103: '1.333333', 104: '1.333333'},
 103: {104: '1.333333'},
 105: {106: '1.333333', 107: '1.333333', 108: '1.333333'},
 106: {107: '1.333333', 108: '1.333333'},
 107: {108: '1.333333'},
 109: {110: '1.333333', 111: '1.333333', 112: '1.333333'},
 110: {111: '1.333333', 112: '1.333333'},
 111: {112: '1.333333'},
 113: {114: '1.333333', 115: '1.333333', 116: '1.333333'},
 114: {115: '1.333333', 116: '1.333333'},
 115: {116: '1.333333'},
 117: {118: '1.333333', 119: '1.333333', 120: '1.333333'},
 118: {119: '1.333333', 120: '1.333333'},
 119: {120: '1.333333'}}

In [5]:
def save_item_sims(item_sim_dict, top_k, path):
    """Write the ``top_k`` most similar items of every item to a text file.

    One line is written per item, in the form
    ``item_id:<item>\\t<dict of neighbour -> score>``, with neighbours ordered
    from highest to lowest score.

    Args:
        item_sim_dict: Mapping item -> {neighbour item: score}. Scores may be
            numbers or numeric strings (``swing_model`` returns strings).
        top_k: Maximum number of neighbours kept per item.
        path: Destination file path; overwritten if it exists.

    Returns:
        None. Any exception is caught and its ``args`` are printed instead of
        being raised; the success message is printed only when every line was
        written.
    """
    try:
        # The context manager guarantees the handle is flushed and closed,
        # including when a write fails part-way through.
        with open(path, 'w', encoding='utf-8') as writer:
            for item, sim_items in item_sim_dict.items():
                # Rank numerically: comparing the formatted strings would put
                # '9.000000' ahead of '10.000000'.
                ranked = sorted(sim_items.items(), key=lambda kv: float(kv[1]), reverse=True)
                top_sims = dict(ranked[:top_k])
                writer.write('item_id:%d\t%s\n' % (item, top_sims))
        print("SUCCESS: top_{} item saved".format(top_k))
    except Exception as e:
        print(e.args)

# Output file for the per-item neighbour lists, relative to the working directory.
item_sim_save_path = "./item_sim_dict.txt"
top_k = 10 # keep the k most similar items for each item
save_item_sims(item_sim_dict, top_k, item_sim_save_path)

SUCCESS: top_10 item saved
