In [1]:
import pandas as pd
import os
import gc
import math
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from itertools import combinations

In [2]:
def _read_tsv(path, **read_kwargs):
    """Load one tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep='\t', **read_kwargs)


def _read_run(path):
    """Load a header-less run file and name its two columns."""
    run = _read_tsv(path, header=None)
    run.columns = ['userId', 'itemIds']
    return run


# ---- target market t1 ----
train_t1 = _read_tsv('DATA/t1/train.tsv')
train_5core_t1 = _read_tsv('DATA/t1/train_5core.tsv')
valid_qrel_t1 = _read_tsv('DATA/t1/valid_qrel.tsv')   # validation positives
valid_run_t1 = _read_run('DATA/t1/valid_run.tsv')     # validation candidates to rank
test_run_t1 = _read_run('DATA/t1/test_run.tsv')       # test candidates to rank

# ---- target market t2 ----
train_t2 = _read_tsv('DATA/t2/train.tsv')
train_5core_t2 = _read_tsv('DATA/t2/train_5core.tsv')
valid_qrel_t2 = _read_tsv('DATA/t2/valid_qrel.tsv')   # validation positives
valid_run_t2 = _read_run('DATA/t2/valid_run.tsv')     # validation candidates to rank
test_run_t2 = _read_run('DATA/t2/test_run.tsv')       # test candidates to rank

In [3]:

def item_cf(df, user_col, item_col):
    """Build an item-to-item co-occurrence similarity table.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction rows; must contain ``user_col`` and ``item_col``.
    user_col : str
        Name of the column holding the user id (e.g. ``'userId'``).
    item_col : str
        Name of the column holding the item id (e.g. ``'itemId'``).

    Returns
    -------
    (dict, dict)
        ``sim_item_corr`` maps ``item -> {other_item: similarity}`` and
        ``user_item_dict`` maps ``user -> list of items`` in row order.

    Every pair of distinct items in one user's list adds
    ``1 / log(1 + len(list))`` to the pair's raw weight. The raw weight is then
    divided by ``sqrt(cnt_i * cnt_j)``, where ``cnt`` is how many times an item
    appears across all user lists.
    """
    user_item_dict = df.groupby(user_col)[item_col].agg(list).to_dict()

    co_weight = {}                 # item -> {other_item: accumulated raw weight}
    item_cnt = defaultdict(int)    # item -> number of occurrences over all lists
    for _user, items in tqdm(user_item_dict.items()):
        # The contribution depends only on the list length, so compute it once
        # per user instead of once per item pair.
        weight = 1 / math.log(1 + len(items))
        for item in items:
            item_cnt[item] += 1
            neighbours = co_weight.setdefault(item, {})
            for relate_item in items:
                if item == relate_item:    # an item is not similar to itself
                    continue
                neighbours[relate_item] = neighbours.get(relate_item, 0) + weight

    # Build the normalised table as a new structure. A shallow dict.copy()
    # would share the inner dicts and overwrite the raw weights in place.
    sim_item_corr = {}
    for i, related_items in tqdm(co_weight.items()):
        sim_item_corr[i] = {
            j: cij / math.sqrt(item_cnt[i] * item_cnt[j])
            for j, cij in related_items.items()
        }

    return sim_item_corr, user_item_dict


def recommend(sim_item_corr, user_item_dict, user_id):
    """Rank unseen items for one user by summed item-item similarity.

    Parameters
    ----------
    sim_item_corr : dict
        ``item -> {other_item: similarity}``.
    user_item_dict : dict
        ``user -> list of interacted items``.
    user_id : hashable
        User to recommend for.

    Returns
    -------
    list of (item, score)
        Items the user has not interacted with, sorted by descending score.
        Empty when the user is unknown or none of their items has neighbours.
    """
    # Unknown users simply have no history; no exception handling is needed.
    interacted_items = user_item_dict.get(user_id, [])
    seen = set(interacted_items)    # O(1) membership test for the inner loop

    rank = {}
    for i in interacted_items:
        neighbours = sim_item_corr.get(i)
        if not neighbours:          # item has no entry in the similarity table
            continue
        # Visiting neighbours by descending similarity fixes the insertion
        # order of `rank`, which decides ties in the stable sort below.
        for j, wij in sorted(neighbours.items(), key=lambda d: d[1], reverse=True):
            if j not in seen:       # only items the user has not interacted with
                rank[j] = rank.get(j, 0) + wij

    return sorted(rank.items(), key=lambda d: d[1], reverse=True)
 

def match_func(items1, items2, top_k=100):
    """Restrict a ranked list to the allowed candidates and pad it.

    Parameters
    ----------
    items1 : iterable
        Ranked predictions, best first.
    items2 : sequence
        Candidate items from the run file. Items must be hashable.
    top_k : int, default 100
        Maximum length of the returned list.

    Returns
    -------
    list
        Items of ``items1`` that also occur in ``items2``, in ``items1`` order.
        If that is shorter than ``top_k``, the remaining items of ``items2``
        are appended in their original order without repeats. The result is
        cut to ``top_k`` items.
    """
    allowed = set(items2)
    res = [it for it in items1 if it in allowed]

    if len(res) < top_k:
        # Pad with candidates that are not placed yet, keeping run-file order.
        placed = set(res)
        for it in items2:
            if it not in placed:
                placed.add(it)
                res.append(it)

    return res[:top_k]
    

def recall_func(train, valid_run, min_score=0.001):
    """Recall item-CF candidates for every user of a run file.

    Parameters
    ----------
    train : pandas.DataFrame
        Interaction rows with ``userId`` and ``itemId`` columns.
    valid_run : pandas.DataFrame
        Run file with ``userId`` and ``itemIds`` (comma-separated candidates).
    min_score : float, default 0.001
        Recommendations scoring at or below this value are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per user that received at least one recommendation, sorted by
        ``userId``, with columns ``userId``, ``pred_itemIds`` (all recalled
        items, best first), ``itemIds`` (run candidates as a list) and
        ``result_itemIds`` (recalled items restricted to, then padded with,
        the run candidates via ``match_func``). Users with no recommendation
        above ``min_score`` are absent from the result.

    Note: the similarity table is rebuilt from ``train`` on every call.
    """
    # Build the item-item similarity table.
    item_sim_list, user_item = item_cf(train, 'userId', 'itemId')

    # Recall per user, keeping only items above the score threshold.
    rows = []
    for user in tqdm(valid_run['userId'].unique()):
        ranked = recommend(item_sim_list, user_item, user)
        preds = [item for item, score in ranked if score > min_score]
        if preds:
            rows.append((user, preds))

    # Naming the columns in the constructor keeps this valid when `rows` is
    # empty; assigning `.columns` to an empty frame raises a length mismatch.
    recom_df = pd.DataFrame(rows, columns=['userId', 'pred_itemIds'])
    recom_df = recom_df.sort_values('userId').reset_index(drop=True)

    # Attach each user's run candidates and keep only recalled items in them.
    recom_df = recom_df.merge(valid_run, on='userId', how='left')
    recom_df['itemIds'] = [ids.split(',') for ids in recom_df['itemIds']]
    recom_df['result_itemIds'] = [
        match_func(pred, candidates)
        for pred, candidates in zip(recom_df['pred_itemIds'], recom_df['itemIds'])
    ]

    return recom_df


# Popularity back-fill; not called in the visible cells of this notebook.
def hot_fill(train, valid_run, test_run, valid_recom=None, test_recom=None):
    """Fill users that have no recalled ranking with a popularity ranking.

    Parameters
    ----------
    train : pandas.DataFrame
        Interaction rows with an ``itemId`` column; defines item popularity.
    valid_run, test_run : pandas.DataFrame
        Run files with ``userId`` and ``itemIds`` (comma-separated candidates).
    valid_recom, test_recom : pandas.DataFrame, optional
        Frames with ``userId`` and ``result_itemIds`` (lists). When omitted,
        the module-level ``valid_recom_df`` / ``test_recom_df`` are used, as
        the original implementation did.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of the run frames, one row per run row, with ``itemIds`` as a
        list, ``hot_itemIds`` (the candidates ordered by popularity, as a
        list) and ``result_itemIds`` (always a list; the popularity ranking
        where no recalled ranking exists).
    """
    if valid_recom is None:
        valid_recom = valid_recom_df
    if test_recom is None:
        test_recom = test_recom_df

    # Items by descending frequency. Reading the index avoids depending on the
    # column name produced by value_counts().reset_index(), which changed in
    # pandas 2.0.
    hot_items = train['itemId'].value_counts().index.tolist()

    def _fill(run, recom):
        """Merge the recalled rankings into `run` and back-fill the gaps."""
        # Merge only the needed column; the recall frame has its own `itemIds`
        # column, which would otherwise be suffixed and break the lookup below.
        filled = run.merge(recom[['userId', 'result_itemIds']], on='userId', how='left')
        filled['itemIds'] = [ids.split(',') for ids in filled['itemIds']]
        filled['hot_itemIds'] = [match_func(hot_items, cands) for cands in filled['itemIds']]
        # Rows without a match hold NaN after the left merge; keep lists as-is.
        filled['result_itemIds'] = [
            ranked if isinstance(ranked, list) else hot
            for ranked, hot in zip(filled['result_itemIds'], filled['hot_itemIds'])
        ]
        return filled

    return _fill(valid_run, valid_recom), _fill(test_run, test_recom)

In [4]:
def getDCG(scores):
    """Discounted cumulative gain of a relevance vector.

    ``scores`` is a 1-D NumPy array of relevance values in rank order. The gain
    at 0-based position ``p`` is ``2**score - 1`` and its discount is
    ``log2(p + 2)``. The sum is accumulated and returned as float32.
    """
    gains = np.power(2, scores) - 1
    positions = np.arange(scores.shape[0], dtype=np.float32)
    discounts = np.log2(positions + 2)
    return np.sum(np.divide(gains, discounts), dtype=np.float32)

def getNDCG(rank_list, pos_items):
    """Normalised DCG of a ranked list against binary relevance.

    Parameters
    ----------
    rank_list : sequence
        Ranked items, best first (callers pass the top-10 slice).
    pos_items : iterable
        Relevant items; every one has relevance 1. Items must be hashable.

    Returns
    -------
    float
        ``DCG / IDCG`` as a Python float, or ``0.0`` when no relevant item is
        ranked. IDCG is the DCG of the best achievable ranking, i.e.
        ``min(len(positives), len(rank_list))`` hits at the top. With a single
        relevant item IDCG is 1, matching the previous hard-coded normaliser.
    """
    positives = set(pos_items)
    # Build float relevance directly. np.ones_like on an array of string ids
    # yields the string '1', which only worked through an implicit cast.
    rank_scores = np.asarray(
        [1.0 if it in positives else 0.0 for it in rank_list], dtype=np.float32)

    dcg = getDCG(rank_scores)
    if dcg == 0.0:
        return 0.0

    # dcg > 0 implies at least one hit, so ideal_hits >= 1 and idcg > 0.
    ideal_hits = min(len(positives), len(rank_scores))
    idcg = getDCG(np.ones(ideal_hits, dtype=np.float32))

    return float(dcg / idcg)

In [5]:
# Root folder for the submission files; the save cells write into its
# t1/ and t2/ sub-folders.
run_dir = './baseline_outputs/sample_run/'

# to_csv does not create missing folders, so create them up front to keep a
# fresh "Restart & Run All" working.
for _market in ('t1', 't2'):
    os.makedirs(os.path.join(run_dir, _market), exist_ok=True)

In [6]:
## t1 results: recall candidates, then score the validation split
print('valid_recom_df......')
valid_recom_df = recall_func(train_t1, valid_run_t1)  # t1 training data, validation candidates to rank
print('test_recom_df......')
test_recom_df = recall_func(train_t1, test_run_t1)    # t1 training data, test candidates to rank

valid_qrel = valid_qrel_t1

# Attach the validation ground truth to each recalled user.
valid_recom_df = valid_recom_df.merge(valid_qrel, on='userId', how='left')

# NDCG on the first 10 ranked items of every recalled user. The total is
# divided by the number of rows in the run file, so run users that are missing
# from valid_recom_df count as 0.
ndcg_total = sum(
    getNDCG(ranked_items[:10], [true_item])
    for ranked_items, true_item in zip(valid_recom_df['result_itemIds'], valid_recom_df['itemId'])
)
NDCG = ndcg_total/len(valid_run_t1)
print('t1 NDCG : ', NDCG)

valid_recom_df......


100%|██████████| 9742/9742 [00:00<00:00, 23236.82it/s]
100%|██████████| 3429/3429 [00:00<00:00, 32901.58it/s]
100%|██████████| 2697/2697 [00:17<00:00, 154.51it/s]


test_recom_df......


100%|██████████| 9742/9742 [00:00<00:00, 14093.75it/s]
100%|██████████| 3429/3429 [00:00<00:00, 16887.02it/s]
100%|██████████| 2697/2697 [00:17<00:00, 158.33it/s]


t1 NDCG :  0.5832278451567718


In [7]:
# Save the t1 rankings as TSV files: validation split first, then test split.
for recom_df, file_name in (
    (valid_recom_df, 't1/valid_pred.tsv'),
    (test_recom_df, 't1/test_pred.tsv'),
):
    # Flatten the per-user ranked lists into one (userId, itemId) row per item.
    userId = []
    itemId = []
    for uid, ranked_items in zip(recom_df['userId'], recom_df['result_itemIds']):
        userId.extend([uid] * len(ranked_items))
        itemId.extend(ranked_items)

    # Scores count down over the whole file, so within a user an earlier item
    # always has the higher score.
    df = pd.DataFrame({
        'userId': userId,
        'itemId': itemId,
        'score': list(range(len(itemId), 0, -1)),
    })
    df.to_csv(run_dir + file_name, sep='\t', index=False)

In [8]:
## t2 results: recall candidates, then score the validation split
print('valid_recom_df......')
valid_recom_df = recall_func(train_t2, valid_run_t2)
print('test_recom_df......')
test_recom_df = recall_func(train_t2, test_run_t2)

valid_qrel = valid_qrel_t2

# Attach the validation ground truth to each recalled user.
valid_recom_df = valid_recom_df.merge(valid_qrel, on='userId', how='left')

# NDCG on the first 10 ranked items of every recalled user. The total is
# divided by the number of rows in the run file, so run users that are missing
# from valid_recom_df count as 0.
ndcg_total = sum(
    getNDCG(ranked_items[:10], [true_item])    # only the first 10 items are scored
    for ranked_items, true_item in zip(valid_recom_df['result_itemIds'], valid_recom_df['itemId'])
)
NDCG = ndcg_total/len(valid_run_t2)
print('t2 NDCG : ', NDCG)

valid_recom_df......


100%|██████████| 18242/18242 [00:01<00:00, 12492.80it/s]
100%|██████████| 8834/8834 [00:00<00:00, 18689.40it/s]
100%|██████████| 5482/5482 [01:00<00:00, 91.11it/s] 


test_recom_df......


100%|██████████| 18242/18242 [00:01<00:00, 11709.49it/s]
100%|██████████| 8834/8834 [00:00<00:00, 13168.67it/s]
100%|██████████| 5482/5482 [01:04<00:00, 84.61it/s] 


t2 NDCG :  0.5077369139712903


In [9]:
# Save the t2 rankings as TSV files: validation split first, then test split.
# Each entry is (frame, output file, score for 0-based row i out of n rows).
# NOTE(review): the test file uses a rescaled score in (2, 3] while the other
# prediction files use a plain countdown; both decrease with the row number, so
# the order within a user is the same. Confirm the difference is intended.
t2_outputs = (
    (valid_recom_df, 't2/valid_pred.tsv', lambda i, n: n - i),
    (test_recom_df, 't2/test_pred.tsv', lambda i, n: (n - i) / n + 2),
)
for recom_df, file_name, score_of in t2_outputs:
    # Flatten the per-user ranked lists into one (userId, itemId) row per item.
    userId = []
    itemId = []
    for uid, ranked_items in zip(recom_df['userId'], recom_df['result_itemIds']):
        userId.extend([uid] * len(ranked_items))
        itemId.extend(ranked_items)

    n_rows = len(itemId)
    df = pd.DataFrame({
        'userId': userId,
        'itemId': itemId,
        'score': [score_of(i, n_rows) for i in range(n_rows)],
    })
    df.to_csv(run_dir + file_name, sep='\t', index=False)

In [10]:
# Zip the run folder and validate the resulting archive.
# NOTE(review): the recorded output shows "updating:", i.e. zip modified an
# existing archive in place. Delete a stale sample_run.zip first if files from
# an earlier run must not survive. The path passed to the validator below is
# hard-coded rather than derived from run_dir.
! cd {run_dir} && zip -r ../sample_run.zip ./

print("*** Validating the submission Zip file ***")
# Run the validate_submission.py script to check if the file format is okay and get the performance on validation set.
! python validate_submission.py ./baseline_outputs/sample_run.zip

updating: t1/ (stored 0%)
updating: t1/test_pred.tsv (deflated 76%)
updating: t1/valid_pred.tsv (deflated 76%)
updating: t2/ (stored 0%)
updating: t2/test_pred.tsv (deflated 80%)
updating: t2/valid_pred.tsv (deflated 76%)
*** Validating the submission Zip file ***
Extracting the submission zip file
Validating the file structure of the submission
File structure validation successfully passed
Evaluating the validation set
