In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import pickle
import multitasking
import os
import signal
import math
import numpy as np
from utils import evaluate
from random import shuffle
import warnings
import math

# Silence every Python warning for the rest of the session. Note that this
# also hides pandas warnings raised by later cells.
warnings.filterwarnings('ignore')

# Configure the `multitasking` helper: limit it to 10 concurrent tasks and
# select its 'process' engine.
multitasking.set_max_threads(10)
multitasking.set_engine('process')
# Route Ctrl-C (SIGINT) to multitasking.killall. signal.signal returns the
# previously installed handler, which is what the cell displays as output.
signal.signal(signal.SIGINT, multitasking.killall)

<function _signal.default_int_handler>

In [2]:
# Load the query table and the click log from local pickle files.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
df_query = pd.read_pickle('data/query.pkl')
df_click = pd.read_pickle('data/click.pkl')

In [3]:
# Preview the first rows of the click log to check its columns.
df_click.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,30760,1508211672520,4,1,17,1,25,2
1,0,157507,1508211702520,4,1,17,1,25,2
2,1,289197,1508211316889,4,1,17,1,25,6
3,1,63746,1508211346889,4,1,17,1,25,6
4,2,36162,1508211438695,4,3,20,1,25,2


In [4]:
def cal_item_sim(df):
    """Build an item-to-item co-occurrence similarity table from a click log.

    Every pair of distinct articles clicked by the same user contributes
    ``1 / log(1 + n)`` to the pair's raw score, where ``n`` is the number of
    clicks of that user (so very active users count less). The raw score is
    then divided by ``sqrt(cnt[i] * cnt[j])``, where ``cnt`` is the number of
    click rows per article, to damp popular articles.

    Parameters
    ----------
    df : pandas.DataFrame
        Click log with at least the columns ``user_id`` and
        ``click_article_id``. Rows are consumed in their existing order.

    Returns
    -------
    sim_dict_corr : dict
        ``{item: {related_item: similarity}}``. Every clicked item has an
        entry; it is an empty dict when the item never co-occurs with
        another one.
    user_item_dict : dict
        ``{user_id: [clicked article ids in row order]}``.
    """
    user_item_dict = (
        df.groupby('user_id')['click_article_id'].agg(list).to_dict()
    )

    item_cnt = defaultdict(int)
    sim_dict = {}

    for user, items in tqdm(user_item_dict.items()):
        # The weight depends only on the user's history length, so compute it
        # once per user instead of once per item pair.
        weight = 1 / math.log(1 + len(items))

        for item in items:
            item_cnt[item] += 1
            related = sim_dict.setdefault(item, {})

            for relate_item in items:
                if item == relate_item:
                    continue
                related[relate_item] = related.get(relate_item, 0) + weight

    # Build the normalised table as fresh dicts. A shallow copy of sim_dict
    # would share (and silently overwrite) the raw inner dicts.
    sim_dict_corr = {}
    for item, relate_items in tqdm(sim_dict.items()):
        sim_dict_corr[item] = {
            relate_item: cij / math.sqrt(item_cnt[item] * item_cnt[relate_item])
            for relate_item, cij in relate_items.items()
        }

    return sim_dict_corr, user_item_dict

In [5]:
# Compute the item similarity table and each user's click history from the
# full click log.
item_sim, user_item_dict = cal_item_sim(df_click)

100%|█████████████████████████████████████████████████████████████████████████████| 4232/4232 [00:00<00:00, 295085.77it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1869/1869 [00:00<00:00, 475821.19it/s]


In [6]:
def recall(df_query, item_sim, user_item_dict, sim_top_k=100, recall_num=50):
    """Generate ItemCF candidate articles for every row of ``df_query``.

    For each queried user, the ``sim_top_k`` most similar neighbours of every
    article in the user's history are collected, their similarities are
    summed per candidate, already-clicked articles are dropped, and the
    ``recall_num`` best candidates are kept.

    Parameters
    ----------
    df_query : pandas.DataFrame
        Exactly two columns: the user id and the target article id. A target
        of ``-1`` marks a row without a known answer.
    item_sim : dict
        ``{item: {related_item: similarity}}``.
    user_item_dict : dict
        ``{user_id: [clicked article ids]}``. Users missing from this mapping
        are skipped.
    sim_top_k : int, default 100
        Number of neighbours considered per history article.
    recall_num : int, default 50
        Maximum number of candidates returned per query row.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``article_id``, ``sim_score``, ``label``.
        ``label`` is NaN when the target is ``-1``; otherwise 1 for the
        target article and 0 for the rest. The frame is empty (with these
        columns) when no queried user has any history.
    """
    columns = ['user_id', 'article_id', 'sim_score', 'label']
    data_list = []
    # Cache of each item's best neighbours so they are sorted only once,
    # not once per user who clicked the item.
    top_neighbours = {}

    for user_id, item_id in tqdm(df_query.values):
        if user_id not in user_item_dict:
            continue

        # Walk the history from the last click backwards; the order only
        # affects tie-breaking between equally scored candidates.
        interacted_items = user_item_dict[user_id][::-1]
        seen = set(interacted_items)  # O(1) membership instead of a list scan
        rank = defaultdict(int)

        for item in interacted_items:
            if item not in top_neighbours:
                # .get tolerates history items without a similarity row.
                top_neighbours[item] = sorted(
                    item_sim.get(item, {}).items(),
                    key=lambda d: d[1], reverse=True)[:sim_top_k]

            for relate_item, wij in top_neighbours[item]:
                if relate_item not in seen:
                    rank[relate_item] += wij

        sim_items = sorted(
            rank.items(), key=lambda d: d[1], reverse=True)[:recall_num]

        df_temp = pd.DataFrame({
            'user_id': [user_id] * len(sim_items),
            'article_id': [article for article, _ in sim_items],
            'sim_score': [score for _, score in sim_items],
        })

        if item_id == -1:
            df_temp['label'] = np.nan
        else:
            df_temp['label'] = 0
            df_temp.loc[df_temp['article_id'] == item_id, 'label'] = 1

        df_temp = df_temp[columns]
        df_temp['user_id'] = df_temp['user_id'].astype('int')
        df_temp['article_id'] = df_temp['article_id'].astype('int')

        data_list.append(df_temp)

    if not data_list:
        # pd.concat raises on an empty list; return an empty result instead.
        return pd.DataFrame(columns=columns)

    df_data = pd.concat(data_list, sort=False)
    return df_data

In [7]:
# Generate candidate articles (with similarity scores and labels) for every
# query row.
df_data = recall(df_query, item_sim, user_item_dict)

100%|████████████████████████████████████████████████████████████████████████████| 90258/90258 [00:02<00:00, 37926.50it/s]


In [8]:
# Preview the recalled candidates before sorting.
df_data.head()

Unnamed: 0,user_id,article_id,sim_score,label
0,8,209122,0.152326,0
1,8,205824,0.110495,0
2,8,50644,0.098697,1
3,8,258195,0.078657,0
4,8,70677,0.066742,0


In [9]:
# Required step: sort the candidates by user, best similarity first, and
# rebuild a clean 0..n-1 index (the concatenated frame repeats per-user
# indices).
# NOTE(review): presumably `evaluate` and the submission cell rely on this
# best-first order per user -- confirm against utils.evaluate.
df_data = df_data.sort_values(['user_id', 'sim_score'], ascending=[
                              True, False]).reset_index(drop=True)

In [10]:
# Preview the candidates again after sorting.
df_data.head()

Unnamed: 0,user_id,article_id,sim_score,label
0,8,209122,0.152326,0
1,8,205824,0.110495,0
2,8,50644,0.098697,1
3,8,258195,0.078657,0
4,8,70677,0.066742,0


In [11]:
from random import sample

# Candidates of the unlabelled (test) queries, best similarity first per user.
# sort_values returns a new frame, so the filtered slice of df_data is never
# modified in place.
prediction = df_data[df_data['label'].isnull()].sort_values(
    ['user_id', 'sim_score'], ascending=[True, False])

all_articles = set(df_click['click_article_id'].values)
# random.sample needs a sequence: sampling from a set is deprecated since
# Python 3.9 and raises TypeError from 3.11 on.
article_pool = sorted(all_articles)

test_users = df_query[df_query['click_article_id'] == -1]['user_id'].unique()

# Top-5 distinct articles per user, computed in one grouped pass instead of
# filtering the whole frame once per user.
top_candidates = (
    prediction
    .drop_duplicates(['user_id', 'article_id'])
    .groupby('user_id')
    .head(5)
)
top_items = top_candidates.groupby('user_id')['article_id'].agg(list).to_dict()

lines = []
for test_user in tqdm(test_users):
    items = list(top_items.get(test_user, []))

    missing = 5 - len(items)
    if missing > 0:
        # Pad with random articles the user has not been recommended yet.
        chosen = set(items)
        fillers = [article for article in article_pool if article not in chosen]
        items += sample(fillers, missing)

    assert len(set(items)) == 5

    lines.append([test_user] + items)

df_sub = pd.DataFrame(lines, columns=['user_id', 'article_1', 'article_2',
                                      'article_3', 'article_4', 'article_5'])
df_sub = df_sub.sort_values(['user_id'])
os.makedirs('sub', exist_ok=True)
df_sub.to_csv('sub/itemcf.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:48<00:00, 1028.87it/s]


In [12]:
# Reload the query table and count the distinct users that have a known
# target article (target != -1); this count is passed to `evaluate`.
df_query = pd.read_pickle('data/query.pkl')
has_target = df_query['click_article_id'] != -1
total = df_query[has_target].user_id.nunique()

# Score only the candidate rows that carry a label.
labelled_candidates = df_data[df_data['label'].notnull()]
(hitrate_5, mrr_5,
 hitrate_10, mrr_10,
 hitrate_20, mrr_20,
 hitrate_40, mrr_40,
 hitrate_50, mrr_50) = evaluate(labelled_candidates, total)

# Display the metrics as the cell output.
hitrate_5, mrr_5, hitrate_10, mrr_10, hitrate_20, mrr_20, hitrate_40, mrr_40, hitrate_50, mrr_50

100%|█████████████████████████████████████████████████████████████████████████████████| 408/408 [00:00<00:00, 1136.26it/s]


(0.0028814148740622983,
 0.0016530875850762588,
 0.0038501664265487606,
 0.0017861874244456599,
 0.0049431168960206665,
 0.0018621919580733314,
 0.005862188881712952,
 0.001893809905259774,
 0.006160266282478017,
 0.001900411180311219)