# 新闻推荐
**问题描述**：赛题以新闻APP中的新闻推荐为背景，要求选手根据用户历史浏览点击新闻文章的数据信息预测用户未来点击行为，即用户的最后一次点击的新闻文章，测试集对最后一次点击行为进行了剔除。 

**评价指标:**   
$$
score(user) = \sum_{k=1}^5 \frac{s(user, k)}{k}
$$  

**赛题理解：**  

**使用方法：**
Baseline使用基于物品的协同过滤（ItemCF）方法进行文章召回与推荐。

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
import time
from tqdm import tqdm
import collections
from collections import defaultdict
import math
import pickle
import warnings
warnings.filterwarnings("ignore")


# Directory for the input data files. Paths in this notebook are built by
# plain string concatenation, so the trailing slash is required.
DATA_PATH = "./data/"
# Directory where intermediate artifacts (e.g. the item-similarity pickle)
# are written; also concatenated directly with file names.
SAVE_PATH = "./checkpoints/"

## 节约内存的一个标配函数

In [2]:
def reduce_mem(df):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns
    to float16/float32/float64, based on each column's observed min and max.
    Bounds are inclusive, so a column whose extremes equal a dtype's limits
    (e.g. -128..127) still gets that dtype. Columns that are not one of the
    listed numeric dtypes, or whose min/max is null, are left untouched.

    Note: float columns are narrowed by value range only, so converting to
    float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        The same DataFrame object, after conversion. A one-line summary of
        the memory saving is printed as a side effect.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2  # memory before, in MB

    # Candidate dtypes ordered from narrowest to widest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the values exceed the float32 range
            # (this includes infinities).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a frame that reports zero bytes.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(
        end_mem, reduction, (time.time() - starttime) / 60))
    return df

## 读取采样或全量数据

In [3]:
def get_all_click_sample(data_path, samples=10000):
    """Load the training click log restricted to a random subset of users.

    Args:
        data_path: Directory prefix (including trailing separator) that
            contains ``train_click_log.csv``.
        samples: Number of distinct users to keep. If it exceeds the number
            of users in the log, all users are kept.

    Returns:
        DataFrame with every click of the sampled users, de-duplicated on
        the first three columns of the file.
    """
    all_click = pd.read_csv(data_path + "train_click_log.csv")
    all_user_id = all_click["user_id"].unique()

    # Sample without replacement so exactly `sample_size` distinct users are
    # kept; cap the size because choice(replace=False) raises when asked for
    # more elements than the population holds.
    sample_size = min(samples, len(all_user_id))
    sample_user_id = np.random.choice(all_user_id, size=sample_size, replace=False)
    all_click = all_click[all_click["user_id"].isin(sample_user_id)]

    all_click = all_click.drop_duplicates(list(all_click.columns[:3]))

    return all_click

# Load the click data. There are two modes: for an online submission the
# test-set clicks should be merged into the full data; for offline validation
# of the model or of features, the training set alone is enough.
def get_all_click_df(data_path='./data/', offline=True):
    """Load the click log as a de-duplicated DataFrame.

    Args:
        data_path: Directory prefix (including trailing separator) holding
            ``train_click_log.csv`` and, for online mode,
            ``testA_click_log.csv``.
        offline: If True, use only the training log. If False, stack the
            training and test-A logs.

    Returns:
        DataFrame of clicks with duplicate
        (user_id, click_article_id, click_timestamp) rows removed.
    """
    trn_click = pd.read_csv(data_path + 'train_click_log.csv')
    if offline:
        all_click = trn_click
    else:
        tst_click = pd.read_csv(data_path + 'testA_click_log.csv')
        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
        # rows the same way (original indices are kept).
        all_click = pd.concat([trn_click, tst_click])

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click

In [4]:
# Load the offline (training-only) click log from the configured data
# directory instead of relying on the function's built-in default path.
all_click_df = get_all_click_df(DATA_PATH, offline=True)

## 获取用户－文章－点击时间字典


In [5]:
# 根据点击时间获取用户的点击文章序列   {user1: [(item1, time1), (item2, time2)..]...}
def get_user_item_time(click_df):
    """Build each user's click history ordered by click time.

    Args:
        click_df: DataFrame with ``user_id``, ``click_article_id`` and
            ``click_timestamp`` columns.

    Returns:
        Dict of the form
        ``{user_id: [(article_id, timestamp), ...]}`` where each list is in
        ascending timestamp order.
    """
    # Sort once globally; groupby keeps the row order inside each group.
    click_df = click_df.sort_values('click_timestamp')

    # Select the columns with a list: tuple-style selection
    # (``gb['a', 'b']``) is rejected by pandas 2.0 and later.
    grouped = click_df.groupby('user_id')[['click_article_id', 'click_timestamp']]

    user_item_time_dict = {
        user: list(zip(group['click_article_id'], group['click_timestamp']))
        for user, group in grouped
    }

    return user_item_time_dict

## 获取点击次数最多的文章（热门文章）

In [6]:
def get_item_topk_click(click_df, k):
    """Return the ids of the ``k`` most-clicked articles.

    Args:
        click_df: DataFrame with a ``click_article_id`` column.
        k: Maximum number of article ids to return.

    Returns:
        pandas Index of article ids, most clicked first.
    """
    # value_counts orders its index by descending frequency.
    click_counts = click_df['click_article_id'].value_counts()
    hottest_ids = click_counts.index
    return hottest_ids[:k]

## ItemCF的物品相似度计算
算法依据：ItemCF算法并不利用物品的内容属性计算物品之间的相似度， 主要通过分析用户的行为记录计算物品之间的相似度， 该算法认为， 物品a和物品c具有很大的相似度是因为喜欢物品a的用户大都喜欢物品c。改进之后，基于公式四进行编写  
![](./imgs/item.png)

In [7]:
def itemcf_sim(df):
    """Compute the ItemCF item-to-item similarity table and pickle it.

    For every user, each ordered pair of distinct items (i, j) in that
    user's click history contributes ``1 / log(len(history) + 1)`` to the
    raw co-occurrence weight, so users with long histories count less. Raw
    weights are then divided by ``sqrt(cnt[i] * cnt[j])``, where ``cnt`` is
    the number of click records of each item.

    Args:
        df: Click-log DataFrame accepted by ``get_user_item_time``.

    Returns:
        Nested dict ``{item_i: {item_j: similarity}}``. The same object is
        also written to ``SAVE_PATH + "itemCF_sim.pkl"``.
    """
    user_item_time_dict = get_user_item_time(df)

    i2i_sim = {}
    item_cnt = defaultdict(int)  # click records per item, starts at 0

    # Item-based collaborative filtering with a user-activity penalty.
    for item_time_list in tqdm(user_item_time_dict.values()):
        # The penalty depends only on the user's history length, so compute
        # it once per user rather than once per item pair.
        pair_weight = 1 / math.log(len(item_time_list) + 1)

        for i, _ in item_time_list:
            item_cnt[i] += 1
            related_items = i2i_sim.setdefault(i, {})

            for j, _ in item_time_list:
                if i == j:  # an item is not compared with itself
                    continue
                related_items[j] = related_items.get(j, 0) + pair_weight

    # Normalise in place. Only existing keys are reassigned, which is safe
    # while iterating. (The former shallow ``.copy()`` shared these inner
    # dicts anyway, so the result is unchanged.)
    for i, related_items in i2i_sim.items():
        for j, wij in related_items.items():
            related_items[j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])

    # Persist the table; the context manager ensures the file is closed.
    with open(SAVE_PATH + "itemCF_sim.pkl", "wb") as f:
        pickle.dump(i2i_sim, f)

    return i2i_sim

In [8]:
# Build the item-to-item similarity table from the loaded click log;
# itemcf_sim also pickles the result under SAVE_PATH.
i2i_sim = itemcf_sim(all_click_df)

100%|██████████| 200000/200000 [00:13<00:00, 15091.25it/s]


## ItemCF的文章推荐

In [9]:
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):
    """Recall candidate articles for one user with item-based CF.

    For every article in the user's history, the ``sim_item_topk`` most
    similar articles are looked up and their similarities are summed per
    candidate. Articles the user already clicked are excluded. If fewer than
    ``recall_item_num`` candidates are found, the list is padded with popular
    articles that carry negative scores, so they always rank below real
    candidates.

    Args:
        user_id: Id of the user to recommend for.
        user_item_time_dict: ``{user_id: [(article_id, timestamp), ...]}``.
        i2i_sim: ``{article_i: {article_j: similarity}}``.
        sim_item_topk: Number of nearest neighbours used per history article.
        recall_item_num: Maximum number of articles to return.
        item_topk_click: Iterable of popular article ids, most popular first,
            used for padding.

    Returns:
        List of ``(article_id, score)`` tuples sorted by descending score,
        at most ``recall_item_num`` long.

    Raises:
        KeyError: If ``user_id`` is not in ``user_item_time_dict``.
    """
    user_hist_items = user_item_time_dict[user_id]  # the user's click history
    clicked_items = {item for item, _ in user_hist_items}

    item_rank = {}
    for hist_item, _ in user_hist_items:
        # A history article with no similarity entry contributes nothing
        # rather than raising KeyError.
        neighbours = sorted(
            i2i_sim.get(hist_item, {}).items(), key=lambda x: x[1], reverse=True
        )[:sim_item_topk]
        for j, wij in neighbours:
            if j in clicked_items:  # never recommend an already-clicked article
                continue
            item_rank[j] = item_rank.get(j, 0) + wij

    # Too few candidates: pad with popular articles.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Membership must be tested on the keys. The previous check
            # against ``.items()`` never matched, so a popular article that
            # was already recalled had its score overwritten with a negative
            # filler value.
            if item in item_rank:
                continue
            # Negative, rank-ordered score keeps padding below real recalls.
            item_rank[item] = - i - 100

            if len(item_rank) == recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return item_rank

## 给每位用户根据物品的协同过滤推荐文章

In [12]:
# 统计召回
user_recall_items_dict = collections.defaultdict(dict)

# 用户物品字典
user_item_time_dict = get_user_item_time(all_click_df)

i2i_sim = pickle.load(open(SAVE_PATH + "itemCF_sim.pkl", "rb"))

# 相似文章数量
sim_item_topk = 10

# 召回文章数量
recall_item_num = 10

# 用户热度补全
item_topk_click = get_item_topk_click(all_click_df, k = 50)

# 为用户推荐商品
for user in tqdm(all_click_df["user_id"].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click)

100%|██████████| 200000/200000 [26:19<00:00, 126.66it/s]


## 转换数据到DataFrame

In [13]:
# 将字典的形式转换成df
user_item_score_list = []

for user, items in tqdm(user_recall_items_dict.items()):
    for item, score in items:
        user_item_score_list.append([user, item, score])

recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])

100%|██████████| 200000/200000 [00:04<00:00, 40093.13it/s]


In [26]:
# Order rows by user, then by ascending predicted score within each user.
recall_df = recall_df.sort_values(by=["user_id", "pred_score"])
# Bare expression: shows the frame when run as a notebook cell.
recall_df

Unnamed: 0,user_id,click_article_id,pred_score
1999999,0,39880,0.073596
1999998,0,36162,0.075088
1999997,0,41676,0.075770
1999996,0,3244,0.076712
1999995,0,225446,0.084663
...,...,...,...
4,199999,107301,0.095061
3,199999,108855,0.101321
2,199999,50864,0.103660
1,199999,286321,0.110638


In [32]:
# Rank each user's candidates by descending score (rank 1 = highest score);
# method="first" breaks ties by row order. The ranks come out as floats.
recall_df["rank"] = recall_df.groupby("user_id")["pred_score"].rank(ascending=False, method="first")

In [46]:
# Drop the raw score column, leaving user_id, click_article_id and rank.
recall_df = recall_df.drop("pred_score", axis=1)

In [47]:
# Bare expression: shows the ranked recall table when run as a notebook cell.
recall_df

Unnamed: 0,user_id,click_article_id,rank
1999999,0,39880,10.0
1999998,0,36162,9.0
1999997,0,41676,8.0
1999996,0,3244,7.0
1999995,0,225446,6.0
...,...,...,...
4,199999,107301,5.0
3,199999,108855,4.0
2,199999,50864,3.0
1,199999,286321,2.0


## 参考
[pandas内存优化](https://blog.csdn.net/weiyongle1996/article/details/78498603)