## 排序模型 

In [1]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

## 读取特征

In [2]:
# Feature files are read from and results written to the same directory.
data_path = save_path = './data/'
# When True the validation feature file is loaded as well; when False the
# validation frame is left as None.
offline = False

In [3]:
def _load_user_item_feats(file_name):
    """Read one user-item feature CSV located under `save_path`.

    When the data is re-read from CSV, `click_article_id` comes back as a
    float column, so it is cast to int before the frame is returned.

    Parameters
    ----------
    file_name : str
        File name relative to `save_path`.

    Returns
    -------
    pandas.DataFrame
        The loaded features with an integer `click_article_id` column.
    """
    feats_df = pd.read_csv(save_path + file_name)
    feats_df['click_article_id'] = feats_df['click_article_id'].astype(int)
    return feats_df


trn_user_item_feats_df = _load_user_item_feats('trn_user_item_feats_df.csv')

# The validation features are only loaded for offline runs; otherwise None.
val_user_item_feats_df = (
    _load_user_item_feats('val_user_item_feats_df.csv') if offline else None
)

tst_user_item_feats_df = _load_user_item_feats('tst_user_item_feats_df.csv')

# For convenience the feature step attached a meaningless label to the test
# set as well; remove it here. errors='ignore' keeps this cell from raising
# KeyError when the CSV was written without that column.
tst_user_item_feats_df = tst_user_item_feats_df.drop(columns=['label'], errors='ignore')
    

## 评估

## LGB排序模型

In [5]:
# Data for the ranking model: separate copies of the loaded feature frames.
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()

# The validation copy is only created for offline runs.
if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()

In [16]:
# Preview the first five training rows.
# NOTE(review): the saved output of this cell shows `inf` in the `score`
# column - confirm that is expected before `score` is used as a feature.
trn_user_item_feats_df_rank_model.head(n=5)

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_country,click_region,click_referrer_type,user_time_hob1,user_time_hob2,word_hbo,category_id,created_at_ts,words_count,is_cat_hab
0,0,157600,0.274769,400632000,53,0.274769,0.274769,0.274769,0.274769,0.990154,...,1,25,2,0.343715,0.992865,266.0,281,1507784459000,215,0
6946,1,207614,0.406684,22032876000,11,0.406684,0.406684,0.406684,0.406684,0.994893,...,1,25,6,0.343618,0.992721,169.0,331,1486147033000,187,0
8100,2,96187,-0.058501,5478593000,7,-0.058501,-0.058501,-0.058501,-0.058501,inf,...,1,25,2,0.343651,0.99202,210.0,209,1502698578000,212,0
18094,3,289080,0.223033,347360000,69,0.223033,0.223033,0.223033,0.223033,inf,...,1,25,2,0.343629,0.992774,196.5,418,1507834959000,257,0
26673,4,59733,0.204887,9930599000,20,0.204887,0.204887,0.204887,0.204887,inf,...,1,16,1,0.343702,0.992688,220.0,122,1498216785000,285,0


In [19]:
# Feature columns passed to the ranker, in model-input order.
lgb_cols = [
    'sim0',
    'time_diff0',
    'word_diff0',
    'sim_max',
    'sim_min',
    'sim_sum',
    'sim_mean',
    'score',
    'click_size',
    'time_diff_mean',
    'active_level',
    'click_environment',
    'click_deviceGroup',
    'click_os',
    'click_country',
    'click_region',
    'click_referrer_type',
    'user_time_hob1',
    'user_time_hob2',
    'word_hbo',
    'category_id',
    'created_at_ts',
    'words_count',
]

In [20]:
# Sort first so that all rows belonging to one user are contiguous.
trn_user_item_feats_df_rank_model.sort_values(by="user_id", inplace=True)
# Specific to ranking models: the objective is optimised within each group
# (here: one group per user), so the model needs the number of rows per user,
# listed in the same ascending user_id order as the sorted frame.
# size() counts every row of a group. The previous count()["label"] skipped
# rows whose label is NaN, in which case the group sizes would no longer add
# up to the number of training rows.
g_train = trn_user_item_feats_df_rank_model.groupby("user_id", sort=True).size().values

if offline:
    # Same preparation for the validation frame (offline runs only).
    val_user_item_feats_df_rank_model.sort_values(by="user_id", inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby("user_id", sort=True).size().values

In [23]:
# Model definition: a LightGBM ranker where only the number of parallel
# jobs is set explicitly; every other parameter is left at its default.
lgb_rank = lgb.LGBMRanker(
    n_jobs=16,
)

In [24]:
# Train the ranker on the selected feature columns; `group` carries the
# per-user row counts computed from the user_id-sorted training frame.
lgb_rank.fit(
    X=trn_user_item_feats_df_rank_model[lgb_cols],
    y=trn_user_item_feats_df_rank_model["label"],
    group=g_train,
)

LGBMRanker(n_jobs=16)

In [27]:
# Score every test row with the fitted ranker; the result is only displayed
# by this cell, it is not stored in a variable.
lgb_rank.predict(
    X=tst_user_item_feats_df_rank_model[lgb_cols],
)