In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import re
import pandas as pd
# import optuna.integration.lightgbm as lgb  # 调参用
from lightgbm import LGBMClassifier
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold

import math
from collections import defaultdict
from gensim.models import Word2Vec
from tqdm import tqdm
import os
from lgb_utils import *

import joblib
import random

import warnings
warnings.filterwarnings("ignore")

tqdm.pandas(desc='pandas bar')

pd.set_option('max_columns', None)
pd.set_option('max_rows', 300)
pd.set_option('float_format', lambda x: '%.3f' % x)

tgt_market = 't2'
tgt_data_dir = f'./DATA/{tgt_market}/'

src_market = 's3'
src_data_dir = f'./DATA/{src_market}/'

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
train = pd.read_table(tgt_data_dir+'train.tsv')
train_5core = pd.read_table(tgt_data_dir+'train_5core.tsv')
# train = pd.concat([pd.read_table('./DATA/t1/train.tsv'), pd.read_table('./DATA/t2/train.tsv')], ignore_index=True)
# train_5core = pd.concat([pd.read_table('./DATA/t1/train_5core.tsv'), pd.read_table('./DATA/t2/train_5core.tsv')], ignore_index=True)

train_cross = train_5core.merge(train, on=['userId', 'itemId'], how='left')
train_5core = train_cross[train_cross['rating_y'].isnull()][['userId', 'itemId']]

train = pd.concat([train, train_5core, train_5core, train_5core, train_5core, train_5core], ignore_index=True)
print(train.shape)

# 用于进行嫁接学习的数据
train_src = pd.read_table(src_data_dir+'train.tsv')
train_5core_src = pd.read_table(src_data_dir+'train_5core.tsv')

train_cross_src = train_5core.merge(train, on=['userId', 'itemId'], how='left')
train_5core_src = train_cross[train_cross['rating_y'].isnull()][['userId', 'itemId']]

train_src = pd.concat([train_src, train_5core_src], ignore_index=True)
print(train_src.shape)

(128075, 3)
(55753, 3)


In [3]:
train.head()

Unnamed: 0,userId,itemId,rating
0,t2U1008825,P1014064,5.0
1,t2U1000444,P1011611,1.0
2,t2U1014700,P1028053,5.0
3,t2U1011417,P1018377,4.0
4,t2U1014369,P1020825,5.0


# 提取用户侧和物品侧的特征

In [4]:
# 转成列表
user_feature = train.groupby("userId").agg(item_list=("itemId", list), rating_list=("rating", list)).reset_index()
item_feature = train.groupby("itemId").agg(user_list=("userId", list), rating_list=("rating", list)).reset_index()

# 嫁接数据
user_feature_src = train_src.groupby("userId").agg(item_list=("itemId", list), rating_list=("rating", list)).reset_index()
item_feature_src = train_src.groupby("itemId").agg(user_list=("userId", list), rating_list=("rating", list)).reset_index()

# 构建训练集和测试集

## 构建验证集&测试集

In [5]:
def load_market_valid_run(valid_run_file):  # 把一行item_id分别拆到user_id中,构成<user_id,item_id>pair
    users, items = [], []
    with open(valid_run_file, 'r') as f:
        for line in f:
            linetoks = line.split('\t')
            user_id = linetoks[0]
            item_ids = linetoks[1].strip().split(',')
            for cindex, item_id in enumerate(item_ids):
                users.append(user_id)
                items.append(item_id)

    return users, items

# 提取特征
user_feature, item_feature = get_static_feat(train.copy(), user_feature.copy(), item_feature.copy())
user_embed = emb(train.copy(), 'userId', 'itemId', tgt_market, mode='agg')
item_embed = emb(train.copy(), 'userId', 'itemId', tgt_market, mode='single')

# 验证集，同时也作为训练集
user_ids, item_ids = load_market_valid_run(tgt_data_dir+'valid_run.tsv')
valid = pd.DataFrame(columns=['userId','itemId'])
valid['userId'] = user_ids
valid['itemId'] = item_ids
# 合并统计特征
valid = valid.merge(user_feature, on='userId',how='left')
valid = valid.merge(item_feature, on='itemId',how='left')
# 合并w2v特征
valid = valid.merge(user_embed, on='userId',how='left')
valid = valid.merge(item_embed, on='itemId',how='left')

# 测试集
user_ids, item_ids = load_market_valid_run(tgt_data_dir+'test_run.tsv')
test = pd.DataFrame(columns=['userId','itemId'])
test['userId'] = user_ids
test['itemId'] = item_ids
# 合并统计特征
test = test.merge(user_feature, on='userId',how='left')
test = test.merge(item_feature, on='itemId',how='left')
# 合并w2v特征
test = test.merge(user_embed, on='userId',how='left')
test = test.merge(item_embed, on='itemId',how='left')
# 合并item_cf特征
valid_cf = get_sim_feature(train.copy(), valid.copy())
test_cf = get_sim_feature(train.copy(), test.copy())
valid = valid.merge(valid_cf, how='left', on=['userId','itemId'])
test = test.merge(test_cf, how='left', on=['userId','itemId'])

find w2v model
find w2v model


100%|██████████| 18504/18504 [00:00<00:00, 21418.16it/s]
100%|██████████| 8919/8919 [00:00<00:00, 29805.92it/s]
100%|██████████| 18504/18504 [00:00<00:00, 19775.15it/s]
100%|██████████| 8919/8919 [00:00<00:00, 35957.60it/s]


In [6]:
print(valid.shape)
print(test.shape)
valid

(548200, 57)
(548200, 57)


Unnamed: 0,userId,itemId,item_num,item_nuique_num,user_num_max,user_num_min,user_num_mean,user_num_std,user_nuique_num_max,user_nuique_num_min,user_nuique_num_mean,user_nuique_num_std,user_num,user_nuique_num,item_num_max,item_num_min,item_num_mean,item_num_std,item_nuique_num_max,item_nuique_num_min,item_nuique_num_mean,item_nuique_num_std,userId_itemId_emb_0,userId_itemId_emb_1,userId_itemId_emb_2,userId_itemId_emb_3,userId_itemId_emb_4,userId_itemId_emb_5,userId_itemId_emb_6,userId_itemId_emb_7,userId_itemId_emb_8,userId_itemId_emb_9,userId_itemId_emb_10,userId_itemId_emb_11,userId_itemId_emb_12,userId_itemId_emb_13,userId_itemId_emb_14,userId_itemId_emb_15,itemId_emb_0,itemId_emb_1,itemId_emb_2,itemId_emb_3,itemId_emb_4,itemId_emb_5,itemId_emb_6,itemId_emb_7,itemId_emb_8,itemId_emb_9,itemId_emb_10,itemId_emb_11,itemId_emb_12,itemId_emb_13,itemId_emb_14,itemId_emb_15,sim_mean,sim_max,sim_min
0,t2U1013768,P1015252,6,6,174,3,81.000,72.584,171,3,79.833,71.247,10.000,7.000,17.000,5.000,10.100,5.109,16.000,3.000,9.300,5.229,0.601,0.215,0.525,0.782,1.111,0.512,1.078,-0.086,0.640,-0.015,0.023,0.258,0.074,0.347,0.488,-1.863,0.096,-0.058,0.128,0.150,0.314,0.072,0.429,-0.156,0.221,0.166,-0.137,-0.097,-0.101,0.156,-0.176,-0.743,0.000,0.000,0.000
1,t2U1013768,P1003628,6,6,174,3,81.000,72.584,171,3,79.833,71.247,27.000,27.000,16.000,5.000,7.333,3.340,16.000,5.000,7.259,3.230,0.601,0.215,0.525,0.782,1.111,0.512,1.078,-0.086,0.640,-0.015,0.023,0.258,0.074,0.347,0.488,-1.863,0.294,-0.274,0.432,0.386,0.798,0.135,0.829,-0.219,0.501,0.392,-0.177,-0.270,-0.206,0.252,-0.218,-1.522,0.000,0.000,0.000
2,t2U1013768,P1002479,6,6,174,3,81.000,72.584,171,3,79.833,71.247,465.000,447.000,56.000,4.000,8.252,6.100,47.000,3.000,7.828,5.313,0.601,0.215,0.525,0.782,1.111,0.512,1.078,-0.086,0.640,-0.015,0.023,0.258,0.074,0.347,0.488,-1.863,-2.890,1.855,0.048,1.447,1.745,1.238,2.639,1.087,2.949,1.741,-1.349,2.160,-0.565,3.062,3.064,-2.202,0.003,0.006,0.000
3,t2U1013768,P1015969,6,6,174,3,81.000,72.584,171,3,79.833,71.247,20.000,20.000,27.000,5.000,7.750,4.972,27.000,4.000,7.550,5.010,0.601,0.215,0.525,0.782,1.111,0.512,1.078,-0.086,0.640,-0.015,0.023,0.258,0.074,0.347,0.488,-1.863,0.184,0.085,0.258,0.286,0.498,0.172,0.590,-0.040,0.333,0.269,-0.145,-0.031,-0.031,0.278,0.192,-0.888,0.003,0.019,0.000
4,t2U1013768,P1025474,6,6,174,3,81.000,72.584,171,3,79.833,71.247,67.000,65.000,51.000,5.000,10.388,8.395,47.000,5.000,10.030,7.891,0.601,0.215,0.525,0.782,1.111,0.512,1.078,-0.086,0.640,-0.015,0.023,0.258,0.074,0.347,0.488,-1.863,0.222,-0.012,0.752,0.572,1.206,0.306,1.627,-0.367,0.853,0.463,-0.297,-0.146,-0.048,0.270,0.141,-2.507,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548195,t2U1018491,P1024403,25,5,855,740,792.000,49.011,171,148,158.400,9.802,18.000,18.000,17.000,5.000,7.389,3.517,17.000,5.000,7.389,3.517,2.087,2.704,1.302,0.193,3.783,4.693,2.914,2.125,3.059,2.092,0.618,-1.536,2.378,3.461,-5.178,-5.401,0.189,-0.071,0.359,0.236,0.525,0.117,0.645,-0.233,0.340,0.242,-0.130,-0.111,-0.063,0.151,0.056,-1.060,0.000,0.000,0.000
548196,t2U1018491,P1008219,25,5,855,740,792.000,49.011,171,148,158.400,9.802,7.000,7.000,16.000,5.000,9.143,4.220,16.000,5.000,9.000,4.203,2.087,2.704,1.302,0.193,3.783,4.693,2.914,2.125,3.059,2.092,0.618,-1.536,2.378,3.461,-5.178,-5.401,0.089,-0.015,0.170,0.057,0.070,0.088,0.106,-0.034,0.085,-0.016,-0.102,-0.079,-0.013,-0.071,-0.079,-0.125,0.000,0.000,0.000
548197,t2U1018491,P1027219,25,5,855,740,792.000,49.011,171,148,158.400,9.802,10.000,10.000,22.000,5.000,9.500,6.060,18.000,5.000,8.900,5.131,2.087,2.704,1.302,0.193,3.783,4.693,2.914,2.125,3.059,2.092,0.618,-1.536,2.378,3.461,-5.178,-5.401,0.108,-0.053,0.266,0.121,0.292,0.087,0.355,-0.181,0.153,0.104,-0.037,-0.067,-0.034,0.032,-0.026,-0.621,0.000,0.000,0.000
548198,t2U1018491,P1002612,25,5,855,740,792.000,49.011,171,148,158.400,9.802,21.000,21.000,49.000,5.000,10.095,10.246,41.000,4.000,9.667,8.800,2.087,2.704,1.302,0.193,3.783,4.693,2.914,2.125,3.059,2.092,0.618,-1.536,2.378,3.461,-5.178,-5.401,0.228,0.011,0.309,0.312,0.548,0.101,0.600,-0.101,0.301,0.194,-0.101,-0.011,-0.034,0.167,0.067,-1.002,0.000,0.000,0.000


## 构建嫁接数据
无测试集，在tgt的验证集和训练集上预测

In [7]:
# 提取特征
user_feature_src, item_feature_src = get_static_feat(train_src.copy(), user_feature_src.copy(), item_feature_src.copy())
user_embed_src = emb(train_src.copy(), 'userId', 'itemId', src_market, mode='agg')
item_embed_src = emb(train_src.copy(), 'userId', 'itemId', src_market, mode='single')

# 验证集，同时也作为训练集
user_ids_src, item_ids_src = load_market_valid_run(src_data_dir+'valid_run.tsv')
valid_src = pd.DataFrame(columns=['userId','itemId'])
valid_src['userId'] = user_ids_src
valid_src['itemId'] = item_ids_src
# 合并统计特征
valid_src = valid_src.merge(user_feature_src, on='userId',how='left')
valid_src = valid_src.merge(item_feature_src, on='itemId',how='left')
# 合并w2v特征
valid_src = valid_src.merge(user_embed_src, on='userId',how='left')
valid_src = valid_src.merge(item_embed_src, on='itemId',how='left')
# 合并item_cf特征
valid_cf = get_sim_feature(train_src.copy(), valid_src.copy())
valid_src = valid_src.merge(valid_cf, how='left', on=['userId','itemId'])

# 将嫁接学习的valid的label merge上去
valid_qrel_src = pd.read_table(src_data_dir+'valid_qrel.tsv')
valid_src = valid_src.merge(valid_qrel_src, how='left', on=['userId','itemId']).rename({'rating':'label'}, axis=1)
valid_src['label'] = valid_src['label'].fillna(0)
print(valid_src.shape)

find w2v model
find w2v model


100%|██████████| 8925/8925 [00:00<00:00, 29404.68it/s]
100%|██████████| 2882/2882 [00:00<00:00, 37967.40it/s]


(332800, 58)


# 训练&预测

In [8]:
train = valid_src
test = pd.concat([valid, test], ignore_index=True)
y = train['label']
folds = KFold(n_splits=3, shuffle=True, random_state=546789)
s_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=546789)
oof_preds, test_preds, importances = train_model_lgb(train, test, y, s_folds)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[81]	training's auc: 0.966919	training's binary_logloss: 0.0190923	valid_1's auc: 0.93925	valid_1's binary_logloss: 0.021312
Fold  1 AUC : 0.939250
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[79]	training's auc: 0.970476	training's binary_logloss: 0.0186466	valid_1's auc: 0.93162	valid_1's binary_logloss: 0.021888
Fold  2 AUC : 0.931620
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	training's auc: 0.975045	training's binary_logloss: 0.0181033	valid_1's auc: 0.934831	valid_1's binary_logloss: 0.0213758
Fold  3 AUC : 0.934831
=====Full AUC score 0.935101=====


In [9]:
test_preds.rename({'score':'transfer_score'}, axis=1).to_csv(f'./transfer_data/{src_market}_{tgt_market}_transfer.csv', index=False)