In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import re
import pandas as pd
# import optuna.integration.lightgbm as lgb  # 调参用
from lightgbm import LGBMClassifier
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold

import math
from collections import defaultdict
from gensim.models import Word2Vec
from tqdm import tqdm
import os
from lgb_utils import *

import joblib
import random

import warnings
# Suppress every warning for the rest of the session. Note that this is a blanket
# filter, so deprecation and dtype warnings from the libraries below are hidden too.
warnings.filterwarnings("ignore")

# Register tqdm's pandas integration, labelling its progress bars 'pandas bar'.
tqdm.pandas(desc='pandas bar')

# Pandas display settings for notebook output.
# Fully qualified option names are used on purpose: abbreviated names such as
# 'max_columns' are resolved by pattern matching and raise OptionError on pandas
# versions where more than one registered option matches the pattern.
pd.set_option('display.max_columns', None)  # None = never truncate columns
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.3f' % x)  # three decimals

# Market identifiers: the target market whose data is modelled, and a source market
# (src_market is not referenced by the cells visible in this notebook).
tgt_market, src_market = 't1', 's1'
# Folder the target market's TSV files are read from.
tgt_data_dir = f'./DATA/{tgt_market}/'

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Number of copies of each 5-core-only pair appended to the interaction log.
N_5CORE_COPIES = 5

# Load the target-market interaction log and its 5-core file.
train = pd.read_table(tgt_data_dir + 'train.tsv')
train_5core = pd.read_table(tgt_data_dir + 'train_5core.tsv')

# Anti-join: keep the (userId, itemId) pairs of train_5core that do not occur in train.
# The merge indicator is used instead of testing the merged rating for NaN, so a pair
# that exists in train with a missing rating is not mistaken for an unmatched pair, and
# the check no longer depends on the '_y' suffix pandas gives the overlapping column.
pair_keys = ['userId', 'itemId']
train_cross = train_5core.merge(train, on=pair_keys, how='left', indicator=True)
train_5core = train_cross.loc[train_cross['_merge'] == 'left_only', pair_keys]

# Append the 5-core-only pairs N_5CORE_COPIES times. They carry only the key columns,
# so any other column of train (e.g. rating) is NaN for the appended rows.
train = pd.concat([train] + [train_5core] * N_5CORE_COPIES, ignore_index=True)
print(train.shape)

(128075, 3)


In [3]:
# Preview the first rows of the combined interaction log.
# NOTE(review): the saved output below lists userIds prefixed with "t2" although
# tgt_market is 't1' -- the output may come from an earlier run; re-execute to confirm.
train.head()

Unnamed: 0,userId,itemId,rating
0,t2U1008825,P1014064,5.0
1,t2U1000444,P1011611,1.0
2,t2U1014700,P1028053,5.0
3,t2U1011417,P1018377,4.0
4,t2U1014369,P1020825,5.0


# 提取用户侧和物品侧的特征

In [4]:
def _lists_by_key(interactions, key_col, partner_col, partner_name):
    """Group ``interactions`` by ``key_col`` and collect two lists per key.

    The result has one row per key with the key as a regular column, a column
    ``partner_name`` holding the list of ``partner_col`` values, and a column
    ``rating_list`` holding the list of ``rating`` values.
    """
    collected = interactions.groupby(key_col).agg(
        **{partner_name: (partner_col, list), "rating_list": ("rating", list)}
    )
    return collected.reset_index()


# Collapse the interaction log into list-valued columns.
# One row per user: the items they interacted with and the matching ratings.
user_feature = _lists_by_key(train, "userId", "itemId", "item_list")
# One row per item: the users who interacted with it and the matching ratings.
item_feature = _lists_by_key(train, "itemId", "userId", "user_list")

# 构建训练集和测试集

## 构建训练集，未交互过的样本当作负样本

## 构建验证集&测试集

In [5]:
def load_market_valid_run(valid_run_file):
    """Expand a run file into parallel lists of (user, item) candidate pairs.

    Each non-blank line is expected to hold a user id, a TAB, and a
    comma-separated list of item ids; every item on a line is paired with that
    line's user. Blank lines and empty item ids (from a trailing comma or an
    empty candidate list) are skipped.

    Args:
        valid_run_file: Path of the tab-separated run file to read.

    Returns:
        A ``(users, items)`` tuple of equal-length lists in file order, where
        ``users[i]`` and ``items[i]`` form one candidate pair.

    Raises:
        IndexError: If a non-blank line contains no TAB-separated item field.
    """
    users, items = [], []
    with open(valid_run_file, 'r') as f:
        for line in f:
            # A blank line carries no pairs; indexing its tokens would fail.
            if not line.strip():
                continue
            linetoks = line.split('\t')
            user_id = linetoks[0]
            # Filter out empty ids so no ('user', '') pair is emitted.
            item_ids = [item_id for item_id in linetoks[1].strip().split(',') if item_id]
            users.extend([user_id] * len(item_ids))
            items.extend(item_ids)

    return users, items

# ---- Feature extraction ----
# Statistical features for users and items.
# NOTE(review): user_feature / item_feature are rebound from their own previous values,
# so re-running this cell without first re-running the cell that builds the list
# features feeds already-processed frames back into get_static_feat.
user_feature, item_feature = get_static_feat(train.copy(), user_feature.copy(), item_feature.copy())
# Embedding features (w2v, per the merge comments in the original cell): one table keyed
# by userId (mode='agg') and one keyed by itemId (mode='single').
user_embed = emb(train.copy(), 'userId', 'itemId', tgt_market, mode='agg')
item_embed = emb(train.copy(), 'userId', 'itemId', tgt_market, mode='single')


def _with_side_features(pairs):
    """Left-join the user/item statistical features, then the user/item embedding
    features, onto a frame of (userId, itemId) candidate pairs."""
    joined = pairs
    for table, key in (
        (user_feature, 'userId'),
        (item_feature, 'itemId'),
        (user_embed, 'userId'),
        (item_embed, 'itemId'),
    ):
        joined = joined.merge(table, on=key, how='left')
    return joined


# Validation candidates; these rows are also used as the training set later on.
user_ids, item_ids = load_market_valid_run(tgt_data_dir+'valid_run.tsv')
valid = _with_side_features(pd.DataFrame({'userId': user_ids, 'itemId': item_ids}))

# Test candidates, built the same way.
user_ids, item_ids = load_market_valid_run(tgt_data_dir+'test_run.tsv')
test = _with_side_features(pd.DataFrame({'userId': user_ids, 'itemId': item_ids}))

# Transfer ("grafting") score features, one file per source market.
transfer_feat_s1 = pd.read_csv(f'./transfer_data/s1_{tgt_market}_transfer.csv')
transfer_feat_s2 = pd.read_csv(f'./transfer_data/s2_{tgt_market}_transfer.csv')
transfer_feat_s3 = pd.read_csv(f'./transfer_data/s3_{tgt_market}_transfer.csv')

# Only the s1 scores are joined. The s2 / s3 tables are loaded but left unmerged;
# to use them, merge transfer_feat_s2 / transfer_feat_s3 into valid and test the same way.
transfer_keys = ['userId', 'itemId']
valid = valid.merge(transfer_feat_s1, how='left', on=transfer_keys)
test = test.merge(transfer_feat_s1, how='left', on=transfer_keys)

find w2v model
find w2v model


In [6]:
# Item-CF similarity features: computed from the interaction log for each candidate
# set (validation first, then test) and left-joined back on the (userId, itemId) pair.
# NOTE(review): valid / test are rebound in place, so running this cell twice merges
# the same feature columns a second time.
cf_keys = ['userId', 'itemId']
valid_cf, test_cf = (
    get_sim_feature(train.copy(), candidates.copy()) for candidates in (valid, test)
)
valid = valid.merge(valid_cf, on=cf_keys, how='left')
test = test.merge(test_cf, on=cf_keys, how='left')

100%|██████████| 18504/18504 [00:00<00:00, 25081.13it/s]
100%|██████████| 8919/8919 [00:00<00:00, 42308.59it/s]
100%|██████████| 18504/18504 [00:00<00:00, 24914.57it/s]
100%|██████████| 8919/8919 [00:00<00:00, 41140.84it/s]


In [7]:
# Attach the validation labels: the qrel file's 'rating' column becomes 'label',
# and candidate pairs that do not appear in the qrel file get label 0.
valid_qrel = pd.read_table(tgt_data_dir+'valid_qrel.tsv')
valid = (
    valid
    .merge(valid_qrel, on=['userId', 'itemId'], how='left')
    .rename(columns={'rating': 'label'})
)
valid = valid.assign(label=valid['label'].fillna(0))
print(valid.shape)

(602317, 59)


# 训练&预测

In [8]:
# The labelled validation candidates are the training data from here on.
# Note that this rebinds `train`, which held the interaction log in earlier cells.
train = valid
y = train['label']

# Both splitters share one configuration; only the stratified splitter is passed on.
cv_settings = {'n_splits': 3, 'shuffle': True, 'random_state': 546789}
folds = KFold(**cv_settings)
s_folds = StratifiedKFold(**cv_settings)

# Returns out-of-fold predictions, test predictions and feature importances.
oof_preds, test_preds, importances = train_model_lgb(train, test, y, s_folds)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[103]	training's auc: 0.937751	training's binary_logloss: 0.0250958	valid_1's auc: 0.892234	valid_1's binary_logloss: 0.0284825
Fold  1 AUC : 0.892234
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[81]	training's auc: 0.932008	training's binary_logloss: 0.0258244	valid_1's auc: 0.890189	valid_1's binary_logloss: 0.02767
Fold  2 AUC : 0.890189
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	training's auc: 0.935478	training's binary_logloss: 0.025266	valid_1's auc: 0.891585	valid_1's binary_logloss: 0.0280631
Fold  3 AUC : 0.891585
=====Full AUC score 0.891279=====


In [9]:
# Root folder for prediction files; the per-market subfolder is appended when saving.
run_dir = './baseline_outputs/sample_run/'

In [10]:
# Order candidates per user: userId ascending, then score descending.
rank_by, rank_ascending = ['userId', 'score'], [True, False]
test_preds = test_preds.sort_values(by=rank_by, ascending=rank_ascending)
oof_preds = oof_preds.sort_values(by=rank_by, ascending=rank_ascending)

# to_csv does not create missing directories, so make sure the per-market output
# folder exists before writing (a fresh checkout would otherwise fail here).
market_out_dir = run_dir + f'{tgt_market}/'
os.makedirs(market_out_dir, exist_ok=True)

# Test predictions and out-of-fold (validation) predictions, tab-separated, no index.
test_preds.to_csv(market_out_dir + 'test_pred.tsv', sep='\t', index=False)
oof_preds.to_csv(market_out_dir + 'valid_pred.tsv', sep='\t', index=False)