In [2]:
import pandas as pd
# NOTE(review): target_users.tsv is read with the default comma separator, unlike the
# other .tsv reads in this notebook which pass sep="\t" — presumably the file has a
# single column; confirm.
user = pd.read_csv("target_users.tsv")
# Interaction log; later cells read its user_id, item_id, latest_timestamp and
# frequency columns.
item = pd.read_csv("item_history.tsv",sep="\t")


In [3]:
user_master = pd.read_csv("user_master.tsv",sep="\t")

### item / user to idx mapping

In [4]:
# Dense integer indices for every item and user seen in the interaction log,
# numbered in order of first appearance, plus the reverse lookups.
item_ids = item["item_id"].unique()
user_ids = item["user_id"].unique()
item2idx = dict(zip(item_ids, range(len(item_ids))))
idx2item = dict(enumerate(item_ids))
user2idx = dict(zip(user_ids, range(len(user_ids))))
idx2user = dict(enumerate(user_ids))

In [5]:
item_user = set(item.user_id)
test_user = set(user.user_id.values)
len(item_user), len(test_user - item_user) 

(913707, 27078)

In [6]:
# Target users without any interaction history get indices appended after the
# users that were indexed from the interaction log.
extra_user_ids = list(test_user - item_user)
start_id = len(user_ids)
for new_idx, extra_uid in enumerate(extra_user_ids, start=start_id):
    user2idx[extra_uid] = new_idx
    idx2user[new_idx] = extra_uid

In [8]:
test_user - set(user_master.user_id)

set()

In [9]:
item.groupby("user_id")["user_id"].count().mean(),\
    item.groupby("item_id")["item_id"].count().mean()

(6.9168190678193335, 19.21990250073748)

In [10]:
item.shape,  item.user_id.max(), user.user_id.max(),

((6319946, 4), 1640955, 1640953)

### group train data by timestamp

In [7]:

# Map every distinct timestamp to its position in ascending order.
timestamps = sorted(item["latest_timestamp"].unique())
timestamp_order = dict(zip(timestamps, range(len(timestamps))))

# timestamps

In [8]:
# Every timestamp is a key of timestamp_order (it was built from this column),
# so the dictionary lookup below never produces a missing value.
item["timestamp_rank"] = item["latest_timestamp"].map(timestamp_order)
# item = item.drop(columns=["timestamp_rank"])
item["timestamp_rank"].nunique()

90

In [9]:
# Bucket five consecutive timestamp ranks into one time group.
item["time_group"] = item["timestamp_rank"] // 5
item

Unnamed: 0,user_id,item_id,latest_timestamp,frequency,timestamp_rank,time_group
0,484,I355496,1587686400,1,84,16
1,484,I301315,1587686400,1,84,16
2,815,I146224,1583798400,1,39,7
3,998,I159337,1581033600,1,7,1
4,998,I69340,1580688000,1,3,0
...,...,...,...,...,...,...
6319941,1640781,I75214,1588032000,1,88,17
6319942,1640781,I103751,1588032000,1,88,17
6319943,1640781,I66854,1587772800,1,85,17
6319944,1640781,I102695,1588032000,1,88,17


In [10]:
item.time_group.max(),item.time_group.nunique(),item.time_group.unique()

(17,
 18,
 array([16,  7,  1,  0,  3,  5, 17, 12, 13,  8, 14, 10,  9,  2,  6, 15,  4,
        11], dtype=int64))

In [11]:
item.columns

Index(['user_id', 'item_id', 'latest_timestamp', 'frequency', 'time_group'], dtype='object')

### user features

In [10]:
# Keep the master rows of every user that is either a prediction target or
# appears in the interaction log.
user_set = set(user.user_id) | set(item.user_id)
in_scope = user_master["user_id"].isin(user_set)
user_info = user_master.loc[in_scope].reset_index(drop=True)
user_info.shape

(940785, 16)

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

def multihot_encode(user_fea, target, topk = 9999):
    """Multi-hot encode a comma-separated column, keeping the most frequent tokens.

    Each cell of ``user_fea[target]`` is converted with ``str`` and split on ','.
    The resulting tokens are binarised with ``MultiLabelBinarizer`` and only the
    ``topk`` tokens with the highest column sums are kept, most frequent first.

    Returns a boolean DataFrame with a fresh default index whose columns are
    named ``f"{target}_{rank}"`` where ``rank`` is the frequency rank (0 = most
    frequent), not the token itself.
    """
    tokens_per_row = [str(cell).split(',') for cell in user_fea[target]]
    multihot = MultiLabelBinarizer().fit_transform(tokens_per_row)
    token_counts = multihot.sum(axis=0)
    # argsort is ascending: take the last `topk` positions, then flip to descending.
    top_idx = np.argsort(token_counts)[-topk:][::-1]
    encoded = pd.DataFrame(multihot[:, top_idx], dtype=bool)
    encoded.columns = [f"{target}_{rank}" for rank in range(encoded.shape[1])]
    return encoded

In [12]:
# Append the 10 most frequent tokens of feature_list1/3/5 as boolean columns.
# multihot_encode only reads the source column, so all three encodings can be
# computed first and attached with a single concat.
multihot_blocks = [
    multihot_encode(user_info, f"feature_list{list_id}", 10) for list_id in [1, 3, 5]
]
user_info = pd.concat([user_info, *multihot_blocks], axis=1)

In [13]:
# Columns are picked by position: 1..6 plus everything from position 16 onwards
# (the multi-hot columns appended to user_info).
scalar_cols = list(user_info.columns[1:7])
multihot_cols = list(user_info.columns[16:])
user_feature = user_info.set_index("user_id")[scalar_cols + multihot_cols]

In [14]:
from sklearn.preprocessing import LabelEncoder
# Replace feature1..feature6 by integer codes.
for i in range(1,7):
    col = "feature"+str(i)
    le = LabelEncoder()
    # Assign the whole column instead of `.loc[:, col] = ...`: a `.loc` write goes
    # into the existing column and may keep its original (e.g. object) dtype,
    # whereas column assignment takes the integer dtype returned by the encoder.
    # user_feature rows are in the same order as user_info rows (set_index only),
    # so the positional ndarray lines up.
    user_feature[col] = le.fit_transform(user_info[col])

In [16]:
user_feature.columns,user_feature.index

(Index(['feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6',
        'feature_list1_0', 'feature_list1_1', 'feature_list1_2',
        'feature_list1_3', 'feature_list1_4', 'feature_list1_5',
        'feature_list1_6', 'feature_list1_7', 'feature_list1_8',
        'feature_list1_9', 'feature_list3_0', 'feature_list3_1',
        'feature_list3_2', 'feature_list3_3', 'feature_list3_4',
        'feature_list3_5', 'feature_list3_6', 'feature_list3_7',
        'feature_list3_8', 'feature_list3_9', 'feature_list5_0',
        'feature_list5_1', 'feature_list5_2', 'feature_list5_3',
        'feature_list5_4', 'feature_list5_5', 'feature_list5_6',
        'feature_list5_7', 'feature_list5_8', 'feature_list5_9'],
       dtype='object'),
 Index([    484,     815,     998,    1213,    1462,    1962,    2137,    2842,
           3472,    3797,
        ...
        1638755, 1638951, 1639138, 1639195, 1639321, 1639523, 1639795, 1640422,
        1640554, 1640781],
       dtype='int64'

In [26]:
user_feature.head()

Unnamed: 0_level_0,feature1,feature2,feature3,feature4,feature5,feature6,feature_list1_0,feature_list1_1,feature_list1_2,feature_list1_3,...,feature_list5_0,feature_list5_1,feature_list5_2,feature_list5_3,feature_list5_4,feature_list5_5,feature_list5_6,feature_list5_7,feature_list5_8,feature_list5_9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
484,0,34,5,1,9,7,False,False,False,False,...,True,True,True,True,True,False,True,True,False,True
815,1,12,5,1,0,2,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
998,0,0,5,1,9,3,False,False,False,False,...,True,True,True,True,True,False,True,False,False,True
1213,0,2,5,1,7,4,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1462,0,5,5,1,6,3,True,True,True,False,...,False,False,False,False,False,True,False,False,False,False


### train / val split by user_id

In [76]:
from sklearn.model_selection import GroupKFold
# User-disjoint split: only the first of the five GroupKFold folds is used.
kFold = GroupKFold(n_splits=5)
np.random.seed(42)
trn_idx, val_idx = next(kFold.split(item, item, groups=item["user_id"]))
item_trn = item.iloc[trn_idx]
item_val = item.iloc[val_idx]


### feature engineering

In [15]:
from tqdm import tqdm
def make_item_features(item_feature):
    """Aggregate per-item statistics from an interaction frame.

    ``item_feature`` must provide ``item_id``, ``user_id`` and ``timestamp_rank``.
    Returns one row per item with ``item_count`` (number of non-null user_id
    rows), ``item_max_timerank`` / ``item_min_timerank`` and ``item_encode``
    (looked up in the notebook-level ``item2idx``; unknown ids raise KeyError).
    """
    item_ret = (
        item_feature.groupby("item_id")
        .agg(
            item_count=("user_id", "count"),
            item_max_timerank=("timestamp_rank", "max"),
            item_min_timerank=("timestamp_rank", "min"),
        )
        .reset_index()
    )
    item_ret["item_encode"] = item_ret["item_id"].map(item2idx.__getitem__)
    return item_ret

def make_user_feature(user_feature,item_feature, item_trn):
    """Attach user-level columns to the rows of ``item_trn``.

    ``item_feature`` is the interaction history used for the statistics and
    ``user_feature`` is a frame keyed by ``user_id``. Added columns, in order:
    ``user_count`` (history rows per user, 0 when the user has none), the columns
    of ``user_feature`` (inner join, so rows of users missing there are dropped),
    ``user_max_timerank`` / ``user_min_timerank`` (filled with the mean over
    history users when missing) and ``user_encode`` from the notebook-level
    ``user2idx``. The number of missing values is printed before each fill.
    """
    user_stats = item_feature.groupby("user_id").agg(
        user_count=("item_id", "count"),
        user_max_timerank=("timestamp_rank", "max"),
        user_min_timerank=("timestamp_rank", "min"),
    )

    enriched = item_trn.merge(user_stats[["user_count"]], on="user_id", how="left")
    enriched = enriched.merge(user_feature, on="user_id")
    print("add user count", enriched.user_count.isna().sum())
    enriched = enriched.fillna({"user_count": 0})

    time_cols = ["user_max_timerank", "user_min_timerank"]
    enriched = enriched.merge(user_stats[time_cols], on="user_id", how="left")
    print("add user time", enriched.user_max_timerank.isna().sum())
    enriched = enriched.fillna({
        "user_max_timerank": user_stats["user_max_timerank"].mean(),
        "user_min_timerank": user_stats["user_min_timerank"].mean(),
    })

    enriched["user_encode"] = enriched["user_id"].map(user2idx.__getitem__)
    return enriched

# cross features
def make_user_feature_item(user_feature, item_feature):
    """Count interactions per (user-feature value, item) pair.

    Joins the ``user_id`` / ``item_id`` pairs of ``item_feature`` with
    ``user_feature`` and, for feature1..feature6 and for the ten columns of each
    of feature_list1/3/5, counts rows grouped by (feature column, item_id).

    Returns a dict keyed by feature column name; each value is a Series named
    ``f"{column}_item"`` indexed by (feature value, item_id).
    """
    item_user_feature = item_feature[["user_id","item_id"]].merge(user_feature,on=["user_id"])

    def _count_by(feature_name):
        # Rows per (feature value, item), named after the feature column.
        pair_counts = item_user_feature.groupby([feature_name, "item_id"])["user_id"].count()
        return pair_counts.rename(f"{feature_name}_item")

    feature_item_grouped = {}
    for feature_id in tqdm(range(1,7)):
        scalar_name = f"feature{feature_id}"
        feature_item_grouped[scalar_name] = _count_by(scalar_name)

    multihot_names = [
        f"feature_list{list_id}_{slot}" for list_id in [1,3,5] for slot in range(10)
    ]
    for multihot_name in multihot_names:
        feature_item_grouped[multihot_name] = _count_by(multihot_name)
    return feature_item_grouped


In [67]:
item_counts = item.item_id.value_counts()
np.sum(item_counts[:5000]), np.sum(item_counts[:10000]), np.sum(item_counts[:30000]), \
    np.sum(item_counts[:300000]), item.shape


(4017008, 4548487, 5276350, 6291123, (6319946, 6))

In [125]:
from BPR_lib import reduce_mem
# Build one labelled sample frame per (target time group, split).
# For each target group `ii`: positives are the interactions of group `ii` whose
# item is among the 30000 most frequent items of that group; features come from
# the 10 preceding groups; 10 random negatives are drawn per positive.
time_top_candidates = dict()
time_trn_data = dict()
time_val_data = dict()
time_all_data = dict()
for ii in [13,14,15,16,17]:
    print("ii", ii)
    for split in ["all","val"]:#"train",
        item_split = item
        if split=="train": item_split = item_trn
        elif split=="val": item_split = item_val

        # History window used for all aggregated features: groups [ii-10, ii).
        hist_item = (item_split[item_split.time_group>=ii-10])
        hist_item = hist_item[hist_item.time_group<ii]
        # Fix: `top_candidates_df` was filtered before ever being assigned (its
        # definition had been commented out), which raises NameError on a fresh
        # kernel. Define it directly as the interactions of the target group.
        top_candidates_df = item_split[item_split.time_group==ii]
        top_candidates = top_candidates_df.item_id.value_counts().index[:30000]

        # Explicit copy so that adding `label` does not write into a slice of
        # `item_split`.
        df_gt = top_candidates_df[top_candidates_df.item_id.isin(top_candidates)].copy()
        df_gt["label"] = 1

        df_gt = make_user_feature(user_feature,hist_item,df_gt)
        item_fea = make_item_features(hist_item)
        time_user_fea_item = make_user_feature_item(user_feature,hist_item)

        neg_samples = 10
        neg_df_list = []

        # Negatives reuse the positive rows (same users / user features) with a
        # uniformly drawn candidate item. Draws are not checked against the
        # user's true items, so a negative can coincide with a positive.
        for i in range(neg_samples):
            neg_df = df_gt.copy()
            neg_df["label"] = 0
            neg_df["item_id"] = np.random.choice(top_candidates,neg_df.shape[0])
            neg_df_list.append(neg_df)
        concat_data = pd.concat([df_gt,*neg_df_list])

        # Item features are merged after negative sampling so that negatives get
        # the statistics of their sampled item.
        concat_data = concat_data.merge(item_fea, on="item_id",how="left")
        print("merge item feature", concat_data.item_count.isna().sum(), concat_data.shape)
        concat_data = concat_data.fillna({"item_count":0,"item_min_timerank":\
            item_fea.item_min_timerank.mean(), "item_max_timerank": item_fea.item_max_timerank.mean()})
        # Overwrite item_encode so items absent from the history window get one too.
        concat_data["item_encode"] = concat_data["item_id"].apply(lambda x:item2idx[x])

        concat_data = concat_data.drop(columns = ["timestamp_rank","frequency","latest_timestamp","time_group"])
        concat_data["time_group"] = ii

        # Cross features: history count of (feature value, item) pairs.
        for feature_id in range(1,7):
            feature_name = "feature"+str(feature_id)
            concat_data = concat_data.merge(time_user_fea_item[feature_name],\
                on=[feature_name,"item_id"],how="left")
            concat_data = concat_data.fillna({f"{feature_name}_item":0})
            # Casts the feature code column itself (not the *_item count).
            concat_data[feature_name] = concat_data[feature_name].astype(int)

        for feature_id in [1,3,5]:
            feature_type = "feature_list"+str(feature_id)
            for feature_id_i in range(10):
                feature_name = feature_type +"_"+ str(feature_id_i)
                concat_data = concat_data.merge(time_user_fea_item[feature_name],\
                        on=[feature_name,"item_id"],how="left")
                concat_data = concat_data.fillna({f"{feature_name}_item":0})
                concat_data[f"{feature_name}_item"] = concat_data[f"{feature_name}_item"].astype(np.float32)
        if split == "train": time_trn_data[ii] = concat_data
        elif split=="val": time_val_data[ii] = concat_data
        else: time_all_data[ii] = concat_data


ii 13
add user count 158499
add user time 158499


100%|██████████| 6/6 [00:09<00:00,  1.57s/it]


merge item feature 83902 (3816615, 51)


In [55]:
import pickle
# Reload the cached sample frames for time groups 11..17 from disk.
# Presumably written earlier with
#   time_all_data[ii].to_pickle(f"xgb_data/all_80_sample10_top5w_{ii}.pkl")
# NOTE(review): read_pickle executes pickled code — only load trusted files.
time_trn_data = dict()
time_all_data = {
    group: pd.read_pickle(f"xgb_data/all_80_sample10_top5w_{group}.pkl")
    for group in range(11, 18)
}

### LR & XGB classification

In [80]:
# Training frame: concatenation of the cached samples of time groups 11, 12 and 16.
# NOTE(review): group 16 is also used as val_data_time on the next line, so the
# "time" validation rows are part of the training set and metrics computed on them
# are not a held-out estimate — confirm whether this is intentional (e.g. a final
# fit on all data).
train_data = pd.concat([time_all_data[ii] for ii in [11,12,16]])
val_data_time = time_all_data[16]
# val_data_user = pd.concat([time_val_data[ii] for ii in list(range(13,17))])


In [262]:
train_data.shape,train_data.label.sum()

((12115983, 84), 1101453)

In [1]:
# train_data.shape, val_data_time.shape, val_data_user.shape,time_trn_data[13].shape
#     time_trn_item[13].shape,train_data.label.sum(),val_data_user.label.sum()

In [1009]:
len(set(val_data_user.item_id).intersection(train_data.item_id)),\
    val_data_user.item_id.nunique()

(57210, 77277)

In [1380]:
train_data.item_id.nunique(),time_trn_data[13].item_id.nunique(),\
    val_data_time.item_id.nunique(),val_data_user.item_id.nunique()

(36658, 30000, 30000, 41098)

In [None]:
val_data_user.time_group.unique(), train_data.time_group.unique()

In [1336]:
val_data_user.item_count.mean(), train_data.item_count.mean()

(46.220843257699215, 187.6863987898326)

In [1404]:
# val_data_user.user_max_timerank.mean(),train_data.user_max_timerank.mean(),\
#     val_data_time.user_max_timerank.mean()

(895689477.7144561, 895691904.8289988, 933698216.3643284)

In [1094]:
val_data_time.user_id.nunique(), train_data.user_id.nunique(),\
len(set(val_data_time.user_id).intersection(train_data.user_id))

(114977, 68736)

In [2]:
# val_data_user.frequency.value_counts()[:5], train_data.frequency.value_counts()[:5]

In [81]:
# trn_cols = [x for x in time_all_data[13].columns if x not in ["label","user_id","item_id"]]
# Model input columns: everything in train_data except identifiers, the label,
# raw time / frequency columns and every multi-hot ("feature_list*") column
# together with its "*_item" cross feature. Column order follows train_data.
excluded_cols = {
    "label", "user_id", "item_id",
    "timestamp_rank", "latest_timestamp",
    "frequency",
}
trn_cols = [
    col for col in train_data.columns
    if col not in excluded_cols
    and "feature_list" not in col
    and not ("list" in col and "item" in col)
]
trn_cols,len(trn_cols)



(['user_count',
  'feature1',
  'feature2',
  'feature3',
  'feature4',
  'feature5',
  'feature6',
  'user_max_timerank',
  'user_min_timerank',
  'user_encode',
  'item_count',
  'item_max_timerank',
  'item_min_timerank',
  'item_encode',
  'time_group',
  'feature1_item',
  'feature2_item',
  'feature3_item',
  'feature4_item',
  'feature5_item',
  'feature6_item'],
 21)

In [51]:
# Print dtype and, when present, the number of missing values of every model column.
check_data = train_data
for col_t in trn_cols:
    column = check_data[col_t]
    report = [col_t, "\t\t\t", str(column.dtype)]
    na_count = int(column.isna().sum())
    if na_count:
        report += ["\t has na", "\t", str(na_count)]
    print("".join(report))

user_count			float64
feature1			int32
feature2			int32
feature3			int32
feature4			int32
feature5			int32
feature6			int32
feature_list1_0			bool
feature_list1_1			bool
feature_list1_2			bool
feature_list1_3			bool
feature_list1_4			bool
feature_list1_5			bool
feature_list1_6			bool
feature_list1_7			bool
feature_list1_8			bool
feature_list1_9			bool
feature_list3_0			bool
feature_list3_1			bool
feature_list3_2			bool
feature_list3_3			bool
feature_list3_4			bool
feature_list3_5			bool
feature_list3_6			bool
feature_list3_7			bool
feature_list3_8			bool
feature_list3_9			bool
feature_list5_0			bool
feature_list5_1			bool
feature_list5_2			bool
feature_list5_3			bool
feature_list5_4			bool
feature_list5_5			bool
feature_list5_6			bool
feature_list5_7			bool
feature_list5_8			bool
feature_list5_9			bool
user_encode			int64
item_count			float64
item_encode			int64
feature1_item			float64
feature2_item			float64
feature3_item			float64
feature4_item			float64
feature5_item			float64
featur

In [768]:
check_data[check_data.label==1].item_encode.isna().sum()

0

In [90]:
train_data.time_group.unique(), val_data_time.time_group.unique()
# val_data.time_group.unique(), val_data.shape,train_data.time_group.unique()

(array([11, 12, 16], dtype=int64), array([16], dtype=int64))

In [91]:
import xgboost as xgb

# train_data_shuffled = train_data#.sample(frac=1)
# train_data = train_data[trn_cols + ["label"]] 
# train_data = train_data.sample(frac=1)
# Feature matrix / binary target taken from train_data; rows are used in their
# stored order (the shuffling experiments above are commented out).
X_trn = train_data[trn_cols]#.dropna(how='any',axis=0)
# X_trn = X_trn[:,:50]
y_trn = train_data["label"]
# X_trn = X_trn.drop(columns=["label"])
# print(X_trn.shape, len(train_data))#,len(X_val_time), len(X_val_user))#,len(val_group_user),len(X_val_user)

# Default XGBoost hyper-parameters; only the device is set (requires a CUDA GPU).
# NOTE(review): no random_state is passed — confirm whether runs need to be reproducible.
model = xgb.XGBClassifier(device="cuda")#device="gpu")#tree_method="gpu_hist", device="cuda")
model.fit(X_trn,y_trn)


In [63]:
train_data.shape,X_trn.shape,y_trn.sum()

((12115983, 84), (12115983, 21), 1101453)

In [20]:
import pickle
# pickle.dump(model,open("xgb_sample4_all_feature_1.pkl",'wb'))

# pickle.dump(model,open("xgb_model/xgb_sample10_51fea_all_data_top5w.pkl",'wb'))
# model = pickle.load(open("xgb_sample8_50_feature_all_data.pkl",'rb'))
# model = pickle.load(open("xgb_sample8_80_feature_trn_data.pkl",'rb'))
# model = pickle.load(open("xgb_sample10_50_feature_trn_data.pkl",'rb'))
# model = pickle.load(open("xgb_model/xgb_sample10_51fea_all_data_top5w.pkl",'rb')
model = pickle.load(open("xgb_model/xgb_all_sample8_16fea_all_data_top5w.pkl",'rb'))

In [92]:
# Positive-class probabilities for the training rows and the time-based validation rows.
# `trn_preds` and `seg` belong to the chunked prediction variant that is commented
# out below; in the active code path trn_preds stays empty and seg is unused.
trn_preds = []
seg = len(X_trn)//3
import cupy as cp
# trn_preds.append(model.predict_proba(cp.array(X_trn[:seg].astype(np.float16)))[:,1])
# trn_preds.append(model.predict_proba(cp.array(X_trn[seg:seg*2].astype(np.float16)))[:,1])
# trn_preds.append(model.predict_proba(cp.array(X_trn[seg*2:].astype(np.float16)))[:,1])
# trn_preds.append(model.predict_proba(X_trn[:seg])[:,1])
# trn_preds.append(model.predict_proba(X_trn[seg:seg*2])[:,1])
# trn_preds.append(model.predict_proba(X_trn[seg*2:])[:,1])
# trn_pred = np.concatenate(trn_preds)
# X_trn = train_data[trn_cols]#.dropna(how='any',axis=0)
# y_trn = train_data["label"]
# Training predictions are computed from the pandas frame in one call.
trn_pred = model.predict_proba(X_trn)[:,1]

X_val_time = val_data_time[trn_cols]#.dropna(how='any',axis=0)
y_val_time = val_data_time["label"]
# Validation features are cast to float32 and passed as a CuPy array, unlike the
# training call above which passes the DataFrame directly.
val_pred_time = model.predict_proba(cp.array(X_val_time.astype(np.float32)))[:,1]
# val_pred_user = model.predict_proba(X_val_user)[:,1]

In [59]:
len(y_trn), len(trn_preds[0]), len(trn_preds[1]), len(trn_preds[2]),len(trn_pred)

(3455903, 1151967, 1151967, 1151969, 3455903)

In [20]:
sum(y_trn),y_trn.shape

(1331654, (5326616,))

In [93]:
import numpy as np
# Select the top predictions so that the number of predicted 1s equals the number
# of positive labels.
def _binarize_top_k(scores, n_positive):
    """Return a 0/1 float array that marks the ``n_positive`` highest scores.

    ``n_positive == 0`` yields all zeros (a plain ``[-0:]`` slice would select
    every element instead).
    """
    scores = np.asarray(scores)
    flags = np.zeros(scores.shape)
    n_positive = int(n_positive)
    if n_positive > 0:
        flags[np.argpartition(scores, -n_positive)[-n_positive:]] = 1
    return flags


# NOTE: this overwrites the probability arrays with their binarised version, as
# the metric cells expect; re-run the prediction cell before re-running this one.
trn_pred = _binarize_top_k(trn_pred, y_trn.sum())
val_pred_time = _binarize_top_k(val_pred_time, y_val_time.sum())

# The user-split validation predictions are only produced when the corresponding
# (currently commented-out) cells are enabled; skip them instead of failing
# with NameError.
if "val_pred_user" in globals() and "y_val_user" in globals():
    val_pred_user = _binarize_top_k(val_pred_user, y_val_user.sum())


In [44]:
trn_pred.shape, len(X_trn), val_pred_time.shape, len(y_val_time)

((43944582,), 14648194, (3364368,), 3364368)

In [120]:
print(X_trn.shape,X_val_time.shape, X_val_user.shape)
print(y_trn.sum(),y_val_time.sum(), y_val_user.sum())
print(trn_pred.sum(),val_pred_time.sum(),val_pred_user.sum())

NameError: name 'X_val_time' is not defined

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,recall_score

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is symmetric,
# but recall is not: with the arguments swapped it reports precision.
print(accuracy_score(y_trn, trn_pred))#,sample_weight=[0,1]))
# print(accuracy_score(y_val_user, val_pred_user,sample_weight=[0,1]))
print(accuracy_score(y_val_time, val_pred_time))#,sample_weight=[0,1]))
print(recall_score(y_trn, trn_pred))
# print(recall_score(y_val_user, val_pred_user))
print(recall_score(y_val_time, val_pred_time))


In [40]:
# Confusion counts for the user-split validation set (binarised predictions).
labels = np.asarray(y_val_user)
preds = np.asarray(val_pred_user)
assert(len(labels)==len(preds))
# Vectorised counts; Python's builtin sum over millions of elements is very slow.
TP = int(np.sum((labels == 1) & (preds == 1)))
TP_FN = int(labels.sum())
TP_FP = int(preds.sum())
print("TP", TP, "FN", TP_FN-TP,"FP",TP_FP-TP,"total",len(preds))
# TP / (TP + FP) is precision, not accuracy.
print("precision", TP/TP_FP if TP_FP else float("nan"),
      "recall", TP/TP_FN if TP_FN else float("nan"))

TP 82396 FN 218148 FP 218148.0 total 3535506
accuracy 0.27415619676320274 recall 0.27415619676320274


In [94]:
# Confusion counts for the time-based validation set (binarised predictions).
labels = np.asarray(y_val_time)
preds = np.asarray(val_pred_time)
assert(len(labels)==len(preds))
# Vectorised counts; Python's builtin sum over millions of elements is very slow.
TP = int(np.sum((labels == 1) & (preds == 1)))
TP_FN = int(labels.sum())
TP_FP = int(preds.sum())
print("TP", TP, "FN", TP_FN-TP,"FP",TP_FP-TP,"total",len(preds))
# TP / (TP + FP) is precision, not accuracy.
print("precision", TP/TP_FP if TP_FP else float("nan"),
      "recall", TP/TP_FN if TP_FN else float("nan"))

TP 249012 FN 140037 FP 140037.0 total 4279539
accuracy 0.6400530524432655 recall 0.6400530524432655


In [95]:
# Confusion counts for the training set (binarised predictions).
labels = np.asarray(y_trn)
preds = np.asarray(trn_pred)
assert(len(labels)==len(preds))
# Vectorised counts; Python's builtin sum over millions of elements is very slow.
TP = int(np.sum((labels == 1) & (preds == 1)))
TP_FN = int(labels.sum())
TP_FP = int(preds.sum())
print("TP", TP, "FN", TP_FN-TP,"FP",TP_FP-TP,"total",len(preds))
# TP / (TP + FP) is precision, not accuracy.
print("precision", TP/TP_FP if TP_FP else float("nan"),
      "recall", TP/TP_FN if TP_FN else float("nan"), "total", len(preds))

TP 666091 FN 373623 FP 373623.0 total 11436854
accuracy 0.6406482936653733 recall 0.6406482936653733 total 11436854


### construct test set and recall

In [104]:
# test_candidates = list(item_hist_test.item_id.value_counts()[:30000].index)
# Candidate items: the 50000 most frequent items of the target time group.
test_group = 15
test_candidates = item[item.time_group == test_group]\
    .item_id.value_counts().index[:50000]
# test_candidates = item.item_id.value_counts().index[:50000]

test_df = pd.DataFrame({"item_id":test_candidates})
test_df["time_group"] = test_group

# Feature history: the 10 time groups preceding the target group.
item_hist_test = item[item.time_group>=test_group-10]
item_hist_test = item_hist_test[item_hist_test.time_group<test_group]
# item_hist_test = item

item_fea_test = make_item_features(item_hist_test)
test_df = test_df.merge(item_fea_test,on="item_id",how="left")
# Candidates without history get NaN from the left merge. Fill them exactly as the
# training samples are filled (count 0, mean min/max time rank) and give every
# candidate its item_encode, so inference sees the same feature conventions as
# training instead of missing values.
test_df = test_df.fillna({
    "item_count": 0,
    "item_min_timerank": item_fea_test.item_min_timerank.mean(),
    "item_max_timerank": item_fea_test.item_max_timerank.mean(),
})
test_df["item_encode"] = test_df["item_id"].apply(lambda x: item2idx[x])

# Users to score: everyone with an interaction in the target group.
# user_id_test = list(test_user)
user_id_test = list(set(item[item.time_group==test_group].user_id))#.intersection(user_id_test))
# user_id_test = list(test_user-item_user)
# user_id_test = list(set(item.user_id).intersection(user_id_test))
print("test user for this group",len(user_id_test))
np.random.shuffle(user_id_test)

test_user_feature = pd.DataFrame({"user_id":user_id_test})

test_user_feature = make_user_feature(user_feature,item_hist_test, test_user_feature)
test_user_feature = test_user_feature.set_index("user_id")

test user for this group 156510
add user count 44308
add user time 44308


In [105]:
from tqdm import tqdm
# For feature1..feature6, pre-compute one candidate-aligned table per feature value:
# feature_item_dict[feature][value] has one row per test candidate (in candidate
# order) with the history count of that (value, item) pair, 0 when unseen.
item_user_feature = item_hist_test[["user_id","item_id"]].merge(user_feature,on=["user_id"])
test_candidate_df = pd.DataFrame({"item_id":test_candidates})
feature_item_dict = {}
for feature_id in range(1,7):
    feature_name = f"feature{feature_id}"
    count_col = f"feature{feature_id}_item"
    print(feature_name)
    item_fea_group = (
        item_user_feature.groupby([feature_name, "item_id"])["user_id"]
        .count()
        .rename(count_col)
    )
    feature_item_dict[feature_name] = {
        feat_v: test_candidate_df.merge(
            item_fea_group.loc[feat_v], on="item_id", how="left"
        ).fillna({count_col: 0})
        for feat_v in item_user_feature[feature_name].unique()
    }

# for feature_id in [1,3,5]:
#     feature_type = "feature_list"+str(feature_id)
#     # print(feature_name)
#     for feature_id_i in range(10):
#         feature_name = feature_type +"_"+ str(feature_id_i)
#         item_fea_group = item_user_feature.groupby([feature_name,"item_id"])["user_id"]\
#             .count().rename(f"{feature_name}_item")
#         feature_item_dict[feature_name] = dict()
#         for feat_v in (item_user_feature[feature_name].unique()):
#             feature_item_dict[feature_name][feat_v] = test_candidate_df.merge(item_fea_group\
#                 .loc[feat_v],on="item_id",how="left").fillna({f"{feature_name}_item":0})


feature1
feature2
feature3
feature4
feature5
feature6


In [56]:
# Check the item and user features are close to that of training data
# train_data[train_data.item_id=="I1025"]["item_count"].unique(),\
#     train_data[train_data.item_id=="I123826"].item_count.unique()
# item_fea_test[item_fea_test.item_id=="I1025"],item_fea_test[item_fea_test.item_id=="I123826"]
test_user_feature.loc[14895].user_count,test_user_feature.loc[134543].user_count
train_data[train_data.user_id==14895].user_count.unique(),\
    train_data[train_data.user_id==134543].user_count.unique()


In [296]:
feature_item_dict["feature1"][0].set_index("item_id").loc["I103751"]

feature1_item    10369.0
Name: I103751, dtype: float64

In [297]:
train_data.set_index(["feature1","item_id"]).loc[0,"I103751"].feature1_item.unique()

  train_data.set_index(["feature1","item_id"]).loc[0,"I103751"].feature1_item.unique()


array([ 9640.,  9784., 10368.], dtype=float16)

In [103]:
import pickle
pickle.dump(model,open("xgb_model/xgb_group16_sample10_21fea_all_data_top5w.pkl",'wb'))
# model = pickle.load(open("xgb_model/xgb_sample10_51fea_all_data_top5w.pkl",'rb'))
# model1 = pickle.load(open("xgb_sample8_50_feature_all_data.pkl",'rb'))
# model2 = 

In [99]:
time_all_data[11].shape,time_all_data[11].label.sum()

((3260224, 84), 296384)

In [None]:

import cupy as cp
# trn_preds.append(model.predict_proba(cp.array(X_trn[:seg].astype(np.float16)))[:,1])
# Score every candidate item for every test user in batches of `agg_size` users,
# keep the TOP_K best items per user and track hit statistics against the user's
# interactions, alongside a random baseline of the same size.
TOP_K = 72  # items recommended per user; also used for the random baseline and `recalled`
recalled = 0
oof = 0
hit = 0
gt = 0
rand_hit = 0
item_indexed = item.set_index("user_id")
user_with_item = set(item.user_id)
candidate_items = np.array(test_candidates)  # hoisted: was rebuilt for every user
n_candidates = len(candidate_items)
user_buffer = []
trn_buffer = []
agg_size = 100
ret = []
with tqdm(total = len(user_id_test)) as pbar:
    for ii in range(len(user_id_test)):
        user_t = user_id_test[ii]
        user_buffer.append(user_t)
        # test_df holds one row per candidate; broadcast this user's features onto it.
        test_df["user_id"] = user_t
        user_feature_t = test_user_feature.loc[user_t]
        for user_feature_col in user_feature_t.index:
            test_df[user_feature_col] = user_feature_t[user_feature_col]

        # Cross features: pre-computed per-candidate counts for the user's feature values.
        for jj in range(1,7):
            test_df[f"feature{jj}_item"] = feature_item_dict[f"feature{jj}"]\
                [user_feature_t[f"feature{jj}"]].values[:,1].astype(np.int32)

        # Column selection copies, so later in-place edits of test_df do not
        # affect buffered frames.
        trn_buffer.append(test_df[trn_cols])
        pbar.update(1)
        if pbar.n % agg_size == 0 or pbar.n==len(user_id_test):
            trn_agg = np.concatenate(trn_buffer)
            trn_buffer = []
            # preds_agg = model.predict_proba(cp.array(trn_agg))[:,1]
            preds_agg = model.predict_proba(trn_agg)[:,1]
            for kk in range(len(user_buffer)):
                preds = preds_agg[kk*n_candidates:(kk+1)*n_candidates]
                user_t = user_buffer[kk]
                topk_idx = np.argsort(preds)[-TOP_K:][::-1]
                item_pred = candidate_items[topk_idx]
                if user_t in user_with_item:
                    # Same number of random picks as model picks, so the baseline
                    # is comparable (previously 75 random vs 72 predicted).
                    rand_items = np.random.choice(candidate_items, TOP_K)
                    # List-based .loc always returns a Series; a scalar .loc returns
                    # a single string for users with one interaction, which made
                    # set()/len() operate on characters.
                    item_gt = item_indexed.loc[[user_t], "item_id"]
                    gt_set = set(item_gt)
                    recalled += TOP_K
                    hit += len(set(item_pred).intersection(gt_set))
                    rand_hit += len(set(rand_items).intersection(gt_set))
                    gt += len(item_gt)
                else: oof+=1
                ret.append([user_t,item_pred])
            # Refresh the progress bar once per batch; guard the ratio while gt is 0.
            pbar.set_postfix(recall=recalled,hit=hit,rand_hit=rand_hit,oof=oof,gt=gt,
                             r=hit/gt if gt else 0.0)
            user_buffer = []

In [45]:
import numpy as np
# One row per scored user: the user id and the first 72 recommended items as a list.
ret_user = [user_t for user_t, _ in ret]
ret_items = [list(items[:72]) for _, items in ret]
ret_df = pd.DataFrame({"user_id": ret_user, "recalled_item": ret_items})

In [46]:
# Write the recommendations without the row index.
ret_df.to_csv("result/xgb_group_oof_2.csv", index=False)