In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
# Number of training parquet files to load (used below to slice the sorted training file list).
n_files = 40

In [2]:
import glob
import pandas as pd
import numpy as np
import cudf
import cupy
import gc

from datetime import datetime

In [3]:
# Input shards in lexicographic path order: the first `n_files` training files
# and the first 10 validation files.
train_files = sorted(glob.glob('/raid/recsys2021_pre_TE_2/*'))[:n_files]
valid_files = sorted(glob.glob('/raid/recsys2021_pre_TE_3/*'))[:10]

In [4]:
import os, time
# Restrict CUDA to devices 0-7; this is set before the LocalCUDACluster is created further down.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5,6,7"

In [5]:
import dask as dask, dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
import subprocess

In [6]:
# Start a local Dask-CUDA cluster and attach a distributed client to it.
# NOTE(review): local_directory is presumably the workers' scratch/spill location, and
# device_memory_limit=0.4 presumably a fraction of each GPU's memory after which data is
# spilled to host — confirm both against the installed dask_cuda version.
cluster = LocalCUDACluster(local_directory='/raid/dask9450/', device_memory_limit=0.4)
client = Client(cluster)
# Last expression of the cell: renders the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:35307  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 429.50 GB


In [7]:
# Open the training and validation shards as dask_cudf DataFrames
# (each path list is sorted again before being handed to the reader).
train, valid = (
    dask_cudf.read_parquet(sorted(paths)) for paths in (train_files, valid_files)
)

In [8]:
# Persist both collections in one call; later cells operate on the persisted versions.
train, valid = dask.persist(train, valid)

In [9]:
# Display every column name present in the training frame.
list(train.columns)

['TE_a_user_id_like',
 'TE_a_user_id_reply',
 'TE_a_user_id_retweet',
 'TE_a_user_id_retweet_comment',
 'TE_b_is_verified_tweet_type_like',
 'TE_b_is_verified_tweet_type_reply',
 'TE_b_is_verified_tweet_type_retweet',
 'TE_b_is_verified_tweet_type_retweet_comment',
 'TE_b_user_id_a_user_id_like',
 'TE_b_user_id_a_user_id_reply',
 'TE_b_user_id_a_user_id_retweet',
 'TE_b_user_id_a_user_id_retweet_comment',
 'TE_b_user_id_like',
 'TE_b_user_id_reply',
 'TE_b_user_id_retweet',
 'TE_b_user_id_retweet_comment',
 'TE_b_user_id_tweet_type_language_like',
 'TE_b_user_id_tweet_type_language_reply',
 'TE_b_user_id_tweet_type_language_retweet',
 'TE_b_user_id_tweet_type_language_retweet_comment',
 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_like',
 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_reply',
 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_retweet',
 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_retweet_comment',
 '

In [10]:
# Target columns; their position in this list is the model index used later on.
label_names = ['reply', 'retweet', 'retweet_comment', 'like']

# Columns that are never used as model features.
DONT_USE = [
    'timestamp', 'a_account_creation', 'b_account_creation', 'engage_time',
    'fold', 'b_user_id', 'a_user_id', 'dt_dow', 'a_account_creation',
    'b_account_creation', 'elapsed_time', 'links', 'domains', 'hashtags', 'id',
    'date', 'is_train',
    'tw_original_http0', 'tw_original_user0', 'tw_original_user1', 'tw_original_user2',
    'tw_rt_count_char', 'tw_rt_count_words', 'tw_rt_user0', 'tw_tweet', 'tw_word0',
    'tw_word1', 'tw_word2', 'tw_word3', 'tw_word4', 'tw_count_hash',
    'dt_minute', 'dt_second',
]
# The labels themselves must not leak into the feature list either.
DONT_USE = DONT_USE + label_names
features = [col for col in train.columns if col not in DONT_USE]

# Columns to physically drop: excluded columns that exist in the data, labels kept
# (they are split off into Y_train / Y_valid later). The set removes duplicates.
RMV = list({col for col in DONT_USE if col in train.columns and col not in label_names})

In [11]:
%%time

# Remove the unused columns collected in RMV from the training frame.
train = train.drop(RMV,axis=1)
# NOTE(review): the futures shown in this cell's output are the 'read-parquet' partitions,
# so this wait appears to cover the already-persisted input rather than the drop itself — confirm.
wait(train)

CPU times: user 942 ms, sys: 367 ms, total: 1.31 s
Wall time: 10.9 s


DoneAndNotDoneFutures(done={<Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 4)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 10)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 39)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 16)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 2)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 13)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 5)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 19)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-953e62dce43ee905f304c940b793eb51', 8)>, <Future: finished, type: cudf.DataFrame, key: ('read-parqu

In [12]:
%%time

# Remove the unused columns collected in RMV from the validation frame.
valid = valid.drop(RMV,axis=1)
# NOTE(review): the futures shown in this cell's output are the 'read-parquet' partitions,
# so this wait appears to cover the already-persisted input rather than the drop itself — confirm.
wait(valid)

CPU times: user 17.2 ms, sys: 4.95 ms, total: 22.2 ms
Wall time: 20.3 ms


DoneAndNotDoneFutures(done={<Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 6)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 8)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 5)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 3)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 7)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 2)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 9)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 1)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2aa09ce7662b59df1695a6157797b8d1', 0)>, <Future: finished, type: cudf.DataFrame, key: ('read-parquet-2a

In [13]:
# Split the four label columns off into their own persisted frame.
Y_train = train[label_names].persist()
Y_train.head()

# What remains after removing the tweet id and the labels is the model input.
train = train.drop(['tweet_id'] + label_names, axis=1).persist()
train.head()

features = [col for col in train.columns if col not in DONT_USE]
print(f'Using {len(features)} features:', train.shape[1])
np.asarray(features)

Using 84 features: 84


array(['TE_a_user_id_like', 'TE_a_user_id_reply', 'TE_a_user_id_retweet',
       'TE_a_user_id_retweet_comment', 'TE_b_is_verified_tweet_type_like',
       'TE_b_is_verified_tweet_type_reply',
       'TE_b_is_verified_tweet_type_retweet',
       'TE_b_is_verified_tweet_type_retweet_comment',
       'TE_b_user_id_a_user_id_like', 'TE_b_user_id_a_user_id_reply',
       'TE_b_user_id_a_user_id_retweet',
       'TE_b_user_id_a_user_id_retweet_comment', 'TE_b_user_id_like',
       'TE_b_user_id_reply', 'TE_b_user_id_retweet',
       'TE_b_user_id_retweet_comment',
       'TE_b_user_id_tweet_type_language_like',
       'TE_b_user_id_tweet_type_language_reply',
       'TE_b_user_id_tweet_type_language_retweet',
       'TE_b_user_id_tweet_type_language_retweet_comment',
       'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_like',
       'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_reply',
       'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified

In [14]:
# Split the four label columns off into their own persisted frame.
Y_valid = valid[label_names].persist()
Y_valid.head()

# What remains after removing the tweet id and the labels is the model input.
valid = valid.drop(['tweet_id'] + label_names, axis=1).persist()
valid.head()

features = [col for col in valid.columns if col not in DONT_USE]
print(f'Using {len(features)} features:', valid.shape[1])
np.asarray(features)

Using 84 features: 84


array(['TE_a_user_id_like', 'TE_a_user_id_reply', 'TE_a_user_id_retweet',
       'TE_a_user_id_retweet_comment', 'TE_b_is_verified_tweet_type_like',
       'TE_b_is_verified_tweet_type_reply',
       'TE_b_is_verified_tweet_type_retweet',
       'TE_b_is_verified_tweet_type_retweet_comment',
       'TE_b_user_id_a_user_id_like', 'TE_b_user_id_a_user_id_reply',
       'TE_b_user_id_a_user_id_retweet',
       'TE_b_user_id_a_user_id_retweet_comment', 'TE_b_user_id_like',
       'TE_b_user_id_reply', 'TE_b_user_id_retweet',
       'TE_b_user_id_retweet_comment',
       'TE_b_user_id_tweet_type_language_like',
       'TE_b_user_id_tweet_type_language_reply',
       'TE_b_user_id_tweet_type_language_retweet',
       'TE_b_user_id_tweet_type_language_retweet_comment',
       'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_like',
       'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_reply',
       'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified

In [15]:
import xgboost as xgb
print(f'XGB Version {xgb.__version__}')

# Booster parameters shared by every per-label model: binary logistic objective,
# log-loss as the evaluation metric, GPU histogram tree builder and GPU predictor.
xgb_parms = dict(
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.3,
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)

XGB Version 1.1.0


In [16]:
# Recast every boolean column as int8 in both frames. Column names are taken from
# `train`; `valid` is indexed with the same names.
for col in train.columns:
    for frame in (train, valid):
        if frame[col].dtype == 'bool':
            frame[col] = frame[col].astype('int8')
train, valid = dask.persist(train, valid)
train.head()

Unnamed: 0,TE_a_user_id_like,TE_a_user_id_reply,TE_a_user_id_retweet,TE_a_user_id_retweet_comment,TE_b_is_verified_tweet_type_like,TE_b_is_verified_tweet_type_reply,TE_b_is_verified_tweet_type_retweet,TE_b_is_verified_tweet_type_retweet_comment,TE_b_user_id_a_user_id_like,TE_b_user_id_a_user_id_reply,...,tw_last_quest,tw_len_gif,tw_len_media,tw_len_photo,tw_len_quest,tw_len_retweet,tw_len_rt,tw_len_token,tw_len_video,tweet_type
3,0.397,0.025,0.188,0.015,0.247,0.006,0.113,0.006,0.456,0.024,...,0,0,0,0,0,0,0,15,0,1
8,0.397,0.01,0.045,0.0,0.247,0.006,0.113,0.006,0.397,0.028,...,0,0,1,0,0,0,0,41,1,1
11,0.852,0.014,0.116,0.008,0.485,0.04,0.078,0.008,0.397,0.028,...,0,0,0,0,0,0,0,75,0,0
12,0.397,0.019,0.013,0.001,0.485,0.04,0.078,0.008,0.397,0.028,...,0,0,0,0,0,0,0,17,0,0
16,0.253,0.046,0.103,0.046,0.485,0.04,0.078,0.008,0.397,0.028,...,0,0,1,0,0,0,0,22,1,0


In [17]:
import pickle

In [18]:
# Positions into label_names of the targets to train a model for (here all four labels).
models_index = [0,1,2,3]

In [19]:
def get_colnames(train, name):
    """Select the feature columns used by the model for one engagement label.

    Args:
        train: Object exposing a ``columns`` iterable of column-name strings
            (a DataFrame in this notebook).
        name: Label name, e.g. ``'reply'``. Matching is by substring, so
            ``'retweet'`` also matches ``'..._retweet_comment'`` columns.

    Returns:
        list[str]: Selected column names in their original order, followed by any
        ``'TE_switch_'`` columns that the filters below would otherwise have removed.
    """
    # Drop 'NN_' columns and the bookkeeping columns 'group' / 'quantile'.
    # (The original compared the whole list `cols` to 'quantile', which is always
    # unequal, so the 'quantile' column was never filtered out.)
    cols = [
        col for col in train.columns
        if 'NN_' not in col and col != 'group' and col != 'quantile'
    ]
    # Keep a column unless it is a target encoding ('TE_') keyed on 'a_user_id'
    # that belongs to a different label than `name`.
    cols = [x for x in cols if 'TE_' not in x or 'a_user_id' not in x or name in x]
    # 'TE_switch_' columns are always kept, whatever the filters above decided.
    selected = set(cols)
    cols = cols + [x for x in train.columns if x not in selected and 'TE_switch_' in x]
    return cols

In [20]:
%%time
# TRAIN AND VALIDATE
# For every label selected in `models_index`: build the Dask DMatrices, train with
# early stopping on the validation set, echo the validation log-loss history, and
# predict separately on each validation 'group' (0..4).
# Results: preds_out[k]      -> list of 5 per-group predictions of the k-th trained model
#          best_trees_out[k] -> single-element list with that model's best_ntree_limit

NROUND = 1001       # upper bound on boosting rounds
VERBOSE_EVAL = 50   # report the validation metric every 50 rounds
ESR = 50            # early-stopping patience in rounds
preds_out = []
best_trees_out = []

for i in models_index:
    name = label_names[i]
    print('#'*25);print('###',name);print('#'*25)
    cols = get_colnames(train, name)

    start = time.time(); print('Creating DMatrix...')
    preds = []
    best_trees = []
    dtrain = xgb.dask.DaskDMatrix(client,data=train[cols].values,label=Y_train.iloc[:, i])
    dvalid = xgb.dask.DaskDMatrix(client,data=valid[cols].values,label=Y_valid.iloc[:, i])
    print('Took %.1f seconds'%(time.time()-start))
    print(len(cols))
    print(cols)

    start = time.time(); print('Training...')
    model = xgb.dask.train(client, xgb_parms,
                           dtrain=dtrain,
                           evals=[(dvalid,'valid')],
                           num_boost_round=NROUND,
                           early_stopping_rounds=ESR,
                           verbose_eval=VERBOSE_EVAL
                          )
    print('Took %.1f seconds'%(time.time()-start))

    # Re-print the validation log-loss at every VERBOSE_EVAL-th round from the history.
    logloss_valid = model["history"]['valid']['logloss'][::VERBOSE_EVAL]
    for ik in range(len(logloss_valid)):
        print(f"{str(VERBOSE_EVAL*ik).zfill(4)} valid-logloss:{logloss_valid[ik]:.5f}")

    # Start the timer BEFORE predicting. It used to be started after the loop, so the
    # reported prediction time was always 0.0 seconds.
    # NOTE(review): xgb.dask.predict may return a lazy collection, in which case this
    # measures graph construction rather than the prediction work itself — confirm.
    start = time.time(); print('Predicting...')
    for group in [0,1,2,3,4]:
        preds.append(xgb.dask.predict(client,model,valid[valid['group']==group][cols].values))
    print('Took %.1f seconds'%(time.time()-start))

    best_trees.append(model['booster'].best_ntree_limit)
    preds_out.append(preds)
    best_trees_out.append(best_trees)

#########################
### reply
#########################
Creating DMatrix...
Took 11.5 seconds
77
['TE_a_user_id_reply', 'TE_b_is_verified_tweet_type_like', 'TE_b_is_verified_tweet_type_reply', 'TE_b_is_verified_tweet_type_retweet', 'TE_b_is_verified_tweet_type_retweet_comment', 'TE_b_user_id_a_user_id_reply', 'TE_b_user_id_like', 'TE_b_user_id_reply', 'TE_b_user_id_retweet', 'TE_b_user_id_retweet_comment', 'TE_b_user_id_tweet_type_language_like', 'TE_b_user_id_tweet_type_language_reply', 'TE_b_user_id_tweet_type_language_retweet', 'TE_b_user_id_tweet_type_language_retweet_comment', 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_like', 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_reply', 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_retweet', 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_retweet_comment', 'TE_media_tweet_type_language_a_is_verified_b_is_verified_b_follows_a_like', 'TE_media_tweet_type_language

  [<function predict.<locals>.mapped_predict at 0x7f ... titions>, True]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


Predicting...
Took 0.0 seconds
#########################
### retweet
#########################
Creating DMatrix...
Took 12.3 seconds
79
['TE_a_user_id_retweet', 'TE_a_user_id_retweet_comment', 'TE_b_is_verified_tweet_type_like', 'TE_b_is_verified_tweet_type_reply', 'TE_b_is_verified_tweet_type_retweet', 'TE_b_is_verified_tweet_type_retweet_comment', 'TE_b_user_id_a_user_id_retweet', 'TE_b_user_id_a_user_id_retweet_comment', 'TE_b_user_id_like', 'TE_b_user_id_reply', 'TE_b_user_id_retweet', 'TE_b_user_id_retweet_comment', 'TE_b_user_id_tweet_type_language_like', 'TE_b_user_id_tweet_type_language_reply', 'TE_b_user_id_tweet_type_language_retweet', 'TE_b_user_id_tweet_type_language_retweet_comment', 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_like', 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_reply', 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_retweet', 'TE_domains_language_b_follows_a_tweet_type_media_a_is_verified_retweet_comme

Took 121.7 seconds
0000 valid-logloss:0.67825
0050 valid-logloss:0.52205
0100 valid-logloss:0.51367
0150 valid-logloss:0.51133
0200 valid-logloss:0.51005
0250 valid-logloss:0.50917
0300 valid-logloss:0.50896
0350 valid-logloss:0.50875
0400 valid-logloss:0.50850
0450 valid-logloss:0.50842
0500 valid-logloss:0.50831
Predicting...
Took 0.0 seconds
CPU times: user 38.1 s, sys: 16.2 s, total: 54.3 s
Wall time: 10min 31s


In [21]:
# One single-element list per trained model, holding the booster's best_ntree_limit.
print(best_trees_out)

[[642], [998], [479], [483]]


In [22]:
%%time

import cupy

# Group id of every validation row, gathered group by group (0..4) so the row order
# matches the per-group order in which the predictions were produced.
yquantile = cupy.concatenate(
    [valid[valid['group'] == g]['group'].values.compute() for g in range(5)]
)

# Validation predictions: one row per trained model, each the concatenation of its
# five per-group prediction arrays; transposed so that models become columns.
oof = cupy.asarray(
    [
        cupy.concatenate([cupy.array(part.values.compute()) for part in model_preds])
        for model_preds in preds_out
    ]
).T

# Ground-truth labels (columns in label_names order), in the same group-by-group row order.
yvalid = cupy.concatenate(
    [Y_valid[valid['group'] == g][label_names].values.compute() for g in range(5)]
)

CPU times: user 9.36 s, sys: 9.29 s, total: 18.7 s
Wall time: 38.6 s


# Compute Validation Metrics

In [23]:
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score

def display_score(rce,ap):
    """Print a fixed-width table of AP and RCE per quantile group, plus column averages.

    Args:
        rce: Mapping from engagement name ('retweet', 'reply', 'like',
            'retweet_comment') to a sequence of RCE values indexed by group 0..4.
        ap: Mapping with the same keys holding the AP values per group.

    Returns:
        None. The table is written to stdout.
    """
    # Column order of the printed table (matches the header line below).
    engagements = ['retweet','reply','like','retweet_comment']

    print('Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment')
    for group in range(5):
        cells = [f"{ap[kind][group]:10.4f}  {rce[kind][group]:10.4f}" for kind in engagements]
        print(f'{group:9}      ' + ' '.join(cells))

    # Final row: mean of each column over all values supplied for that engagement.
    averages = [
        f"{np.mean(list(ap[kind])):10.4f}  {np.mean(list(rce[kind])):10.4f}"
        for kind in engagements
    ]
    print('     Average   ' + ' '.join(averages))

def precision_recall_curve(y_true,y_pred):
    """Compute precision/recall pairs over all score cut-offs using cupy.

    NOTE(review): the layout of the outputs (trailing precision 1 / recall 0 point,
    ascending thresholds) looks modelled on scikit-learn's precision_recall_curve — confirm.

    Args:
        y_true: Array of labels; cast to float32 here. Presumably binary 0/1 — confirm.
        y_pred: Array of predicted scores, same length as ``y_true``.

    Returns:
        Tuple ``(precision, recall, thresholds)``: precision and recall slices starting
        at index ``n-1`` and the ascending scores starting at index ``n``, where ``n``
        is the number of curve points whose recall equals 1.
    """
    y_true = y_true.astype('float32')
    # Order the samples by descending predicted score.
    ids = cupy.argsort(-y_pred) 
    y_true = y_true[ids]
    y_pred = y_pred[ids]
    # Reverse the sorted scores so they are in ascending order.
    y_pred = cupy.flip(y_pred,axis=0)

    # acc_one[k] = sum of labels among the k+1 highest-scored samples.
    acc_one = cupy.cumsum(y_true)
    sum_one = cupy.sum(y_true)
    
    # Precision at each cut-off = labels accumulated so far / samples so far, then reversed
    # to line up with the ascending scores.
    precision = cupy.flip(acc_one/cupy.cumsum(cupy.ones(len(y_true))),axis=0)
    # Shift every entry one position to the left and end the curve with precision 1.
    precision[:-1] = precision[1:]
    precision[-1] = 1.

    # Recall at each cut-off = labels accumulated so far / total, reversed the same way.
    recall = cupy.flip(acc_one/sum_one,axis=0)
    # Same left shift; the curve ends with recall 0.
    recall[:-1] = recall[1:]
    recall[-1] = 0
    # Number of (shifted) points with recall exactly 1; used to trim the leading points.
    n = (recall==1).sum()
    
    return precision[n-1:],recall[n-1:],y_pred[n:]

def compute_prauc(pred, gt):
    """Return the area under the precision-recall curve of `pred` against labels `gt`."""
    # The third value returned by the curve helper (the thresholds) is not needed here.
    precision_gpu, recall_gpu, _ = precision_recall_curve(gt, pred)
    # `auc` is given host arrays, so both curves are copied off the GPU first.
    recall_host = cupy.asnumpy(recall_gpu)
    precision_host = cupy.asnumpy(precision_gpu)
    return auc(recall_host, precision_host)

def log_loss(y_true,y_pred,eps=1e-15, normalize=True, sample_weight=None):
    """Cross-entropy of predicted probabilities against integer labels, computed with cupy.

    Args:
        y_true: Label array; cast to int32 and used as a column index into the
            per-class probabilities.
        y_pred: Predicted probabilities. A 1-D array or a single column is treated
            as P(class 1) and expanded to the two columns [1 - p, p].
        eps: Probabilities are clipped to [eps, 1 - eps] before taking the log.
        normalize: If true, return the (weighted) mean loss, otherwise the (weighted) sum.
        sample_weight: Optional per-sample weights.

    Returns:
        The loss as a Python scalar.
    """
    labels = y_true.astype('int32')
    probs = cupy.clip(y_pred, eps, 1 - eps)
    if probs.ndim == 1:
        probs = cupy.expand_dims(probs, axis=1)
    if probs.shape[1] == 1:
        probs = cupy.hstack([1 - probs, probs])

    # Renormalise each row so the class probabilities sum to one after clipping.
    probs = probs / cupy.sum(probs, axis=1, keepdims=True)
    # Pick, for every sample, the log-probability assigned to its true class.
    row_ids = cupy.arange(probs.shape[0])
    per_sample = -cupy.log(probs)[row_ids, labels]
    return _weighted_sum(per_sample, sample_weight, normalize).item()

def _weighted_sum(sample_score, sample_weight, normalize):
    """Reduce per-sample scores to a single value.

    Returns the weighted average when `normalize` is true; otherwise the dot product
    with `sample_weight` when weights are given, or the plain sum when they are not.
    """
    if normalize:
        return cupy.average(sample_score, weights=sample_weight)
    if sample_weight is None:
        return sample_score.sum()
    return cupy.dot(sample_score, sample_weight)

# FAST METRIC FROM GIBA
def compute_rce_fast(pred, gt):
    """Relative cross entropy, in percent, of `pred` against the labels `gt`.

    The model's log-loss is compared with the entropy of a baseline that always
    predicts the mean of `gt`; the result is (1 - model / baseline) * 100.
    """
    # Baseline ("strawman"): constant prediction equal to the mean label.
    base_rate = np.mean(gt)
    baseline_ce = -(base_rate*np.log(base_rate) + (1 - base_rate)*np.log(1 - base_rate))
    model_ce = log_loss(gt, pred)
    return (1.0 - model_ce/baseline_ce)*100.0

In [24]:
%%time

# Compute RCE per trained model and per validation group (0..4).
# rce_output / ap_output map a label name to its list of five per-group values.
txt = ''
rce_output = {}
ap_output = {}
for i, ind in enumerate(models_index):
    prauc_out = []
    rce_out = []
    ap_out = []
    for j in range(5):
        in_group = yquantile==j
        # yvalid columns follow label_names, so they are selected with the label
        # index `ind`; oof columns follow the order in which models were trained,
        # so they are selected with the position `i`. (Using `i` for both is only
        # correct while models_index == [0,1,2,3].)
        yvalid_tmp = yvalid[in_group][:, ind]
        oof_tmp = oof[in_group][:, i]
        # PR-AUC and AP are not computed here; zeros are stored as placeholders,
        # which is why the AP columns of the printed table show 0.0000.
        prauc = 0
        rce   = compute_rce_fast(oof_tmp, yvalid_tmp).item()
        ap    = 0
        prauc_out.append(prauc)
        rce_out.append(rce)
        ap_out.append(ap)
    rce_output[label_names[ind]] = rce_out
    ap_output[label_names[ind]] = ap_out

CPU times: user 8.37 s, sys: 77 ms, total: 8.45 s
Wall time: 8.58 s


In [25]:
%%time
# Print the per-group table; the AP entries are the zero placeholders stored in ap_output.
display_score(rce_output, ap_output)

Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment
        0          0.0000     28.2828     0.0000     24.0233     0.0000     20.6895     0.0000     15.2361
        1          0.0000     29.2267     0.0000     25.6013     0.0000     20.9900     0.0000     16.1554
        2          0.0000     29.1276     0.0000     27.3326     0.0000     21.5337     0.0000     16.3607
        3          0.0000     30.2888     0.0000     29.1862     0.0000     22.5099     0.0000     18.0549
        4          0.0000     30.3547     0.0000     27.9048     0.0000     26.3409     0.0000     19.6805
     Average       0.0000     29.4561     0.0000     26.8097     0.0000     22.4128     0.0000     17.0975
CPU times: user 1.01 ms, sys: 688 µs, total: 1.7 ms
Wall time: 1.24 ms


```
Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment
        0          0.0000     28.3237     0.0000     23.9158     0.0000     21.1112     0.0000     14.8323
        1          0.0000     29.0424     0.0000     25.3712     0.0000     21.4308     0.0000     15.7522
        2          0.0000     28.7857     0.0000     27.0025     0.0000     21.5827     0.0000     15.8705
        3          0.0000     30.0527     0.0000     28.9209     0.0000     22.9882     0.0000     17.6046
        4          0.0000     30.4045     0.0000     27.8814     0.0000     26.3568     0.0000     19.6959
     Average       0.0000     29.3218     0.0000     26.6184     0.0000     22.6939     0.0000     16.7511
CPU times: user 2.25 ms, sys: 0 ns, total: 2.25 ms
Wall time: 1.5 ms
```

In [27]:
# Disconnect the client, then shut down the LocalCUDACluster it was attached to.
# The cluster was created explicitly and passed to Client, so closing the client
# alone would leave the cluster's worker processes running.
client.close()
cluster.close()