In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Create Target Encoding (TE) Map Parquet

In [1]:
import pandas as pd
import numpy as np
import cudf, cupy
import gc, os
import dask, dask_cudf                    
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, wait
from dask.delayed import delayed
# Record the cuDF version in the notebook output for reproducibility.
print('cudf version',cudf.__version__)

cudf version 0.19.1


In [2]:
# Column(s) used as the group-by key for the target encoding.
features = ['tw_word0']
# Target columns; one TE column is produced per target.
targets = ['reply','retweet','retweet_comment','like']

# SMOOTH and FILTER must match the values used when the models were trained.
# VER names the output folder ('./te<VER>').
VER = 146
# Smoothing weight given to the global target mean in the TE formula.
SMOOTH = 20
# Only keys occurring more than FILTER times are kept (0 disables the row pre-filter).
FILTER = 1
BYTES = 1 # output precision: 1 -> int8, 2 -> int16, 4 -> float32 (no quantization)

In [3]:
# Create the versioned output folder. Only the "already exists" case is
# tolerated; any other failure (permissions, missing parent directory, ...)
# is allowed to propagate instead of being misreported as an existing folder.
try:
    os.mkdir('./te%i'%VER)
except FileExistsError:
    print('Note: ver %i folder already exists'%VER)

Note: ver 98 folder already exists


# Read Parquets

In [4]:
# Start a local Dask CUDA cluster restricted to GPUs 0-3.
# NOTE(review): local_directory and path are absolute, machine-specific
# locations; adjust them before running elsewhere.
cluster = LocalCUDACluster( 
    local_directory='/home/cdeotte/RecSys/Work/May-2021/may-20-21-RECSYS/',
    CUDA_VISIBLE_DEVICES = '0,1,2,3',
    #device_memory_limit = 0.90,
)
client = Client(cluster)
path = '/raid/RecSys/recsys2021/parquet7'
# Read only the key and target columns from every parquet file under `path`.
df = dask_cudf.read_parquet(f'{path}/*.parquet', columns=features+targets)
# NOTE(review): df is not persisted before this call, so wait() presumably
# has nothing to block on and the frame stays lazy - confirm the intent.
_ = wait(df)
# Preview the first rows.
df.head()

Unnamed: 0,tw_word0,reply,retweet,retweet_comment,like
0,53050233,0,0,0,0
1,1694712663,0,0,0,0
2,990650098,0,0,0,1
3,-108567334,0,0,0,0
4,-108567334,0,1,0,1


In [5]:
# df.shape is lazy for a Dask frame: its first entry must be computed
# explicitly to obtain the total number of rows.
a = df.shape
n_rows = a[0].compute()
print( n_rows )

663946617


# Compute Target Means

In [6]:
#M = {}
#for ti in targets:
#    print(ti,'...')
#    M[ti] = df[ti].compute().mean()

In [7]:
# Hard-coded mean of each target, used as the smoothing prior (M[target])
# in the TE formula. Presumably the result of the commented-out computation
# in the previous cell - re-derive these values if the input data changes.
M = {'reply': 0.027490660442660256,
     'retweet': 0.08691603891401407,
     'retweet_comment': 0.00680711051804335,
     'like': 0.395523962734492}

In [None]:
# MY TRAIN VALIDATION SUBSET
#M = {'reply': 0.02604706011607502,
#     'retweet': 0.09169025434904013,
#     'retweet_comment': 0.0069172984326772895,
#     'like': 0.4068115207982652}

In [8]:
# Display the prior means that will be used for smoothing.
M

{'reply': 0.027490660442660256,
 'retweet': 0.08691603891401407,
 'retweet_comment': 0.00680711051804335,
 'like': 0.395523962734492}

# Remove Rows Not Needed

In [9]:
def _keep_frequent(frame, counts, key):
    """Attach the per-key 'count' from `counts` and keep rows with count > FILTER.

    The helper 'count' column is removed again before the frame is returned.
    """
    merged = frame.merge(counts, on=key, how='left')
    merged = merged.loc[merged['count'] > FILTER]
    del merged['count']
    return merged


if FILTER > 0:
    # Pass 1: drop rows whose individual feature value is too rare.
    for COL in features:
        dt = df[[COL, targets[0]]].groupby(COL).agg({targets[0]: ['count']})
        dt = dt.reset_index()
        dt.columns = [COL, 'count']
        df = _keep_frequent(df, dt, COL)
        gc.collect()

    # Pass 2 (several features only): the int32-cast sum of the feature
    # columns serves as a combined key for a second frequency filter.
    if len(features) > 1:
        df['tmp'] = 0
        for COL in features:
            df['tmp'] = df['tmp'] + df[COL].astype('int64')
        df['tmp'] = df['tmp'].astype('int32')
        dt = df['tmp'].value_counts().reset_index()
        dt.columns = ['tmp', 'count']
        df = _keep_frequent(df, dt, 'tmp')
        del df['tmp']

    del dt
    gc.collect()

# Compute TE

In [10]:
# COMPUTE TE
# Per-key sum of every target, plus the group size taken from the first
# target. The spec and the flattened column names are derived from `targets`
# so they cannot drift out of sync with the configuration cell.
agg_spec = {target: ['sum'] for target in targets}
agg_spec[targets[0]] = ['sum', 'count']
dt = df[features+targets].groupby(features).agg(agg_spec).reset_index()
dt.columns = (
    features
    + [targets[0]+'_sum', 'count']
    + [target+'_sum' for target in targets[1:]]
)

# REMOVE ROWS
# Keep only keys that occur more than FILTER times.
dt = dt.loc[(dt['count']>FILTER)]

# SMOOTH TE
# TE = (sum + prior_mean*SMOOTH) / (count + SMOOTH), stored as float32.
# `cols` collects the names of the generated TE columns.
cols = []
for target in targets:
    fname = 'TE_'+'_'.join(features)+'_'+target
    cols.append(fname)
    print(fname,'...')
    dt[fname] = ((dt[target+'_sum']) + (M[target]*SMOOTH)) / (dt['count']+SMOOTH)
    dt[fname] = dt[fname].astype('float32')
    # The raw sum is no longer needed once the encoding exists.
    del dt[target+'_sum']
del dt['count']
gc.collect()

TE_tw_word0_reply ...
TE_tw_word0_retweet ...
TE_tw_word0_retweet_comment ...
TE_tw_word0_like ...


0

In [11]:
# WRITE TO PARQUET
# Optional quantization of the TE columns before writing:
#   BYTES == 1 -> value*255   - 128   stored as int8
#   BYTES == 2 -> value*65535 - 32768 stored as int16
#   BYTES == 4 -> columns are written unchanged
# (the integer ranges assume TE values lie in [0, 1] - confirm).
# NOTE: the columns of `dt` are overwritten in place, so running this cell a
# second time without recomputing `dt` would quantize already-quantized data.
quantization = {1: (255, 128, 'int8'), 2: (65535, 32768, 'int16'), 4: None}
if BYTES not in quantization:
    # Previously an unsupported value silently wrote unquantized columns.
    raise ValueError('BYTES must be 1 (int8), 2 (int16) or 4 (float32), got %r' % (BYTES,))

spec = quantization[BYTES]
if spec is not None:
    scale, offset, int_dtype = spec
    for c in cols:
        dt[c] = (dt[c]*scale)-offset
        dt[c] = dt[c].astype(int_dtype)

# Output file name encodes the feature list, SMOOTH and FILTER.
fname = './te%i/'%VER + '_'.join(features)  + '_s%i_f%i.parquet'%(SMOOTH,FILTER)
# Materialize the lazy frame and write it as a single parquet file.
dt.compute().to_parquet(fname)

In [12]:
# Confirm the columns that were written: the key column(s) plus one TE column per target.
dt.columns

Index(['tw_word0', 'TE_tw_word0_reply', 'TE_tw_word0_retweet',
       'TE_tw_word0_retweet_comment', 'TE_tw_word0_like'],
      dtype='object')