In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import glob
import pandas as pd
import numpy as np
import cudf
import cupy
import gc

from datetime import datetime

In [2]:
# Collect every entry found in the three preprocessed-data directories,
# keeping the directories in the order pre_1, pre_2, pre_3.
files = [
    path
    for pattern in (
        '/raid/recsys2021_pre_1/*',
        '/raid/recsys2021_pre_2/*',
        '/raid/recsys2021_pre_3/*',
    )
    for path in glob.glob(pattern)
]

In [3]:
# Display the collected input paths.
files

['/raid/recsys2021_pre_1/part-00118.parquet',
 '/raid/recsys2021_pre_1/part-00101.parquet',
 '/raid/recsys2021_pre_1/part-00249.parquet',
 '/raid/recsys2021_pre_1/part-00158.parquet',
 '/raid/recsys2021_pre_1/part-00037.parquet',
 '/raid/recsys2021_pre_1/part-00239.parquet',
 '/raid/recsys2021_pre_1/part-00115.parquet',
 '/raid/recsys2021_pre_1/part-00100.parquet',
 '/raid/recsys2021_pre_1/part-00014.parquet',
 '/raid/recsys2021_pre_1/part-00144.parquet',
 '/raid/recsys2021_pre_1/part-00206.parquet',
 '/raid/recsys2021_pre_1/part-00201.parquet',
 '/raid/recsys2021_pre_1/part-00041.parquet',
 '/raid/recsys2021_pre_1/part-00055.parquet',
 '/raid/recsys2021_pre_1/part-00017.parquet',
 '/raid/recsys2021_pre_1/part-00221.parquet',
 '/raid/recsys2021_pre_1/part-00236.parquet',
 '/raid/recsys2021_pre_1/part-00085.parquet',
 '/raid/recsys2021_pre_1/part-00046.parquet',
 '/raid/recsys2021_pre_1/part-00204.parquet',
 '/raid/recsys2021_pre_1/part-00065.parquet',
 '/raid/recsys2021_pre_1/part-0009

In [4]:
import os, time
# Set CUDA_VISIBLE_DEVICES to device indices 0 through 7.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"})

In [5]:
import dask as dask, dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
import subprocess

In [6]:
# Start a local Dask CUDA cluster that uses /raid/dask5/ as its local
# directory, then attach a client to it.
# NOTE(review): device_memory_limit=0.5 is presumably a fraction of each
# GPU's memory -- confirm against the installed dask-cuda version.
cluster = LocalCUDACluster(
    local_directory='/raid/dask5/',
    device_memory_limit=0.5,
)
client = Client(cluster)

# Last expression of the cell: renders the client summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:45641  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 429.50 GB


In [7]:
# Read only the columns used by the feature engineering and statistics
# cells that follow.
ddf = dask_cudf.read_parquet(
    files,
    columns=[
        'a_follower_count',
        'a_following_count',
        'b_follower_count',
        'b_following_count',
        'tw_count_char',
        'tw_count_words',
        'tw_len_token',
        'a_account_creation',
        'b_account_creation',
    ],
)

In [8]:
# Ratio features of the form numerator / (1 + denominator), stored as float32.
# NOTE(review): a_ff_rate is following/follower while b_ff_rate is
# follower/following -- confirm the asymmetry is intentional.
for _feature, _numerator, _denominator in [
    ('a_ff_rate', 'a_following_count', 'a_follower_count'),
    ('b_ff_rate', 'b_follower_count', 'b_following_count'),
    ('ab_fing_rate', 'a_following_count', 'b_following_count'),
    ('ab_fer_rate', 'a_follower_count', 'b_follower_count'),
]:
    _ratio = ddf[_numerator] / (1 + ddf[_denominator])
    ddf[_feature] = _ratio.astype('float32')

# Account-creation features: difference and ratio between the a and b accounts.
# NOTE(review): unlike the ratios above, these two are not cast to float32.
ddf['ab_age_dff'] = ddf['a_account_creation'] - ddf['b_account_creation']
ddf['ab_age_rate'] = ddf['a_account_creation'] / (1 + ddf['b_account_creation'])

In [9]:
# !rm -r /raid/NN_encodings

# !mkdir /raid/NN_encodings

In [10]:
# col = 'user_id'
# cdf = dask_cudf.concat([ddf['a_user_id'], ddf['b_user_id']]).reset_index(drop=True)
# cdf = cdf.value_counts().reset_index()
# cdf.columns = [col, 'count']
# cdf = cdf.reset_index()
# cdf.columns = [col + '_', col, 'count']
# cdf[col + '_'] = cdf[col + '_'] + 1
# cdf.to_parquet('/raid/NN_encodings/' + col + '.parquet')

In [11]:
# col = 'muser_id'
# cdf = dask_cudf.concat([ddf['tw_original_user0'], ddf['tw_original_user1'], ddf['tw_original_user2']]).reset_index(drop=True)
# cdf = cdf.value_counts().reset_index()
# cdf.columns = [col, 'count']
# cdf = cdf.reset_index()
# cdf.columns = [col + '_', col, 'count']
# cdf[col + '_'] = cdf[col + '_'] + 1
# cdf.to_parquet('/raid/NN_encodings/' + col + '.parquet')
# client.close()

In [12]:
# col = 'word'
# cdf = dask_cudf.concat([ddf['tw_word0'], ddf['tw_word1'], ddf['tw_word2'], ddf['tw_word3'], ddf['tw_word4']]).reset_index(drop=True)
# cdf = cdf.value_counts().reset_index()
# cdf.columns = [col, 'count']
# cdf = cdf.reset_index()
# cdf.columns = [col + '_', col, 'count']
# cdf[col + '_'] = cdf[col + '_'] + 1
# cdf.to_parquet('/raid/NN_encodings/' + col + '.parquet')
# client.close()

In [13]:
# for col in [
#     'hashtags',  
#     'links', 
#     'domains', 
#     'tw_rt_user0',
#     'tw_original_http0',
#     'dt_day', 
#     'dt_dow',
#     'tw_len_media', 
#     'tw_len_photo',
#     'tw_len_video',
#     'tw_len_gif', 
#     'tw_len_quest',
#     'tw_count_special1', 
#     'tw_last_quest', 
#     'tw_len_retweet',
#     'tw_len_rt', 
#     'tw_count_at',  
#     'len_hashtags', 
#     'len_links', 
#     'len_domains'
# ]:
#     print(col)
#     cdf = ddf[col].reset_index(drop=True)
#     cdf = cdf.value_counts().reset_index()
#     cdf.columns = [col, 'count']
#     cdf = cdf.reset_index()
#     cdf.columns = [col + '_', col, 'count']
#     cdf[col + '_'] = cdf[col + '_'] + 1
#     cdf.to_parquet('/raid/NN_encodings/' + col + '.parquet')

In [14]:
# Maps a column name to its [mean, std] pair; filled in by the loops below.
num_stats = dict()

In [15]:
# `meta` hint handed to map_partitions alongside `log`.
# NOTE(review): dask presumably reads a 2-tuple meta as (name, dtype), so the
# resulting series would be named 'float32' -- confirm this is intended.
meta = ('float32', 'float32')


def log(x):
    """Return ``log(x + 1)`` evaluated in float32.

    Adds one to ``x``, casts the result to float32, and applies ``cudf.log``.
    """
    shifted = x + 1
    return cudf.log(shifted.astype('float32'))

# Columns whose statistics are computed after the log(x + 1) transform.
NUM_LOG_COLS = [
    'a_follower_count', 'a_following_count',
    'b_follower_count', 'b_following_count',
    'tw_count_char', 'tw_count_words', 'tw_len_token',
]

In [16]:
# Mean and standard deviation of log(x + 1) for every column in NUM_LOG_COLS.
for col in NUM_LOG_COLS:
    print(col)
    # Build the transformed series once and evaluate both reductions in a
    # single dask.compute call so they share one pass over the input instead
    # of triggering two separate computations per column.
    logged = ddf[col].map_partitions(log, meta=meta)
    xmean, xstd = dask.compute(logged.mean(), logged.std())
    num_stats[col] = [xmean, xstd]

a_follower_count
a_following_count
b_follower_count
b_following_count
tw_count_char
tw_count_words
tw_len_token


In [17]:
# Engineered columns whose statistics are computed on the raw values
# (no log transform).
NUM_COLS = [
    'a_ff_rate', 'b_ff_rate',
    'ab_fing_rate', 'ab_fer_rate',
    'ab_age_dff', 'ab_age_rate',
]

In [18]:
# Mean and standard deviation of the untransformed values for every column
# in NUM_COLS.
# NOTE(review): the recorded output shows nan / inf for 'ab_age_dff' and
# 'ab_age_rate' -- check those inputs before using the stats to normalise.
for col in NUM_COLS:
    print(col)
    # Evaluate both reductions in one dask.compute call so they share a
    # single pass over the data instead of two separate computations.
    series = ddf[col]
    xmean, xstd = dask.compute(series.mean(), series.std())
    num_stats[col] = [xmean, xstd]

a_ff_rate
b_ff_rate
ab_fing_rate
ab_fer_rate
ab_age_dff
ab_age_rate


In [19]:
# Display the collected {column: [mean, std]} statistics.
num_stats

{'a_follower_count': [9.235696768523635, 3.111560771160793],
 'a_following_count': [6.2578431551824485, 1.8617217780232085],
 'b_follower_count': [5.266129734214283, 1.6605541058218978],
 'b_following_count': [5.834243150040552, 1.1939579003693612],
 'tw_count_char': [4.322766427792716, 0.8206800778231492],
 'tw_count_words': [2.535984523111658, 0.9972112373754571],
 'tw_len_token': [3.6257291983777318, 0.6890229560424759],
 'a_ff_rate': [0.6012308582841046, 1.3345731364618305],
 'b_ff_rate': [1.748865708799117, 142.32999985661687],
 'ab_fing_rate': [17.177006023261985, 388.5535341232384],
 'ab_fer_rate': [43108.85218092591, 788990.7412391531],
 'ab_age_dff': [-4.1780779072087965e-06, nan],
 'ab_age_rate': [0.0, inf]}