In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# categorify

In [2]:
import nvtabular as nvt
import glob
import pandas as pd
import numpy as np
import cudf
import cupy
import gc

In [3]:
# Collect every preprocessed training parquet part, in a stable (sorted) order.
train_lst = glob.glob('/raid/recsys/train_proc3/*parquet')
train_lst.sort()
len(train_lst)

232

In [4]:
# Wrap the list of training parquet parts in an NVTabular Dataset; this is the
# input that the workflow is fitted on further down.
train_dataset = nvt.Dataset(train_lst)
# Validation dataset is currently disabled; kept for reference (note it would use part_size="128MB").
# valid_dataset = nvt.Dataset(sorted(glob.glob('/raid/recsys_pre_TE_w_tok/valid_norm_20parts/*parquet')), part_size="128MB")

In [47]:
# Frequency threshold per categorical column: 0 for the three low-cardinality
# columns, 50 for the two user-id columns. The dict is the single source of
# truth; the column list is derived from it (same order as before).
freq_thresholds = {
    'media': 0,
    'tweet_type': 0,
    'language': 0,
    'a_user_id': 50,
    'b_user_id': 50,
}
CATEGORICAL_COLUMNS = list(freq_thresholds)

categorify_op = nvt.ops.Categorify(freq_threshold=freq_thresholds)
cat_features = CATEGORICAL_COLUMNS >> categorify_op
# Continuous features and labels are not part of this workflow (disabled, kept for reference):
# cont_features = nvt.ColumnGroup(np.setdiff1d(valid.columns, CATEGORICAL_COLUMNS+label_names).tolist())
# labels = nvt.ColumnGroup(label_names)
workflow = nvt.Workflow(cat_features)
workflow

<nvtabular.workflow.Workflow at 0x7f703e222610>

In [48]:
%%time
# Fit the workflow on the full training dataset (recorded run over 232 parts: ~1.5 min wall time).
workflow.fit(train_dataset) 

CPU times: user 49.5 s, sys: 25.9 s, total: 1min 15s
Wall time: 1min 32s


In [49]:
# Show the per-column sizes reported for the fitted workflow. The first value of
# each tuple matches the row count of the saved unique.<column>.parquet file;
# the second is presumably the suggested embedding dimension (TODO confirm).
nvt.ops.get_embedding_sizes(workflow) # emb size changed after fitting

# Results log from earlier runs, grouped by how many parquet parts were fitted.
# "thr" is the freq_threshold used for a_user_id / b_user_id.
# Entries ending in "..." were recorded for the user-id columns only.

# first 20 parts
# thr=5, {'a_user_id': (1336817, 512),'b_user_id': (1846029, 512), 'language': (67, 17), 'media': (14, 16), 'tweet_type': (4, 16)}
# thr=6, {'a_user_id': (1071277, 512),'b_user_id': (1462504, 512), ...
# thr=7, {'a_user_id': (888552, 512), 'b_user_id': (1200582, 512), ...
# thr=8, {'a_user_id': (755702, 512), 'b_user_id': (1011381, 512), ...
# thr=9, {'a_user_id': (656096, 512), 'b_user_id': (868034, 512), ...

# 164 parts
# thr=20, {'a_user_id': (3194801, 512), 'b_user_id': (4538284, 512), 'language': (67, 17), 'media': (15, 16), 'tweet_type': (4, 16)}
# thr=30, {'a_user_id': (2179810, 512), 'b_user_id': (3056202, 512), 'language': (67, 17), 'media': (15, 16), 'tweet_type': (4, 16)}
# thr=50, {'a_user_id': (1299012, 512), 'b_user_id': (1791273, 512), 'language': (67, 17), 'media': (15, 16), 'tweet_type': (4, 16)}

# 232 parts (all parts; thr=50 is the configuration fitted above)
# thr=100, {'a_user_id': (873946, 512), 'b_user_id': (1182355, 512), 'language': (67, 17), 'media': (15, 16), 'tweet_type': (4, 16)}
# thr=50,  {'a_user_id': (1796156, 512),'b_user_id': (2505030, 512), 'language': (67, 17), 'media': (15, 16), 'tweet_type': (4, 16)}

{'a_user_id': (1796156, 512),
 'b_user_id': (2505030, 512),
 'language': (67, 17),
 'media': (15, 16),
 'tweet_type': (4, 16)}

In [50]:
# Persist the fitted workflow; the listing in the next cell shows it writes one
# unique.<column>.parquet file per categorical column under categories/.
workflow.save('/raid/recsys_pre_TE_w_tok/workflow_232parts_thr50')

## manually combine a_user_id and b_user_id

In [51]:
# List the category files written by the saved workflow (oldest first, human-readable sizes).
!ls -lrth /raid/recsys_pre_TE_w_tok/workflow_232parts_thr50/categories/

total 33M
-rw-rw-r-- 1 bo bo 415 May 27 11:36 unique.tweet_type.parquet
-rw-rw-r-- 1 bo bo 443 May 27 11:36 unique.media.parquet
-rw-rw-r-- 1 bo bo 670 May 27 11:36 unique.language.parquet
-rw-rw-r-- 1 bo bo 14M May 27 11:36 unique.a_user_id.parquet
-rw-rw-r-- 1 bo bo 20M May 27 11:36 unique.b_user_id.parquet


In [52]:
# Load the two user-id vocabularies written by the saved thr=50 workflow:
# dfa holds the a_user_id categories, dfb the b_user_id categories.
categories_dir = '/raid/recsys_pre_TE_w_tok/workflow_232parts_thr50/categories'
dfa = pd.read_parquet(f'{categories_dir}/unique.a_user_id.parquet')
dfb = pd.read_parquet(f'{categories_dir}/unique.b_user_id.parquet')
(dfa.shape, dfb.shape)

((1796156, 1), (2505030, 1))

In [60]:
# Count the user ids that appear in BOTH the a_user_id and b_user_id vocabularies.
shared_user_ids = np.intersect1d(dfa['a_user_id'].values, dfb['b_user_id'].values)
shared_user_ids.shape

(529770,)

In [63]:
# Count the distinct user ids across the two vocabularies combined.
all_user_ids = np.union1d(dfa['a_user_id'].values, dfb['b_user_id'].values)
all_user_ids.shape

(3771416,)

In [70]:
# Total number of entries when a_user_id and b_user_id keep separate vocabularies.
# Computed from the loaded frames instead of hand-copied row counts, so the
# figure stays correct if the workflow is refitted with another threshold.
# Compare with the union size above to see how many entries a shared vocabulary saves.
len(dfa) + len(dfb)

4301186

In [None]:
# Scratch note: 4558283 is the joint a_user_id_b_user_id vocabulary size recorded
# for thr=50 on 232 parts (see the results log in the joint-categorify section).
# NOTE(review): it is larger than the union above, presumably because the joint
# encoding applies the frequency threshold to counts pooled over both columns — confirm.
4558283

# categorify: combine a_user and b_user

In [3]:
import nvtabular as nvt
import glob
import pandas as pd
import numpy as np
import cudf
import cupy
import gc

In [4]:
# Collect every preprocessed training parquet part, in a stable (sorted) order.
train_lst = glob.glob('/raid/recsys/train_proc3/*parquet')
train_lst.sort()
len(train_lst)

232

In [5]:
train_dataset = nvt.Dataset(train_lst)

# Encode both user-id columns with encode_type='joint' and freq_threshold=10;
# the saved workflow then holds one combined unique.a_user_id_b_user_id.parquet
# file instead of one file per column.
joint_user_categorify = nvt.ops.Categorify(freq_threshold=10, encode_type='joint')
cat_features1 = [['a_user_id', 'b_user_id']] >> joint_user_categorify

# The remaining categorical columns are each encoded on their own with default settings.
plain_categorify = nvt.ops.Categorify()
cat_features2 = [['media'], ['tweet_type'], ['language']] >> plain_categorify

workflow = nvt.Workflow(cat_features1 + cat_features2)
workflow.fit(train_dataset)

# NOTE: in the recorded output the joint-encoded columns are reported as (0, 16).
# Read the real joint vocabulary size from the row count of the saved
# unique.a_user_id_b_user_id.parquet file instead.
nvt.ops.get_embedding_sizes(workflow)  # emb size changed after fitting

{'language': (67, 17),
 'media': (15, 16),
 'tweet_type': (4, 16),
 'a_user_id': (0, 16),
 'b_user_id': (0, 16)}

In [6]:
# Persist the fitted joint-encoding workflow (freq_threshold=10, all rows).
workflow.save('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr10')

In [7]:
# List the category files of the saved joint workflow; both user-id columns
# share the single unique.a_user_id_b_user_id.parquet file.
!ls -lrth /raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr10/categories

total 118M
-rw-rw-r-- 1 bo bo  415 Jun  7 16:56 unique.tweet_type.parquet
-rw-rw-r-- 1 bo bo  443 Jun  7 16:56 unique.media.parquet
-rw-rw-r-- 1 bo bo  670 Jun  7 16:56 unique.language.parquet
-rw-rw-r-- 1 bo bo 118M Jun  7 16:56 unique.a_user_id_b_user_id.parquet


In [12]:
# Joint user-id vocabulary size for freq_threshold=25 (per the directory name).
# NOTE(review): this workflow is not produced by the cells shown here — presumably
# saved by an earlier run of the fit cell with a different threshold; confirm.
pd.read_parquet('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr25/categories/unique.a_user_id_b_user_id.parquet').shape

(8244536, 1)

In [8]:
# Joint user-id vocabulary size for freq_threshold=10 (the workflow saved above).
pd.read_parquet('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr10/categories/unique.a_user_id_b_user_id.parquet').shape

(15453524, 1)

In [10]:
# NOTE: for the joint-encoded workflow the recorded output reports (0, 16) for
# a_user_id / b_user_id; the joint sizes in the log below are row counts of the
# saved unique.a_user_id_b_user_id.parquet files.
nvt.ops.get_embedding_sizes(workflow) # emb size changed after fitting

# Results log: separate vs. joint user-id vocabularies.

# 232 parts
# thr=100, {'a_user_id': (873946, 512), 'b_user_id': (1182355, 512), 'language': (67, 17), 'media': (15, 16), 'tweet_type': (4, 16)}
# thr=50,  {'a_user_id': (1796156, 512),'b_user_id': (2505030, 512), 'language': (67, 17), 'media': (15, 16), 'tweet_type': (4, 16)}
# thr=100, joint {'a_user_id_b_user_id': 2278664}
# thr=50,  joint {'a_user_id_b_user_id': 4558283}
# thr=25,  joint {'a_user_id_b_user_id': 8244536}

# first 2 parts 
# thr=0, combo    {'a_user_id_b_user_id': (6139816, 512), 'language': (67, 17), 'media': (14, 16), 'tweet_type': (4, 16)}
# thr=0, separate {'a_user_id': (2612617, 512), 'b_user_id': (3769546, 512), 'language': (67, 17), 'media': (14, 16), 'tweet_type': (4, 16)}

{'language': (67, 17),
 'media': (15, 16),
 'tweet_type': (4, 16),
 'a_user_id': (0, 16),
 'b_user_id': (0, 16)}

# categorify: combine a_user and b_user (positive rows only)

In [1]:
import nvtabular as nvt
import glob
import pandas as pd
import numpy as np
import cudf
import cupy
import gc

In [2]:
# Collect the parquet parts of the positive-rows-only training set, in sorted order.
train_lst = glob.glob('/raid/recsys/train_proc3_pos/*parquet')
train_lst.sort()
len(train_lst)

232

In [11]:
train_dataset = nvt.Dataset(train_lst)

# Encode both user-id columns with encode_type='joint' and freq_threshold=3;
# the saved workflow then holds one combined unique.a_user_id_b_user_id.parquet
# file instead of one file per column.
joint_user_categorify = nvt.ops.Categorify(freq_threshold=3, encode_type='joint')
cat_features1 = [['a_user_id', 'b_user_id']] >> joint_user_categorify

# The remaining categorical columns are each encoded on their own with default settings.
plain_categorify = nvt.ops.Categorify()
cat_features2 = [['media'], ['tweet_type'], ['language']] >> plain_categorify

workflow = nvt.Workflow(cat_features1 + cat_features2)
workflow.fit(train_dataset)

# NOTE: in the recorded output the joint-encoded columns are reported as (0, 16).
# Read the real joint vocabulary size from the row count of the saved
# unique.a_user_id_b_user_id.parquet file instead.
nvt.ops.get_embedding_sizes(workflow)  # emb size changed after fitting

{'language': (67, 17),
 'media': (14, 16),
 'tweet_type': (4, 16),
 'a_user_id': (0, 16),
 'b_user_id': (0, 16)}

In [12]:
# Persist the fitted joint-encoding workflow (freq_threshold=3, positive rows only).
workflow.save('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr3_pos')

In [13]:
# List the category files of the saved positive-rows joint workflow.
!ls -lrth /raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr3_pos/categories

total 151M
-rw-rw-r-- 1 bo bo  415 Jun 10 10:48 unique.tweet_type.parquet
-rw-rw-r-- 1 bo bo  439 Jun 10 10:48 unique.media.parquet
-rw-rw-r-- 1 bo bo  670 Jun 10 10:48 unique.language.parquet
-rw-rw-r-- 1 bo bo 151M Jun 10 10:48 unique.a_user_id_b_user_id.parquet


In [12]:
# Reference point: joint user-id vocabulary size, all rows, freq_threshold=25 (per the directory name).
pd.read_parquet('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr25/categories/unique.a_user_id_b_user_id.parquet').shape

(8244536, 1)

In [8]:
# Reference point: joint user-id vocabulary size, all rows, freq_threshold=10 (per the directory name).
pd.read_parquet('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr10/categories/unique.a_user_id_b_user_id.parquet').shape

(15453524, 1)

In [6]:
# Joint user-id vocabulary size, positive rows only, freq_threshold=10 (per the directory name).
# NOTE(review): presumably saved by an earlier run of the fit cell above with a different threshold — confirm.
pd.read_parquet('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr10_pos/categories/unique.a_user_id_b_user_id.parquet').shape

(8753003, 1)

In [6]:
# Joint user-id vocabulary size, positive rows only, freq_threshold=5 (per the directory name).
pd.read_parquet('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr5_pos/categories/unique.a_user_id_b_user_id.parquet').shape

(14194639, 1)

In [10]:
# Joint user-id vocabulary size, positive rows only, freq_threshold=4 (per the directory name).
pd.read_parquet('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr4_pos/categories/unique.a_user_id_b_user_id.parquet').shape

(16397664, 1)

In [14]:
# Joint user-id vocabulary size, positive rows only, freq_threshold=3 (the workflow saved above).
pd.read_parquet('/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr3_pos/categories/unique.a_user_id_b_user_id.parquet').shape

(19688213, 1)

In [16]:
# Scale a baseline of 4.1 by the ratio of the thr=3 positive-rows joint vocabulary
# (19688213 rows) to the thr=10 all-rows joint vocabulary (15453524 rows).
# NOTE(review): the meaning and units of the 4.1 baseline are not recorded in this
# notebook — presumably a size/memory figure measured for the thr=10 table; confirm.
19688213/15453524 * 4.1

5.223512339321439