In [1]:
import nvtabular as nvt
import pandas as pd

### (1) Example for frequency hashing:

In [2]:
# Toy interaction log: one row per (author, productID) pair plus a binary label.
toy_interactions = {
    "author": ["User_A", "User_B", "User_C", "User_C", "User_A", "User_B", "User_A"],
    "productID": [100, 101, 102, 101, 102, 103, 103],
    "label": [0, 0, 1, 1, 1, 0, 0],
}
df = pd.DataFrame(toy_interactions)

# Render the raw frame so it can be compared with the encoded output further down.
display(df)

Unnamed: 0,author,productID,label
0,User_A,100,0
1,User_B,101,0
2,User_C,102,1
3,User_C,101,1
4,User_A,102,1
5,User_B,103,0
6,User_A,103,0


In [3]:
"""convert to nvt dataset"""
# Wrap the pandas DataFrame in an NVTabular Dataset, the object that is passed
# to Workflow.fit / Workflow.transform in the cells below.
# NOTE(review): the string literal above is a no-op expression standing in for
# a comment; a `#` comment would be the idiomatic form.
dataset = nvt.Dataset(df)



### (2) Categorify

In [4]:
# Columns of the toy frame that are fed to the Categorify op.
CATEGORICAL_COLUMNS = [
    "author",
    "productID",
]

In [5]:
# Configure Categorify with per-column settings (keys are column names).
#   freq_threshold: categories whose count is below this value are mapped to
#                   the "null" category.
#                   NOTE(review): every category in the toy data occurs at
#                   least once, so a threshold of 1 filters nothing here.
#   num_buckets:    number of hash buckets to use for each column.
categorify = nvt.ops.Categorify(
    freq_threshold=dict(author=1, productID=1),
    num_buckets=dict(author=100, productID=20),
)



In [6]:
# Point the Categorify op at a local working directory.
# NOTE(review): presumably this is where the op persists its category
# mappings during fit — confirm against the NVTabular docs.
# NOTE(review): "ntv" in the path looks like a typo for "nvt"; left unchanged
# because renaming the directory would change where files are written.
categorify.set_storage_path("./ntv-proc-categorify/")

In [7]:
# Pipe the categorical column names into the configured Categorify op with
# `>>`; the result is what the Workflow is built from below.
cat_features = CATEGORICAL_COLUMNS >> categorify

### (3) Parameters

        freq_threshold (int or dictionary:{column: freq_limit_value}, default 0) – Categories with a count/frequency below this threshold will be omitted from the encoding and corresponding data will be mapped to the “null” category. Can be represented as either an integer or a dictionary with column names as keys and frequency limit as value. If a dictionary is used, all columns targeted must be included in the dictionary.

        num_buckets (int, or dictionary:{column: num_hash_buckets}) – Column-wise modulo to apply after hash function. Note that this means that the corresponding value will be the categorical cardinality of the transformed categorical feature. If given as an int, that value will be used as the number of “hash buckets” for every feature. If a dictionary is passed, it will be used to specify explicit mappings from a column name to a number of buckets. In this case, only the columns specified in the keys of num_buckets will be transformed.

In [9]:
# Build the workflow from the Categorify features. Nothing is executed yet:
# fitting and transforming happen in the following cells.
proc = nvt.Workflow(cat_features)

In [10]:
# Fit the workflow on the toy dataset. The call returns the Workflow object,
# which is why its repr appears as this cell's output.
proc.fit(dataset)



<nvtabular.workflow.workflow.Workflow at 0x168d1b6a0>

In [11]:
# Apply the fitted workflow to the dataset and write the result out as parquet.
# NOTE(review): "./test" is presumably treated as an output directory relative
# to the notebook's working directory — confirm.
proc.transform(dataset).to_parquet("./test")

In [12]:
# Transform again, this time keeping the result in memory for inspection.
# NOTE(review): to_ddf() presumably returns a lazy Dask DataFrame, given the
# .compute() call in the next cell — confirm.
ddf = proc.transform(dataset).to_ddf()

In [13]:
# Compute and print the transformed frame: 'author' and 'productID' now hold
# integer ids instead of the original values.
print(ddf.compute())

   author  productID
0       1          4
1       2          1
2       3          2
3       3          1
4       1          2
5       2          3
6       1          3



### (4) Example with multi-hot:

In [14]:
# Toy dataset with a multi-hot column: each row's 'categories' entry is a
# list of category labels of varying length.
multi_hot_rows = {
    "userID": [10001, 10002, 10003],
    "productID": [30003, 30005, 40005],
    "categories": [["Cat A", "Cat B"], ["Cat C"], ["Cat A", "Cat C", "Cat D"]],
    "label": [0, 0, 1],
}
df = pd.DataFrame(multi_hot_rows)

# Wrap the frame in an NVTabular Dataset for the workflow cells that follow.
dataset = nvt.Dataset(df)



In [15]:
# Preview the dataset; the toy data has only three rows, so all are shown.
dataset.head()

Unnamed: 0,userID,productID,categories,label
0,10001,30003,"[Cat A, Cat B]",0
1,10002,30005,[Cat C],0
2,10003,40005,"[Cat A, Cat C, Cat D]",1


In [16]:
# Encode all three columns, including the list-valued 'categories' column.
CATEGORICAL_COLUMNS = [
    "userID",
    "productID",
    "categories",
]

# Categorify with default settings: no freq_threshold or num_buckets is passed.
cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify()

In [17]:
# Build a new workflow for the multi-hot example; this rebinds `proc`, replacing
# the workflow from the first example.
proc = nvt.Workflow(cat_features)

In [18]:
# Fit the workflow on the multi-hot dataset. The call returns the Workflow
# object, which is why its repr appears as this cell's output.
proc.fit(dataset)



<nvtabular.workflow.workflow.Workflow at 0x168afbe50>

In [19]:
# Run the fitted workflow over the multi-hot dataset and keep the result as `ddf`.
transformed = proc.transform(dataset)
ddf = transformed.to_ddf()

# Show the encoded frame: every entry of each 'categories' list is replaced by
# an integer id, and list lengths are preserved.
print(ddf.compute())

   userID  productID categories
0       1          1     [1, 3]
1       2          2        [2]
2       3          3  [1, 2, 4]
