Allow passing a vocab to Categorify (#935)

Allow passing vocabs to Categorify to fix make_feature_column_workflow
NVIDIA-Merlin · Jul 20, 2021 · df8b4db · df8b4db
1 parent 0c80385
commit df8b4db
Show file tree

Hide file tree

Showing 7 changed files with 176 additions and 121 deletions.
diff --git a/.github/workflows/cpu-ci.yml b/.github/workflows/cpu-ci.yml
@@ -1,46 +1,46 @@
-name: CPU CI
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        python-version: [3.8]
-        os: [ubuntu-latest]
-
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install Ubuntu packages
-      run: |
-        sudo apt-get update -y
-        sudo apt-get install -y protobuf-compiler
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip setuptools wheel 
-        python -m pip install -r requirements.txt pybind11
-        python -m pip install -r requirements-dev.txt
-    - name: Lint with flake8
-      run: |
-        flake8 .
-    - name: Lint with black
-      run: |
-        black --check .
-    - name: Lint with isort
-      run: |
-        isort -c .
-    - name: Build
-      run: |
-        python setup.py develop
-    - name: Run unittests
-      run: |
-        python -m pytest -svv tests/unit/test_cpu_workflow.py
+# CI workflow: installs dependencies, lints, builds the package and runs the CPU workflow unit tests.
+name: CPU CI
+
+# Runs on pushes to main and on pull requests targeting main.
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      # Single-entry matrix: Python 3.8 on ubuntu-latest.
+      matrix:
+        python-version: [3.8]
+        os: [ubuntu-latest]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    # System package installed before the Python dependencies: the protobuf compiler.
+    - name: Install Ubuntu packages
+      run: |
+        sudo apt-get update -y
+        sudo apt-get install -y protobuf-compiler
+    # Runtime requirements (plus pybind11) first, then the development requirements.
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip setuptools wheel 
+        python -m pip install -r requirements.txt pybind11
+        python -m pip install -r requirements-dev.txt
+    # Three separate lint steps: flake8, black (check only) and isort (check only).
+    - name: Lint with flake8
+      run: |
+        flake8 .
+    - name: Lint with black
+      run: |
+        black --check .
+    - name: Lint with isort
+      run: |
+        isort -c .
+    - name: Build
+      run: |
+        python setup.py develop
+    # Only tests/unit/test_cpu_workflow.py is executed by this job.
+    - name: Run unittests
+      run: |
+        python -m pytest -svv tests/unit/test_cpu_workflow.py
diff --git a/.gitignore b/.gitignore
@@ -71,3 +71,5 @@ ipython_config.py
 .dmypy.json
 dmypy.json
 
+# PyCharm
+.idea
diff --git a/nvtabular/dispatch.py b/nvtabular/dispatch.py
@@ -105,6 +105,10 @@ def _is_cpu_object(x):
     return isinstance(x, (pd.DataFrame, pd.Series))
 
 
+def is_series_or_dataframe_object(maybe_series_or_df):
+    """Report whether the argument is a series or a dataframe.
+
+    The decision is delegated to this module's ``_is_series_object`` and
+    ``_is_dataframe_object`` helpers; the dataframe check is only consulted
+    when the series check gives a falsy answer.
+    """
+    verdict = _is_series_object(maybe_series_or_df)
+    if not verdict:
+        # Fall back to the dataframe check, mirroring short-circuit ``or``.
+        verdict = _is_dataframe_object(maybe_series_or_df)
+    return verdict
+
+
 def _hex_to_int(s, dtype=None):
     def _pd_convert_hex(x):
         if pd.isnull(x):
@@ -320,11 +324,28 @@ def _make_df(_like_df=None, device=None):
         return pd.DataFrame(_like_df)
     elif isinstance(_like_df, (cudf.DataFrame, cudf.Series)):
         return cudf.DataFrame(_like_df)
+    elif isinstance(_like_df, dict) and len(_like_df) > 0:
+        is_pandas = all(isinstance(v, pd.Series) for v in _like_df.values())
+
+        return pd.DataFrame(_like_df) if is_pandas else cudf.DataFrame(_like_df)
     if device == "cpu":
         return pd.DataFrame()
     return cudf.DataFrame()
 
 
+def _add_to_series(series, to_add, prepend=True):
+    """Concatenate extra values onto a pandas or cudf series.
+
+    Parameters
+    ----------
+    series : pandas.Series or cudf.Series
+        Existing series; its type decides which library wraps ``to_add``.
+    to_add : object
+        Values handed to the matching ``Series`` constructor.
+    prepend : bool, default True
+        When True the new values are placed before ``series`` in the list
+        given to ``_concat``; otherwise they are placed after it.
+
+    Returns
+    -------
+    The result of ``_concat`` applied to the two series in the chosen order.
+
+    Raises
+    ------
+    ValueError
+        If ``series`` is neither a pandas nor a cudf series.
+    """
+    if isinstance(series, pd.Series):
+        series_to_add = pd.Series(to_add)
+    elif isinstance(series, cudf.Series):
+        series_to_add = cudf.Series(to_add)
+    else:
+        raise ValueError(
+            "Unrecognized series, please provide either a pandas or a cudf series"
+        )
+
+    # Order of the list determines whether the new values lead or trail.
+    series_to_concat = [series_to_add, series] if prepend else [series, series_to_add]
+
+    return _concat(series_to_concat)
+
+
 def _detect_format(data):
     """Utility to detect the format of `data`"""
     from nvtabular import Dataset

diff --git a/nvtabular/framework_utils/tensorflow/feature_column_utils.py b/nvtabular/framework_utils/tensorflow/feature_column_utils.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 #
 
-import os
 import warnings
 
-import cudf
+import pandas as pd
 import tensorflow as tf
 from tensorflow.python.feature_column import feature_column_v2 as fc
 
@@ -227,7 +226,7 @@ def _get_parents(column):
         features += features_replaced_buckets
 
     if len(categorifies) > 0:
-        features += categorifies.keys() >> Categorify()
+        features += categorifies.keys() >> Categorify(vocabs=pd.DataFrame(categorifies))
 
     if len(hashes) > 0:
         features += hashes.keys() >> HashBucket(hashes)
@@ -282,22 +281,4 @@ def _get_parents(column):
 
     workflow = nvt.Workflow(features)
 
-    # create stats for Categorify op if we need it
-    if len(categorifies) > 0:
-        if category_dir is None:
-            category_dir = "/tmp/categories"  # nosec
-        if not os.path.exists(category_dir):
-            os.makedirs(category_dir)
-
-        stats = {"categories": {}}
-        for feature_name, categories in categorifies.items():
-            categories.insert(0, None)
-            df = cudf.DataFrame({feature_name: categories})
-
-            save_path = os.path.join(category_dir, f"unique.{feature_name}.parquet")
-            df.to_parquet(save_path)
-            stats["categories"][feature_name] = save_path
-
-        workflow.stats = stats
-
     return workflow, numeric_columns + new_feature_columns