Skip to content

Commit

Permalink
Allow passing a vocab to Categorify (#935)
Browse files Browse the repository at this point in the history
 Allow passing vocabs to Categorify to fix make_feature_column_workflow
  • Loading branch information
marcromeyn committed Jul 20, 2021
1 parent 0c80385 commit df8b4db
Show file tree
Hide file tree
Showing 7 changed files with 176 additions and 121 deletions.
92 changes: 46 additions & 46 deletions .github/workflows/cpu-ci.yml
Original file line number Diff line number Diff line change
@@ -1,46 +1,46 @@
# GitHub Actions workflow that lints, builds and unit-tests the project on a CPU-only runner.
name: CPU CI

# Triggers: pushes to main and pull requests targeting main.
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
# Single-entry matrix: Python 3.8 on ubuntu-latest.
# NOTE(review): unquoted versions are YAML floats; quote entries such as "3.10" if any are added.
matrix:
python-version: [3.8]
os: [ubuntu-latest]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
# Installs protobuf-compiler from apt after refreshing the package index.
- name: Install Ubuntu packages
run: |
sudo apt-get update -y
sudo apt-get install -y protobuf-compiler
# Upgrades the packaging tools, then installs requirements.txt (plus pybind11) and requirements-dev.txt.
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
python -m pip install -r requirements.txt pybind11
python -m pip install -r requirements-dev.txt
# Lint steps: flake8, then black and isort in check-only mode.
- name: Lint with flake8
run: |
flake8 .
- name: Lint with black
run: |
black --check .
- name: Lint with isort
run: |
isort -c .
# Installs the package in development mode via setup.py.
- name: Build
run: |
python setup.py develop
# Runs only tests/unit/test_cpu_workflow.py under pytest.
- name: Run unittests
run: |
python -m pytest -svv tests/unit/test_cpu_workflow.py
# GitHub Actions workflow that lints, builds and unit-tests the project on a CPU-only runner.
name: CPU CI

# Triggers: pushes to main and pull requests targeting main.
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
# Single-entry matrix: Python 3.8 on ubuntu-latest.
# NOTE(review): unquoted versions are YAML floats; quote entries such as "3.10" if any are added.
matrix:
python-version: [3.8]
os: [ubuntu-latest]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
# Installs protobuf-compiler from apt after refreshing the package index.
- name: Install Ubuntu packages
run: |
sudo apt-get update -y
sudo apt-get install -y protobuf-compiler
# Upgrades the packaging tools, then installs requirements.txt (plus pybind11) and requirements-dev.txt.
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
python -m pip install -r requirements.txt pybind11
python -m pip install -r requirements-dev.txt
# Lint steps: flake8, then black and isort in check-only mode.
- name: Lint with flake8
run: |
flake8 .
- name: Lint with black
run: |
black --check .
- name: Lint with isort
run: |
isort -c .
# Installs the package in development mode via setup.py.
- name: Build
run: |
python setup.py develop
# Runs only tests/unit/test_cpu_workflow.py under pytest.
- name: Run unittests
run: |
python -m pytest -svv tests/unit/test_cpu_workflow.py
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,5 @@ ipython_config.py
.dmypy.json
dmypy.json

# PyCharm
.idea
21 changes: 21 additions & 0 deletions nvtabular/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ def _is_cpu_object(x):
return isinstance(x, (pd.DataFrame, pd.Series))


def is_series_or_dataframe_object(maybe_series_or_df):
    """Report whether the argument passes the series check or the dataframe check.

    The series check (``_is_series_object``) is evaluated first; the dataframe
    check (``_is_dataframe_object``) is only consulted when the series check
    gives a falsy result.
    """
    series_check = _is_series_object(maybe_series_or_df)
    if series_check:
        return series_check
    return _is_dataframe_object(maybe_series_or_df)


def _hex_to_int(s, dtype=None):
def _pd_convert_hex(x):
if pd.isnull(x):
Expand Down Expand Up @@ -320,11 +324,28 @@ def _make_df(_like_df=None, device=None):
return pd.DataFrame(_like_df)
elif isinstance(_like_df, (cudf.DataFrame, cudf.Series)):
return cudf.DataFrame(_like_df)
elif isinstance(_like_df, dict) and len(_like_df) > 0:
is_pandas = all(isinstance(v, pd.Series) for v in _like_df.values())

return pd.DataFrame(_like_df) if is_pandas else cudf.DataFrame(_like_df)
if device == "cpu":
return pd.DataFrame()
return cudf.DataFrame()


def _add_to_series(series, to_add, prepend=True):
    """Concatenate extra values onto a pandas or cudf series.

    Parameters
    ----------
    series : pandas.Series or cudf.Series
        Series to extend. Its type selects which library's ``Series``
        constructor is used to wrap ``to_add``.
    to_add
        Values handed to the matching ``Series`` constructor.
    prepend : bool, default True
        When True the new values are placed before ``series``; otherwise
        they are placed after it.

    Returns
    -------
    The result of ``_concat`` applied to the two series in the chosen order.

    Raises
    ------
    ValueError
        If ``series`` is neither a pandas nor a cudf series.
    """
    if isinstance(series, pd.Series):
        series_to_add = pd.Series(to_add)
    elif isinstance(series, cudf.Series):
        series_to_add = cudf.Series(to_add)
    else:
        raise ValueError(
            "Unrecognized series, please provide either a pandas or a cudf series"
        )

    # Order of the list decides whether the new values lead or trail.
    series_to_concat = [series_to_add, series] if prepend else [series, series_to_add]

    return _concat(series_to_concat)


def _detect_format(data):
"""Utility to detect the format of `data`"""
from nvtabular import Dataset
Expand Down
23 changes: 2 additions & 21 deletions nvtabular/framework_utils/tensorflow/feature_column_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,9 @@
# limitations under the License.
#

import os
import warnings

import cudf
import pandas as pd
import tensorflow as tf
from tensorflow.python.feature_column import feature_column_v2 as fc

Expand Down Expand Up @@ -227,7 +226,7 @@ def _get_parents(column):
features += features_replaced_buckets

if len(categorifies) > 0:
features += categorifies.keys() >> Categorify()
features += categorifies.keys() >> Categorify(vocabs=pd.DataFrame(categorifies))

if len(hashes) > 0:
features += hashes.keys() >> HashBucket(hashes)
Expand Down Expand Up @@ -282,22 +281,4 @@ def _get_parents(column):

workflow = nvt.Workflow(features)

# create stats for Categorify op if we need it
if len(categorifies) > 0:
if category_dir is None:
category_dir = "/tmp/categories" # nosec
if not os.path.exists(category_dir):
os.makedirs(category_dir)

stats = {"categories": {}}
for feature_name, categories in categorifies.items():
categories.insert(0, None)
df = cudf.DataFrame({feature_name: categories})

save_path = os.path.join(category_dir, f"unique.{feature_name}.parquet")
df.to_parquet(save_path)
stats["categories"][feature_name] = save_path

workflow.stats = stats

return workflow, numeric_columns + new_feature_columns
Loading

0 comments on commit df8b4db

Please sign in to comment.