In [9]:
# # Initialize feast feature store
!feast init feast_example
import os

os.chdir("feast_example/feature_repo")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implement

In [2]:
# Download Question Answering dataset
import pandas as pd
from datasets import load_dataset
import datetime

from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode each question into a vector.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Destination of the QA parquet file, relative to the current working directory.
path = "./data/qa.parquet"


def save_qa_to_parquet(path, num_rows=5000):
    """Download SQuAD questions/answers, embed the questions, and save to parquet.

    The resulting file has the columns ``question_id``, ``questions``,
    ``answers``, ``embeddings``, ``created`` and ``datetime`` (``created``
    floored to the hour).

    Args:
        path: Destination of the parquet file. Missing parent directories
            are created.
        num_rows: Number of rows to take from the start of the SQuAD train
            split. Defaults to 5000, the previously hard-coded value.

    Relies on the module-level ``model`` (a SentenceTransformer) and on
    ``load_dataset``, ``pd`` and ``datetime`` being imported by the notebook.
    """
    from pathlib import Path

    # Load SQuAD dataset
    squad = load_dataset("squad", split=f"train[:{num_rows}]")
    # Extract questions and answers (first reference answer of each question)
    ids = squad["id"]
    questions = squad["question"]
    answers = [answer["text"][0] for answer in squad["answers"]]
    # Create dataframe
    qa = pd.DataFrame(
        zip(ids, questions, answers),
        columns=["question_id", "questions", "answers"],
    )
    # Encode all questions in a single batched call instead of one model
    # invocation per row; list() splits the 2-D result into one vector per row.
    qa["embeddings"] = list(model.encode(qa["questions"].tolist()))
    # Naive UTC timestamp, equivalent to the deprecated datetime.utcnow().
    qa["created"] = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    qa["datetime"] = qa["created"].dt.floor("h")
    # Save to parquet, making sure the target directory exists first
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    qa.to_parquet(path)


# Build the QA dataset and write it to `path`.
save_qa_to_parquet(path)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset squad (/Users/mattsharp/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [3]:
%%writefile ./qa.py

from feast import Entity, FeatureView, Field, FileSource, ValueType
from feast.types import Array, Float32, String
import os
from datetime import timedelta

path = './data/qa.parquet'

question = Entity(name="question_id", value_type=ValueType.STRING)

question_feature = Field(
    name="questions",
    dtype=String
)

answer_feature = Field(
    name="answers",
    dtype=String
)

embedding_feature = Field(name=f"embeddings", dtype=Array(Float32))

questions_view = FeatureView(
    name="qa",
    entities=[question],
    ttl=timedelta(days=1),
    schema=[question_feature, answer_feature, embedding_feature],
    source=FileSource(
      path=path,
      event_timestamp_column="datetime",
      created_timestamp_column="created",
      timestamp_field="datetime",
    ),
    tags={},
    online=True,
)

Writing ./qa.py


In [4]:
# Register the feature definitions (e.g. those written to ./qa.py) with Feast
!feast apply

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implement

In [5]:
# Materialize the features (Don't forget to update the end date)
!feast materialize-incremental 2023-11-30T00:00:00 --views qa

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implement

In [6]:
# Query the Feature Store
from feast import FeatureStore

store = FeatureStore(repo_path=".")

# Only the entity key column is needed to build the lookup rows.
ids = pd.read_parquet(path, columns=["question_id"])
entity_rows = [{"question_id": qid} for qid in ids["question_id"].tolist()]

# Fetch the three features of the `qa` view for every question id.
online_response = store.get_online_features(
    features=["qa:questions", "qa:answers", "qa:embeddings"],
    entity_rows=entity_rows,
)
feature_vectors = online_response.to_df()
print(feature_vectors.head())

                question_id  \
0  5733be284776f41900661182   
1  5733be284776f4190066117f   
2  5733be284776f41900661180   
3  5733be284776f41900661181   
4  5733be284776f4190066117e   

                                           questions  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   
2  The Basilica of the Sacred heart at Notre Dame...   
3                  What is the Grotto at Notre Dame?   
4  What sits on top of the Main Building at Notre...   

                                          embeddings  \
0  [-0.018169419839978218, 0.05504932999610901, -...   
1  [0.03145020082592964, 0.07019893825054169, 0.0...   
2  [0.05156071484088898, 0.06941291689872742, 0.0...   
3  [0.01869131810963154, 0.051365502178668976, 0....   
4  [0.017544344067573547, 0.01992807164788246, 0....   

                                   answers  
0               Saint Bernadette Soubirous  
1                a copper statue of Christ  
2   