In [None]:
# Read the records from the last (leaf) level of the product hierarchy.
import pandas as pd
# Load the dairy products CSV from the lakehouse "Files" area into a pandas DataFrame.
# The path is built from "/lakehouse/default/" + "Files/masterdata/product_hier/dairy_products.csv".
df = pd.read_csv("/lakehouse/default/" + "Files/masterdata/product_hier/dairy_products.csv")
# Show the loaded DataFrame in the cell output.
display(df)


This is how the DataFrame looks:
(first record)

ID	    product_hier	description

DR001	Whole milk	    All forms of whole milk, meaning full fat.


We will add a new column that concatenates product_hier and description. For the first record it will look like:
Category: Whole milk. Description: All forms of whole milk, meaning full fat.


In [None]:
# Create new column text_to_embedd: the text that will be embedded for each hierarchy node, e.g.
#   "Category: Whole milk. Description: All forms of whole milk, meaning full fat."
# A space follows "Description:" so the label is not glued to the description text
# (same spacing as after "Category:").
df['text_to_embedd'] = (
    "Category: " + df['product_hier'].map(str)
    + ". Description: " + df['description'].map(str)
)
display(df)

In [None]:
# Now let's prepare to call the OpenAI model to generate and embedding
!pip install openai
!pip install tiktoken
import openai
import re
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken

# Tidy free text before tokenizing/embedding it: whitespace runs, stray ". ," sequences
# and doubled dots are cleaned up. Other characters are left untouched.
def normalize_text(s, sep_token = " \n "):
    """Return `s` with whitespace collapsed and stray dot sequences tidied.

    Args:
        s: Input text.
        sep_token: Unused; kept so existing callers that pass it keep working.

    Returns:
        The cleaned string, stripped of leading/trailing whitespace.
    """
    # Collapse every run of whitespace (spaces, tabs, newlines) into one space.
    # After this step no newline is left, so no separate "\n" removal is needed.
    s = re.sub(r'\s+', ' ', s).strip()
    # Drop literal ". ," sequences. The dot must be escaped: an unescaped "." matches
    # any character and would delete the last letter of a word followed by " ,".
    s = re.sub(r"\. ,", "", s)
    # Reduce doubled dots (with or without a space in between) to a single dot.
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    return s.strip()

# We need to use tokenizer to get the number of tokens before calling the embedding
tokenizer = tiktoken.get_encoding("cl100k_base")

# Connection settings for the Azure OpenAI resource (legacy module-level configuration).
import os  # local import: only this configuration step needs it

openai.api_type = "azure"
openai.api_base = "https://yourservicename.openai.azure.com/"
openai.api_version = "2023-03-15-preview"
# Never share this key with anyone or leave it in a notebook, repo, etc.
# The key is read from the AZURE_OPENAI_API_KEY environment variable when it is set;
# the placeholder below is only a fallback and is not a usable key.
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY", "yoursecretkey")

# Clean up each text (double spaces, doubled dots, etc.) before it is embedded
df['text_to_embedd'] = df['text_to_embedd'].map(normalize_text)
# New column holding the token count of each cleaned text
df['n_tokens'] = df["text_to_embedd"].map(lambda text: len(tokenizer.encode(text)))


# Call the OpenAI API model to get one embedding per text.
# `engine` should be set to the deployment name you chose when you deployed the
# text-embedding-ada-002 (Version 2) model.
df['vector'] = df["text_to_embedd"].map(
    lambda text: get_embedding(text, engine = 'dep-ada002')
)

display(df)



In [None]:
# Now we simulate that we are loading a new product,
# in this case Emmental cheese, and we will try to locate the category for
# this product using embeddings

# Define the data
data = {
    "Product_ID": [1],
    "Article": ["Emmental Cheese"],
    "Description": [
        "Swiss cheese, world-famous for its distinctive holes and one-of-a-kind flavor. Aged minimum for 4 four months"
    ],
}

# Create the DataFrame
df_newproduct = pd.DataFrame(data)

# Create new column text_to_embedd, e.g. "Article: <article>. Description: <description>".
# A space follows "Description:" so the label is not glued to the description text.
df_newproduct['text_to_embedd'] = (
    "Article: " + df_newproduct['Article'].map(str)
    + ". Description: " + df_newproduct['Description'].map(str)
)
# Apply the same clean-up that the catalog texts go through before embedding, so the
# query text and the texts it is compared against are preprocessed identically.
df_newproduct['text_to_embedd'] = df_newproduct['text_to_embedd'].apply(normalize_text)
display(df_newproduct)

# Now we call the OpenAI API model to get the embedding.
# `engine` should be set to the deployment name you chose when you deployed the
# text-embedding-ada-002 (Version 2) model.
df_newproduct['vector'] = df_newproduct["text_to_embedd"].apply(lambda x : get_embedding(x, engine = 'dep-ada002'))

display(df_newproduct)

In [26]:
# Search the product hierarchy for the nodes closest to a given embedding
def search_hier_node(df, v_to_search, top_n=3, to_print=True):
    """Return the `top_n` rows of `df` most similar to the embedding `v_to_search`.

    Similarity is the cosine similarity between each row's `vector` and
    `v_to_search`: dot(a, b) / (|a| * |b|).

    Args:
        df: DataFrame with a `vector` column holding one embedding per row.
        v_to_search: Embedding to compare against (sequence of numbers).
        top_n: Number of best matches to return.
        to_print: When true, print the result before returning it.

    Returns:
        A new DataFrame with the columns of `df` plus `similarities`, sorted by
        descending similarity and limited to `top_n` rows. `df` itself is not
        modified.
    """
    # Local import keeps the function self-contained; cosine similarity is computed
    # here instead of via openai.embeddings_utils, which newer openai releases dropped.
    import numpy as np

    query = np.asarray(v_to_search, dtype=float)
    query_norm = np.linalg.norm(query)

    def _cosine(vec):
        vec = np.asarray(vec, dtype=float)
        return float(np.dot(vec, query) / (np.linalg.norm(vec) * query_norm))

    # assign() scores a copy, so the caller's DataFrame does not silently gain a column.
    scored = df.assign(similarities=df["vector"].apply(_cosine))

    res = scored.sort_values("similarities", ascending=False).head(top_n)
    if to_print:
        print(res)
    return res

# Look up the three hierarchy nodes closest to the new product's embedding
query_vector = df_newproduct['vector'].iloc[0]
res = search_hier_node(df, query_vector, top_n=3, to_print=False)

# Keep only the identifying columns and the score for display
res = res.reindex(columns=['ID', 'product_hier', 'similarities'])
display(res)

StatementMeta(, 2281c244-32b8-44dd-ab97-e9917cfc0fdc, 28, Finished, Available)



SynapseWidget(Synapse.DataFrame, 737bab8b-4d4c-427f-8279-7040594cf2b8)