In [2]:
import json as json
import pprint as pp
from opensearchpy import OpenSearch
import os
import pickle
import search
import index
import recipe_parser as rp
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util


## Index Setup

In [3]:
host = 'api.novasearch.org'
port = 443
user = 'user201' # Add your user name here.
password = 'Lrr1531' # Add your user password here. For testing only. Don't store credentials in code. 
index_name = user

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (user, password),
    url_prefix = 'opensearch',
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

### Index Deletion


In [None]:
index.delete_index(client, index_name)

### Index Creation

In [None]:
index.create_index(client, index_name)

## Recipes Setup

In [4]:
# check if a pickle file exists

if os.path.exists('./pickle_files/recipe_titles.pkl'):
    titles = pickle.load(open('./pickle_files/recipe_titles.pkl', 'rb'))
else:
    titles = rp.get_recipe_titles()

if os.path.exists('./pickle_files/recipe_descs.pkl'):
    descs = pickle.load(open('./pickle_files/recipe_descs.pkl', 'rb'))
else:
    descs = rp.get_recipe_descs()
    
if os.path.exists('./pickle_files/recipe_steps.pkl'):
    steps = pickle.load(open('./pickle_files/recipe_steps.pkl', 'rb'))
else:
    steps = rp.get_recipe_steps()



### Index Recipes

In [None]:
index.index_document(client, index_name, rp.get_recipes())

# Text-based Search

### Simple query search and response containing title and description

In [None]:
query = "carrot"
search.search_titleTxt(client, index_name, query)

### Search Recipes with duration

In [None]:
recipe = "Doughnut"
search.search_titleTotalTime(client, index_name, recipe)

### Search Recipes by duration

In [None]:
max_time = 60
search.search_recipeByTime(client, index_name, max_time)

### Text-Based Search using term queries

In [None]:
query = "chicken"
search.search_titleTxt_terms(client, index_name, query)

### Text-Based Search using boolean queries


In [None]:
query = "chocolate"
search.search_titleIngredients_bool(client, index_name, query)

# Encoding - Dual Encoders

In [None]:
#Mean Pooling - Take average of all tokens
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

#Encode text
def encode(texts):
    # Tokenize sentences
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)

    # Perform pooling
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)
    
    return embeddings


# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-distilbert-base-v2")
model = AutoModel.from_pretrained("sentence-transformers/msmarco-distilbert-base-v2")





### Index the embeddings

In [None]:
titles_emb = encode(titles)


with open('./pickle_files/title_embeddings.pickle', 'wb') as f:
    pickle.dump(titles_emb, f)
    

index.index_titleEmbeddings(client, index_name, titles)


In [None]:
descs_emb = encode(descs)

with open('./pickle_files/desc_embeddings.pickle', 'wb') as f:
   pickle.dump(descs_emb, f)

index.index_descEmbeddings(client, index_name, descs)

### Embedding Title Search

### Title embedding

In [None]:
query = "cake"
query_emb = encode(query)

search.search_titleEmbedding(client, index_name, query_emb)

### Description embedding

In [None]:
query = "chicken marsala"
query_emb = encode(query)

search.search_title_descEmbedding(client, index_name, query_emb)

### Sentence Transformers


In [5]:

model = SentenceTransformer("all-MiniLM-L6-v2")

description_emb_titles = model.encode(titles)

# Print the embeddings
for sentence, embedding in zip(titles, description_emb_titles):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")


Sentence: How To Make Chicken Parmesan
Embedding: [-7.17017502e-02 -8.04599896e-02 -3.01727764e-02 -6.04595505e-02
 -3.34300883e-02 -6.17256984e-02  3.56401317e-03 -9.54566617e-03
 -2.17431982e-04 -5.10126799e-02  2.20372491e-02 -4.74636331e-02
 -5.47231510e-02 -2.00091749e-02 -1.89878792e-02 -3.91439721e-02
 -9.01709031e-03 -1.35850208e-03  4.83918749e-02 -6.45890385e-02
 -1.74076017e-02 -5.14411367e-02  9.30754642e-04 -2.53194179e-02
 -1.02665666e-02  2.02082307e-03  7.51807615e-02  5.31816036e-02
  2.51335558e-02  1.80641044e-04  1.05545133e-01 -5.95339760e-02
  3.13966274e-02  1.33084115e-02 -2.68376153e-02  2.84859743e-02
 -2.21805880e-03  3.18310373e-02  9.38644260e-02  5.94721781e-03
  5.28021269e-02 -3.64296958e-02  6.89317212e-02 -7.85260722e-02
  9.55068842e-02  5.22513734e-03 -5.88852540e-03  7.89289698e-02
  1.29615460e-02 -7.65259862e-02 -1.24328248e-02 -2.25321855e-02
 -2.98503451e-02 -4.77050915e-02  1.18702259e-02  3.78689282e-02
 -1.35318354e-01 -3.65905724e-02 -5.4418

### Can be used to also compare sentence similiaritiesa

In [8]:

# Compute cosine similarity between all pairs
cos_sim_titles = util.cos_sim(description_emb_titles, description_emb_titles)


In [9]:
# Add all pairs to a list with their cosine similarity score
all_sentence_combinations = []
for i in range(len(cos_sim_titles) - 1):
    for j in range(i + 1, len(cos_sim_titles)):
        all_sentence_combinations.append([cos_sim_titles[i][j], i, j])

# Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[0:5]:
    #ignore if its a 10 character string
    print("{} \t {} \t {:.4f}".format(titles[i], titles[j], cos_sim_titles[i][j]))

Top-5 most similar pairs:
Stuffed Bell Peppers 	 Stuffed Bell Peppers 	 1.0000
Nacho Dog 	 Nacho Dog 	 1.0000
Basil Oil 	 Basil Oil 	 1.0000
Banana Bread 	 Banana Bread 	 1.0000
Banana Bread 	 Banana Bread 	 1.0000


### By trying to search a recipe

In [74]:
question_emb = model.encode("Mousse")
corpus_emb = model.encode(titles)
cos_sin = util.cos_sim(question_emb, corpus_emb)

#Print best 5 matches 
results = util.semantic_search(question_emb, corpus_emb, top_k=5)
idx =[]
for i in range(len(results[0])):
    idx.append(results[0][i]['corpus_id'])
    
print ("The 5 most similar recipes are:")
for i in idx:
    print(titles[i], "-- similiarity of --", cos_sin[0][i])

The 5 most similar recipes are:
Gnocchi I -- similiarity of -- tensor(0.4197)
English Muffins -- similiarity of -- tensor(0.4189)
English Muffin Pizzas -- similiarity of -- tensor(0.3840)
English Muffin Mummy Pizzas -- similiarity of -- tensor(0.3826)
Lemon Linguine -- similiarity of -- tensor(0.3768)


### Check next steps

In [85]:
recipe_steps = rp.get_steps(rp.get_recipes()[str(idx[0])])
question_emb = model.encode("gnocchi")
corpus_emb = model.encode(recipe_steps)
cos_sin = util.cos_sim(question_emb, corpus_emb)


#Print best 5 matches 
results = util.semantic_search(question_emb, corpus_emb, top_k=3)
idx2 =[]
for i in range(len(results[0])):
    idx2.append(results[0][i]['corpus_id'])
    
print (f"You are in this step:  {recipe_steps[idx2[0]]}")
print("The following steps are :")
for i in range(len(recipe_steps)):
    if i > idx2[0]:
        print()
        print(recipe_steps[i], "-- similiarity of --", cos_sin[0][i])
    elif i== len(recipe_steps)-1:
        print("Congratulations! You have completed the recipe!")

    


You are in this step:  Bring a large pot of lightly salted water to a boil. Drop in gnocchi and cook for 3 to 5 minutes or until gnocchi have risen to the top; drain and serve.
The following steps are :
Congratulations! You have completed the recipe!
