# Connect to the API

In [1]:
import getpass
import os
import pprint as pp

import requests
from opensearchpy import OpenSearch
from opensearchpy import helpers

host = 'api.novasearch.org'
port = 443

# Credentials are read from the environment so that no secret is stored in the
# notebook. OPENSEARCH_USER falls back to the account name used so far; the
# password has no fallback and is prompted for when OPENSEARCH_PASSWORD is unset.
# NOTE(review): the password that used to be written on this line has been
# shared with the notebook and should be treated as compromised — rotate it.
user = os.environ.get('OPENSEARCH_USER', 'user201')
password = os.environ.get('OPENSEARCH_PASSWORD') or getpass.getpass(
    f'OpenSearch password for {user}: '
)
index_name = user  # the index used throughout the notebook is named after the account

# Create the client with SSL/TLS enabled, but certificate and hostname
# verification disabled.
# NOTE(review): verify_certs=False / ssl_assert_hostname=False accept any server
# certificate; keep this for testing only.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (user, password),
    url_prefix = 'opensearch',
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

<module 'opensearchpy.helpers.errors' from '/Users/joao/anaconda3/envs/nlp-cv-ir/lib/python3.9/site-packages/opensearchpy/helpers/errors.py'>

# Delete an index

In [17]:
# Drop the working index so the notebook can rebuild it from scratch.
# The existence check keeps this cell from raising on a fresh run (or when it is
# executed twice), where there is no index to delete yet.
if client.indices.exists(index=index_name):
    response = client.indices.delete(index=index_name)
    print(response)
else:
    print(f"Index '{index_name}' does not exist; nothing to delete.")

{'acknowledged': True}


# Index creation

In [3]:
# Index-level settings, sent under "settings.index".
index_settings = {
    "number_of_replicas": 0,
    "number_of_shards": 4,
    # "refresh_interval": "-1",  -- it's crashing the search
    "knn": True,
    "knn.algo_param.ef_search": 100,
}

# Mapping of the dense-vector field that holds the title embeddings.
# NOTE(review): dimension 768 presumably matches the output size of the encoder
# used to embed the titles — confirm if the model is changed.
title_embedding_field = {
    "type": "knn_vector",
    "dimension": 768,
    "method": {
        "name": "hnsw",
        "space_type": "innerproduct",
        "engine": "faiss",
        "parameters": {
            "ef_construction": 256,
            "m": 48,
        },
    },
}

# Full request body for index creation: settings plus one mapping per field.
index_body = {
    "settings": {"index": index_settings},
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "description": {"type": "text"},
            "title_embedding": title_embedding_field,
            "ingredients": {"type": "keyword"},
            "duration": {"type": "integer"},
            "steps": {"type": "text"},
        }
    },
}

# Create the index only when it is missing; an existing index is left untouched,
# so changed mappings are NOT applied until the index is deleted first.
index_exists = client.indices.exists(index=index_name)
if not index_exists:
    response = client.indices.create(index_name, body=index_body)
    print('\nCreating index:')
    print(response)
else:
    print("Index already existed. You may force the new mappings.")


Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'user201'}


# Document creation

In [4]:
import json as json
import doc


# Load the recipe collection. The encoding is given explicitly so the file is
# decoded the same way regardless of the platform's default locale.
with open("./recipes/recipes_data.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

# Display names of all recipes, in the same order as the records in `data`.
titles = [recipe['displayName'] for recipe in data.values()]

# Push the records to the index via the project-local `doc` helper module.
doc.index_document(client, index_name, data)




created: How To Make Chicken Parmesan
created: How to Make Pesto
created: How To Make Corn Tortillas From Scratch
created: How To Make Elote (Mexican Street Corn)
created: How To Make Macarons
created: How To Make Meringue
created: How To Make Handmade Pasta
created: How to Make Perfect Polenta
created: How To Make Miso Soup
created: How to Make Perfect Guacamole
created: How to Cook Trout
created: How to Cook a Turkey
created: How to Cook: Boiled Eggs
created: How To Cook A Perfect Risotto
created: How to Cook: Crispy Tofu
created: How to Cook: Brown Rice
created: How To Cook Salmon in the Oven
created: How To Cook Brown Rice
created: How To Cook Filet Mignon
created: How To Cook Lentils on the Stove
created: Sylvia's World Famous Talked About Spareribs
created: Holiday Eggnog
created: Holiday Pizza
created: Holiday Salad
created: Holiday Seafood Pot
created: Holiday Chicken Salad
created: Holiday milk punch
created: Holiday Manhattan
created: Snickerdoodles I
created: Holiday Pumpkin

AttributeError: module 'doc' has no attribute 'embeddings_index'

# Searches and embeddings

In [5]:
# Display the raw record stored under key '993' to inspect which fields a recipe carries.
data['993']

{'displayName': 'Full House Martini',
 'description': None,
 'canonicalName': 'Full House Martini',
 'prepTimeMinutes': 5,
 'cookTimeMinutes': None,
 'totalTimeMinutes': 5,
 'cookingMethod': None,
 'difficultyLevel': None,
 'images': [{'url': 'https://m.media-amazon.com/images/S/alexa-kitchen-msa-na-prod/recipes/foodnetwork/f38926516590827e25b36d00327c0d976de77bc4876c4c2b45666b9a7cfe4481.jpg',
   'hdURL': None,
   'fourKURL': None,
   'description': None,
   'type': 'UNKNOWN'}],
 'videos': [{'providerId': None,
   'title': None,
   'url': 'https://d3891lgtxgmxvq.cloudfront.net/recipes/foodnetwork/50edcb738ae2e6e885eb4a19c789fe37226f6c62c8d71975ef78eea52d7f434c.m3u8',
   'mobileUrl': 'https://d3891lgtxgmxvq.cloudfront.net/recipes/foodnetwork/50edcb738ae2e6e885eb4a19c789fe37226f6c62c8d71975ef78eea52d7f434c.mp4',
   'hdURL': None,
   'fourKURL': None,
   'description': None,
   'type': 'EPISODE'}],
 'tools': [],
 'cuisines': None,
 'meals': None,
 'courses': None,
 'occasions': None,
 'di

In [6]:
qtxt = "martini"

# Lexical search restricted to the title field: some recipes don't have a description
title_match = {'multi_match': {'query': qtxt, 'fields': ['title']}}
query_bm25 = {
    'size': 5,
    '_source': ['title', 'description'],
    'query': title_match,
}

response = client.search(index=index_name, body=query_bm25)

# Show the returned hits, then the total number of matching documents.
hits = response['hits']
print('\nSearch results:')
pp.pprint(hits['hits'])
print(hits['total']['value'])




Search results:
[{'_id': 'gekmsI4B7xTIhwxmYF0W',
  '_index': 'user201',
  '_score': 5.739658,
  '_source': {'description': None, 'title': 'Dirty Martini (Vodka)'},
  '_type': '_doc'},
 {'_id': 'v-kmsI4B7xTIhwxmb10F',
  '_index': 'user201',
  '_score': 5.3225455,
  '_source': {'description': None, 'title': 'Flat white martini'},
  '_type': '_doc'},
 {'_id': 'fukmsI4B7xTIhwxmX11q',
  '_index': 'user201',
  '_score': 5.1140876,
  '_source': {'description': None, 'title': 'Vodka Martini'},
  '_type': '_doc'},
 {'_id': 'gOkmsI4B7xTIhwxmX13i',
  '_index': 'user201',
  '_score': 4.8062778,
  '_source': {'description': None, 'title': 'The Classic Vodka Martini'},
  '_type': '_doc'},
 {'_id': '6ekmsI4B7xTIhwxmeF3O',
  '_index': 'user201',
  '_score': 4.55963,
  '_source': {'description': None, 'title': 'Cotton Candy Martini'},
  '_type': '_doc'}]
7


In [7]:
qtxt = 'strawberry'

# Same title-only query as before, assembled step by step
# (the description is left out because some recipes don't have one).
query_bm25 = {'size': 5, '_source': ['title', 'description']}
query_bm25['query'] = {
    'multi_match': {
        'query': qtxt,
        'fields': ['title'],
    }
}

response = client.search(index=index_name, body=query_bm25)

print('\nSearch results:')
result_hits = response['hits']['hits']
pp.pprint(result_hits)
# number of results
total_matches = response['hits']['total']['value']
print(total_matches)




Search results:
[{'_id': 'qekmsI4B7xTIhwxmLFxN',
  '_index': 'user201',
  '_score': 5.3225455,
  '_source': {'description': 'Strawberry in two forms infuses this layer cake '
                             'with beautiful color and natural flavor.',
              'title': 'Double Strawberry Cake'},
  '_type': '_doc'},
 {'_id': 'ZOkmsI4B7xTIhwxmG1wJ',
  '_index': 'user201',
  '_score': 5.0756874,
  '_source': {'description': None,
              'title': 'Strawberry Pudding and Macerated Strawberries with '
                       'Strawberry Sugar'},
  '_type': '_doc'},
 {'_id': 'ZekmsI4B7xTIhwxmG1xD',
  '_index': 'user201',
  '_score': 4.93254,
  '_source': {'description': None,
              'title': 'Strawberry Sauce for Strawberry Shortcake'},
  '_type': '_doc'},
 {'_id': '2-kmsI4B7xTIhwxmN1z0',
  '_index': 'user201',
  '_score': 4.8062778,
  '_source': {'description': None, 'title': 'Strawberry Cake from Scratch'},
  '_type': '_doc'},
 {'_id': 'e-klsI4B7xTIhwxm4VuW',
  '_index': 'use

### Boolean queries

In [8]:
qtxt = "Ingredients for chicken marsala"

# "must" clause: a document is only returned if its description matches.
required_clause = {"match": {"description": "chicken marsala"}}

# "should" clause: optional match of the full question on title/description,
# which only contributes to the score.
optional_clause = {
    "multi_match": {
        "query": qtxt,
        "fields": ["title", "description"],
    }
}

query_bm25 = {
    "size": 5,
    "_source": ["title", "ingredients"],
    "query": {
        "bool": {
            "must": [required_clause],
            "should": [optional_clause],
        }
    },
}

response = client.search(index=index_name, body=query_bm25)

print('\nSearch results:')
pp.pprint(response)




Search results:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4},
 'hits': {'hits': [{'_id': 'a-kmsI4B7xTIhwxmHFyS',
                    '_index': 'user201',
                    '_score': 12.884975,
                    '_source': {'ingredients': ['butter',
                                                'oil',
                                                'chicken breast',
                                                'salt',
                                                'flour',
                                                'cream',
                                                'chicken broth',
                                                'Alcoholic beverage, wine, '
                                                'table, all',
                                                'garlic',
                                                'mushrooms',
                                                'butter',
                                                'l

### Dual encoders


In [11]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import pickle

In [13]:



# Mean pooling - mask-weighted average of the token embeddings
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: object exposing ``last_hidden_state``, the per-token
            embeddings produced by the model.
        attention_mask: tensor with one entry per token; it is broadcast over
            the embedding axis and used as the averaging weight.

    Returns:
        Tensor with the token axis (dim 1) reduced to the weighted mean.
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask over the embedding dimension so it can weight every component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

# Encode text into L2-normalised sentence embeddings
def encode(texts):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    The input is tokenized with padding and truncation, run through the model
    without gradient tracking, mean-pooled over tokens and finally normalised
    to unit L2 norm along dim 1.

    Args:
        texts: text input handed directly to the tokenizer.

    Returns:
        Tensor of normalised embeddings as produced by ``F.normalize``.
    """
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**batch, return_dict=True)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)


# Load the tokenizer and the encoder weights from the HuggingFace Hub.
# Both must come from the same checkpoint, so the name is written once.
MODEL_NAME = "sentence-transformers/msmarco-distilbert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


# Embed every recipe title in a single call, then store the resulting tensor
# on disk so it can be reloaded without re-running the encoder.
doc_emb = encode(titles)

with open('embeddings.pickle', mode='wb') as embeddings_file:
    pickle.dump(doc_emb, embeddings_file)











In [9]:
# Hand the titles to the project-local `doc` helper.
# NOTE(review): presumably this embeds each title and stores the vector in the
# index's `title_embedding` field — confirm against doc.py, which is not shown here.
doc.index_embeddings(client, index_name,titles)

created: How To Make Chicken Parmesan
created: How to Make Pesto
created: How To Make Corn Tortillas From Scratch
created: How To Make Elote (Mexican Street Corn)
created: How To Make Macarons
created: How To Make Meringue
created: How To Make Handmade Pasta
created: How to Make Perfect Polenta
created: How To Make Miso Soup
created: How to Make Perfect Guacamole
created: How to Cook Trout
created: How to Cook a Turkey
created: How to Cook: Boiled Eggs
created: How To Cook A Perfect Risotto
created: How to Cook: Crispy Tofu
created: How to Cook: Brown Rice
created: How To Cook Salmon in the Oven
created: How To Cook Brown Rice
created: How To Cook Filet Mignon
created: How To Cook Lentils on the Stove
created: Sylvia's World Famous Talked About Spareribs
created: Holiday Eggnog
created: Holiday Pizza
created: Holiday Salad
created: Holiday Seafood Pot
created: Holiday Chicken Salad
created: Holiday milk punch
created: Holiday Manhattan
created: Snickerdoodles I
created: Holiday Pumpkin

In [14]:
# Compute the query embedding
query = "cake"
query_emb = encode(query)

# Number of results to return. `k` is tied to `size`: each shard only returns
# its k nearest neighbours, so a k smaller than size (previously 2 vs 5) cannot
# guarantee that the hits returned are the true nearest titles.
top_k = 5

query_denc = {
    'size': top_k,
    '_source': ['title', 'description'],
    'query': {
        'knn': {
            'title_embedding': {
                # Plain list of floats: serialisable without relying on the
                # client's special handling of numpy arrays.
                'vector': query_emb[0].tolist(),
                'k': top_k
            }
        }
    }
}

response = client.search(
    body = query_denc,
    index = index_name
)

print('\nSearch results:')
pp.pprint(response)


Search results:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4},
 'hits': {'hits': [{'_id': 'qeknsI4B7xTIhwxm5V-L',
                    '_index': 'user201',
                    '_score': 1.7172348,
                    '_source': {'title': 'My Birthday Cake'},
                    '_type': '_doc'},
                   {'_id': 'E-kosI4B7xTIhwxmQmG5',
                    '_index': 'user201',
                    '_score': 1.689884,
                    '_source': {'title': "A Chocolate Cake That's Got It All"},
                    '_type': '_doc'},
                   {'_id': 'hekosI4B7xTIhwxmHWD0',
                    '_index': 'user201',
                    '_score': 1.6858522,
                    '_source': {'title': 'Christmas Cake Cookies'},
                    '_type': '_doc'},
                   {'_id': 'CuknsI4B7xTIhwxm_mAw',
                    '_index': 'user201',
                    '_score': 1.6652473,
                    '_source': {'title': 'Red Velvet Cake'

In [None]:
# Disabled fine-tuning sketch: the code below is wrapped in a string literal, so
# nothing in it is executed (the cell merely echoes the string).
# NOTE(review): the snippet is incomplete — `train_examples` is used by the
# DataLoader but never defined — so it would fail if simply un-quoted.
'''
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

#Define the model. Either from scratch of by loading a pre-trained model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

#Define your train examples. You need more than just two examples to get good results from the model im using



#Define your train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

#Tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
'''

"\nfrom sentence_transformers import SentenceTransformer, InputExample, losses\nfrom torch.utils.data import DataLoader\n\n#Define the model. Either from scratch of by loading a pre-trained model\nmodel = SentenceTransformer('distilbert-base-nli-mean-tokens')\n\n#Define your train examples. You need more than just two examples to get good results from the model im using\n\n\n\n#Define your train dataset, the dataloader and the train loss\ntrain_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)\ntrain_loss = losses.CosineSimilarityLoss(model)\n\n#Tune the model\nmodel.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)\n"