# Movies recommendation with Azure Open AI & Azure Cognitive Search
## Part 1: Embeddings generation with Azure Open AI and Azure Cognitive Search ingestion

<img src="https://github.com/retkowsky/images/blob/master/movies_search.png?raw=true">

In [None]:
#%pip install azure-search-documents --pre --upgrade

In [None]:
import json
import math
import openai
import os
import pandas as pd
import pickle
import pytz
import requests
import sys
import tiktoken
import time

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    VectorSearch,
    SimpleField,
    SemanticSettings,
    SemanticField,
    SemanticConfiguration,
    SearchIndex,
    SearchFieldDataType,
    SearchField,
    SearchableField,
    PrioritizedFields,
    HnswVectorSearchAlgorithmConfiguration,
)

from datetime import datetime
#from dotenv import load_dotenv
from openai.embeddings_utils import get_embedding, cosine_similarity
from tqdm import tqdm

In [None]:
sys.version

In [None]:
# Look up the local timezone from the public IP address. This needs outbound
# network access, so fall back to UTC instead of aborting the notebook when
# the lookup fails, times out or returns an unexpected payload.
try:
    ip_info = requests.get("https://ipinfo.io", timeout=10)
    ip_info.raise_for_status()
    local_tz = pytz.timezone(ip_info.json()["timezone"])
except (
    requests.RequestException,
    KeyError,
    ValueError,
    pytz.UnknownTimeZoneError,
):
    print("Cannot detect the local timezone. Using UTC instead.")
    local_tz = pytz.utc

print("Local time:", datetime.now(local_tz).strftime("%d-%b-%Y %H:%M:%S"))

In [None]:
print("Open AI version:", openai.__version__)

Load configuration settings from GitHub Codespace secrets

In [None]:
# load_dotenv("azure.env")  # Alternative: read the settings from a local env file

# Azure Open AI settings, read from the AZURE_OPENAI_* environment variables
# (an earlier revision read OPENAI_API_KEY, OPENAI_API_BASE and
# OPENAI_API_VERSION instead)
openai.api_type = "azure"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY")
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.environ.get("AZURE_OPENAI_MODEL_CHAT_VERSION")

# Azure Cognitive Search settings: service endpoint and admin key
acs_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
acs_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

Make sure you include your embedding model deployment name (it may be different)

In [None]:
# Azure Open AI embeddings model to use
embeddings_engine = "text-embedding-ada-002-Unai"

- Vector search is in public preview
- Model name: text-embedding-ada-002
- Model version: 2
- API version: 2023-05-15

In [None]:
# Azure Cognitive Search index name to create
index_name = "moviereview"

## 0. Azure Cognitive Search vector store
<img src="https://github.com/retkowsky/images/blob/master/vector_search_architecture.png?raw=true">

## 1. Data

In [None]:
EXCEL_FILE = "movies.xlsx"

!ls $EXCEL_FILE -lh

In [None]:
df = pd.read_excel(EXCEL_FILE)

In [None]:
# Title and year are handled as text in the rest of the notebook
for text_column in ("title", "year"):
    df[text_column] = df[text_column].astype(str)

# Columns that are not kept
columns_to_drop = ["tagline", "website"]
df = df.drop(columns=columns_to_drop)

In [None]:
df.head(5)

In [None]:
df.shape

In [None]:
df = df.drop_duplicates()
df.shape

In [None]:
df.head(5)

In [None]:
# Removing some extra spaces: every run of two or more spaces is collapsed
# into a single space. A plain "  " -> " " replacement is a single pass and
# leaves a double space behind wherever three or more spaces were present.
for text_column in ["description", "title", "cast", "director", "genres"]:
    df[text_column] = df[text_column].str.replace(r" {2,}", " ", regex=True)

Analyze number of tokens needed for description field embedding

In [None]:
tokenizer = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """
    Number of tokens produced by the tokenizer for a text
    """
    return len(tokenizer.encode(text))


# Token count of each description; only rows below 8192 tokens are kept
df["nb_tokens"] = df["description"].apply(count_tokens)
df = df[df["nb_tokens"] < 8192]
len(df)

In [None]:
df.head(5)

In [None]:
df["nb_tokens"].describe()

In [None]:
df = df.drop("nb_tokens", axis=1)

In [None]:
df.shape

## 2. Generating text embeddings with Azure Open AI

### Vector embeddings

In [None]:
print("Embedding engine:", embeddings_engine)

In [None]:
def openai_text_embeddings(text):
    """
    Embed a text with the Azure Open AI embeddings deployment
    Input: text
    Output: embedding of the text (the "embedding" value of the first item
    of the response data)
    """
    response = openai.Embedding.create(input=text, deployment_id=embeddings_engine)
    first_item = response["data"][0]

    return first_item["embedding"]

In [None]:
emb = openai_text_embeddings("My name is James Bond")
emb[:5]

In [None]:
print("Size of the embeddings =", len(emb))

### Running the embedding for the 'description/overview' column (this took about 20 minutes for me; it may be faster on other machines)

In [None]:
print("Running the embedding process...")
df["embed_overview"] = None


def apply_embedding(row):
    """
    Azure Open AI text embedding of the 'description' value of a row.
    Advances the module-level progress bar by one step per row.
    """
    vector = get_embedding(row["description"], engine=embeddings_engine)
    pbar.update(1)  # One more row processed
    return vector


# The progress bar is sized on the number of rows to embed
with tqdm(total=len(df)) as pbar:
    df["embed_overview"] = df.apply(apply_embedding, axis=1)

### Running the embedding for the 'title' column

In [None]:
print("Running the embedding process...")
df["embed_title"] = None


def apply_embedding(row):
    """
    Azure Open AI text embedding of the 'title' value of a row.
    Advances the module-level progress bar by one step per row.
    """
    vector = get_embedding(row["title"], engine=embeddings_engine)
    pbar.update(1)  # One more row processed
    return vector


# The progress bar is sized on the number of rows to embed
with tqdm(total=len(df)) as pbar:
    df["embed_title"] = df.apply(apply_embedding, axis=1)

### Saving the documents (initial data + embeddings) into a file

In [None]:
df.head(5)

In [None]:
df.shape

In [None]:
documents = df.to_dict(orient="records")
print("Number of documents =", len(documents))

In [None]:
# Destination of the pickled documents (initial data + embeddings)
PKL_DIR = "embeddings"
PKL_FILE = "movies.pkl"
pkl_path = os.path.join(PKL_DIR, PKL_FILE)

# The target directory is created when it does not exist yet
os.makedirs(PKL_DIR, exist_ok=True)

print("Saving documents...")
with open(pkl_path, "wb") as pkl_file:
    pickle.dump(documents, pkl_file)
print("Done")

# Content of the target directory
os.listdir(PKL_DIR)

## 3. Cosine similarity principles

In [None]:
def get_cosine_similarity(vector1, vector2):
    """
    Get cosine similarity value between two embedded vectors
    Inputs: 2 embedded vectors (sequences of numbers)
    Output: cosine similarity value rounded to 15 decimals, or None when the
    vectors have different lengths or when the similarity is undefined
    (empty vector or vector with a zero magnitude)
    """
    if len(vector1) != len(vector2):
        return None

    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    magnitude1 = math.sqrt(sum(x * x for x in vector1))
    magnitude2 = math.sqrt(sum(x * x for x in vector2))
    norm_product = magnitude1 * magnitude2

    # Guard against a division by zero: empty or all-zero vectors have no
    # direction, so the cosine similarity cannot be computed.
    if norm_product == 0:
        return None

    # Named "similarity" so that it does not shadow the cosine_similarity
    # function imported at the top of the file.
    similarity = round(dot_product / norm_product, 15)

    if similarity == 1:
        color, label = "\033[1;31;34m", "Strictly identical text"
    elif similarity >= 0.8:
        color, label = "\033[1;31;32m", "Same semantic text"
    else:
        color, label = "\033[1;31;91m", "Different semantic text"

    # The trailing reset sequence stops the color from leaking into the
    # output printed after this message.
    print(f"{color}{label}: Cosine similarity = {similarity}\033[0m")

    return similarity

In [None]:
emb1 = openai_text_embeddings("My name is James Bond")
emb2 = openai_text_embeddings("Sean Connery.")
emb3 = openai_text_embeddings("Azure Open AI is great!")

In [None]:
get_cosine_similarity(emb1, emb1)

In [None]:
get_cosine_similarity(emb1, emb2)

In [None]:
get_cosine_similarity(emb1, emb3)

In [None]:
get_cosine_similarity(emb2, emb1)

In [None]:
get_cosine_similarity(emb2, emb3)

## 4. Quick local tests (without Azure Cognitive Search)

In [None]:
def quick_search(df, user_query, top_n=3):
    """
    Rank the documents of a dataframe against a user query
    Inputs: dataframe with an 'embed_overview' column, query and topn
    Output: the top_n rows with the highest cosine similarity
    Note: a 'cosine_similarity' column is written into the input dataframe
    """
    query_vector = get_embedding(user_query, engine=embeddings_engine)

    def score(document_vector):
        # Similarity between one document embedding and the query embedding
        return cosine_similarity(document_vector, query_vector)

    df["cosine_similarity"] = df["embed_overview"].apply(score)
    ranked = df.sort_values("cosine_similarity", ascending=False)
    results = ranked.head(top_n)
    display(results)

    return results

In [None]:
results = quick_search(df, "I want to see some Terminator movies", top_n=3)

In [None]:
results = quick_search(df, "Je veux voir un film de James Bond", top_n=3)

In [None]:
results = quick_search(df, "Quiero ver películas de ciencia ficción", top_n=5)

In [None]:
results = quick_search(df, "Voglio vedere dei film musicali", top_n=5)

In [None]:
results = quick_search(df, "音楽映画が観たい", top_n=5)

## 5. Azure Cognitive Search functions

In [None]:
def delete_index(index_name):
    """
    Deleting an Azure Cognitive Search index
    Input: Azure Cognitive Search index
    Output: None (the outcome is printed; a failed deletion is reported
    together with the underlying error instead of being raised)
    """
    start = time.time()
    search_client = SearchIndexClient(
        endpoint=acs_endpoint, credential=AzureKeyCredential(acs_key)
    )

    try:
        print("Deleting the Azure Cognitive Search index:", index_name)
        search_client.delete_index(index_name)
    except Exception as error:
        # Best effort: keep the notebook running, but show the real cause.
        # Unlike a bare "except:", this lets KeyboardInterrupt propagate.
        print("Cannot delete index. Check the index name.")
        print("Error details:", error)
    else:
        print("Done. Elapsed time:", round(time.time() - start, 2), "secs")

In [None]:
def index_stats(index_name):
    """
    Get statistics about Azure Cognitive Search index
    Input: Azure Cognitive Search index
    Output: (document count, storage size) read from the 'documentCount' and
    'storageSize' fields of the stats response, or (None, None) when the
    request does not return an HTTP 200 status
    """
    url = f"{acs_endpoint}/indexes/{index_name}/stats?api-version=2021-04-30-Preview"
    headers = {
        "Content-Type": "application/json",
        "api-key": acs_key,
    }
    response = requests.get(url, headers=headers)
    print("Azure Cognitive Search index status for:", index_name, "\n")

    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        # No statistics are available: return placeholders rather than
        # reaching the final return with unbound local variables.
        return None, None

    res = response.json()
    print(json.dumps(res, indent=2))

    return res["documentCount"], res["storageSize"]

In [None]:
def index_status(index_name):
    """
    Azure Cognitive Search index status
    Input: Azure Cognitive Search index
    Output: None (the JSON body of the index definition request is printed,
    or the HTTP status code when the body is not valid JSON)
    """
    print("Azure Cognitive Search Index:", index_name, "\n")

    headers = {"Content-Type": "application/json", "api-key": acs_key}
    params = {"api-version": "2021-04-30-Preview"}
    # Named "response" so that it neither shadows this function nor leaves
    # the error branch below referring to an undefined name.
    response = requests.get(
        acs_endpoint + "/indexes/" + index_name, headers=headers, params=params
    )

    try:
        print(json.dumps(response.json(), indent=5))
    except ValueError:
        # The body could not be decoded as JSON
        print("Request failed with status code:", response.status_code)

## 6. Creating an Azure Cognitive Search index

In [None]:
# Setting the Azure Cognitive Search client
print("Setting the Azure Cognitive Search client")

try:
    search_client = SearchIndexClient(
        endpoint=acs_endpoint,
        credential=AzureKeyCredential(acs_key),
    )
except Exception as error:
    # Report the underlying cause instead of hiding it behind a bare except
    print("Request failed. Cannot create Azure Cognitive Search client:", acs_endpoint)
    print("Error details:", error)
else:
    print("Done. Azure Cognitive Search client defined.")
    print(search_client)

### Removing any existing index

In [None]:
delete_index(index_name)

### Creating search index

In [None]:
vector_search_dim = len(openai_text_embeddings("Hello"))
print("Vector embeddings size =", vector_search_dim)

In [None]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=acs_endpoint, credential=AzureKeyCredential(acs_key)
)

# Schema of the index: one key field, the text fields and the two vector
# fields holding the embeddings of the description and of the title.
fields = [
    # Index key
    SimpleField(
        name="imdb_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    # Searchable text fields
    SearchableField(name="title", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="cast", type=SearchFieldDataType.String, filterable=True),
    # The director is text like the other searchable fields, so it is
    # declared as a String (it was declared as Single, a numeric type).
    SearchableField(name="director", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="description", type=SearchFieldDataType.String),
    SearchableField(name="genres", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="year", type=SearchFieldDataType.String, filterable=True),
    # Vectors embeddings: both fields use the "my-vector-config" vector
    # search configuration and the dimension measured on a sample embedding
    SearchField(
        name="embed_overview",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=vector_search_dim,
        vector_search_configuration="my-vector-config",
    ),
    SearchField(
        name="embed_title",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=vector_search_dim,
        vector_search_configuration="my-vector-config",
    ),
]


# Semantic configuration: fields used as title, keywords and content
prioritized_fields = PrioritizedFields(
    title_field=SemanticField(field_name="title"),
    prioritized_keywords_fields=[SemanticField(field_name="genres")],
    prioritized_content_fields=[SemanticField(field_name="description")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)

# Semantic settings built from the configuration above
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Vector search configuration
# HNSW is a graph-based Approximate Nearest Neighbors (ANN)
# algorithm optimized for high-recall, low-latency applications
hnsw_configuration = HnswVectorSearchAlgorithmConfiguration(
    name="my-vector-config",
    kind="hnsw",
    parameters=dict(
        m=4,
        efConstruction=400,
        efSearch=500,
        metric="cosine",  # Cosine similarity metric
    ),
)
vector_search = VectorSearch(algorithm_configurations=[hnsw_configuration])

# Create the search index with the fields, the vector search configuration
# and the semantic settings
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)

try:
    result = index_client.create_or_update_index(index)
except Exception as error:
    # "result" is not assigned when the call fails, so the message uses the
    # requested index name and shows the underlying error.
    print(f"Error. The {index_name} Azure Cognitive Search index cannot be created.")
    print("Error details:", error)
else:
    print(f"Done. The {result.name} Azure Cognitive Search index has been created!")

## 7. Uploading the documents into the index

In [None]:
print("Number of documents to load =", len(documents))

In [None]:
def upload_documents(docs):
    """
    Uploading documents into the Azure Cognitive Search index
    Inputs: documents
    Outputs: the per-document indexing results returned by the service
    (documents that were not indexed are reported on the standard output)
    """
    search_client = SearchClient(
        endpoint=acs_endpoint,
        index_name=index_name,
        credential=AzureKeyCredential(acs_key),
    )
    results = search_client.upload_documents(docs)

    # A successful request can still contain documents that were rejected:
    # report them instead of silently discarding the results.
    failed = [item for item in results if not item.succeeded]
    for item in failed:
        print("Failed to index document:", item.key, "|", item.error_message)

    return results

In [None]:
def chunk_list(input_list, chunk_size):
    """
    Split a list into consecutive slices of at most chunk_size items
    Inputs: documents (list), chunk size
    Outputs: list of chunks (the last one may be shorter)
    """
    chunks = []
    for offset in range(0, len(input_list), chunk_size):
        chunks.append(input_list[offset : offset + chunk_size])

    return chunks

In [None]:
start = time.time()

chunk_size = 500  # We will load documents chunk by chunk
chunks = chunk_list(documents, chunk_size)
idx = 0

print("Loading the documents into the Azure Cognitive Search index...")
print("Total number of documents to load =", len(documents))
print()

# Counts the documents actually sent so far. It is updated after each upload
# with the real size of the chunk, so the reported figures never exceed the
# total, even when the last chunk is smaller than chunk_size.
loaded_docs = 0

for idx, chunk in enumerate(chunks, start=1):
    upload_documents(chunk)
    loaded_docs += len(chunk)
    pct_done = round(loaded_docs / len(documents) * 100)

    print(
        f"Processing chunk {idx:03}",
        f"| Number of loaded documents = {loaded_docs:06}",
        "of",
        len(documents),
        "| Done:",
        pct_done,
        "%",
    )

elapsed = time.time() - start
print("\nDone")

# HH:MM:SS followed by the microseconds. The fraction is computed
# numerically because slicing str(elapsed % 1) breaks when the value is
# rendered in scientific notation.
microseconds = int((elapsed % 1) * 1_000_000)
print(
    "Elapsed time: "
    + time.strftime("%H:%M:%S", time.gmtime(elapsed))
    + f".{microseconds:06}"
)

In [None]:
# Summary of the loading step: total time, time per document and throughput
nb_documents = len(documents)
seconds_per_document = round(elapsed / nb_documents, 5)
documents_per_second = int(nb_documents / elapsed)

print(f"Elapsed time to process {nb_documents} documents = {round(elapsed)} seconds")
print(f"Time per processed document in second = {seconds_per_document}")
print(f"Number of processed documents per second = {documents_per_second}")

## 8. Azure Cognitive Search index information

In [None]:
index_name

In [None]:
index_status(index_name)

In [None]:
document_count, storage_size = index_stats(index_name)

In [None]:
# The storage size is divided by 1024 * 1024 to be displayed in MB
storage_size_mb = round(storage_size / 1024**2, 2)

print("Number of documents in the index =", f"{document_count:,}")
print("Size of the index =", storage_size_mb, "MB")

Note: The index statistics are not updated immediately. If the numbers look outdated, wait a little and run the cells above again.

> Go to the next notebook