# Experiment 2 - Compare embeddings: home-grown vs. OpenAI or open source

Objective: Determine whether open-source or OpenAI embeddings can yield outcomes similar to those of high-end embeddings in RAG.

Before even getting to RAG, we first compare the embeddings using cosine similarity to see how close they get.

## Setup, imports, logging, keys


In [17]:
import os
from dotenv import load_dotenv, find_dotenv
import openai
import pandas as pd
import ipywidgets as widgets
import time
import logging
from openai.embeddings_utils import cosine_similarity

# Configure file logging for this experiment, starting from an empty log file
# on every run of this cell.
log_file_path = 'exp_2.log'

logger = logging.getLogger()

# Re-running this cell must not stack a second FileHandler on the root logger
# (every record would then be written twice), and the old handler must be
# closed before its file is deleted. Only handlers that point at this
# experiment's log file are touched.
_log_file_abspath = os.path.abspath(log_file_path)
for _stale_handler in list(logger.handlers):
    if (isinstance(_stale_handler, logging.FileHandler)
            and _stale_handler.baseFilename == _log_file_abspath):
        logger.removeHandler(_stale_handler)
        _stale_handler.close()

# delete log file if exists to start fresh
if os.path.exists(log_file_path):
    os.remove(log_file_path)

fhandler = logging.FileHandler(filename=log_file_path, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
# DEBUG so that the per-request messages emitted at debug level are recorded.
logger.setLevel(logging.DEBUG)

In [18]:
# Locate the local .env file and load its variables into the process
# environment; the return value is deliberately discarded.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

# Step 1 - Create OpenAI ada embeddings using the API

In [19]:
def get_embedding(text, model="text-embedding-ada-002", *, max_retries=4,
                  initial_wait=7, backoff_factor=3):
    """Return the embedding vector for *text*, retrying on API errors.

    Newlines in *text* are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.
        max_retries: Total number of attempts before giving up.
        initial_wait: Seconds to wait after the first failed attempt.
        backoff_factor: Multiplier applied to the wait time after each
            failed attempt (exponential backoff).

    Returns:
        The embedding taken from the first item of the API response, or
        ``None`` if every attempt failed.
    """
    text = text.replace("\n", " ")

    wait_time = initial_wait

    for attempt in range(1, max_retries + 1):
        try:
            logger.debug(f"Try to embed: {text}")
            response = openai.Embedding.create(input=[text], model=model)
            return response['data'][0]['embedding']

        except Exception as e:
            # Deliberately broad: any failure of the request is retried.
            print(f"An error occurred: {str(e)[:30]}")
            logger.warning(f"An error occurred: {str(e)[:30]}")

            # No further attempt will follow the last failure, so do not
            # announce a retry or sleep through the longest back-off.
            if attempt == max_retries:
                break

            print(f"Retrying in {wait_time} seconds...")
            logger.warning(f"Retrying in {wait_time} seconds...")

            time.sleep(wait_time)
            # Exponential backoff: multiply the wait time for each retry.
            wait_time *= backoff_factor

    print("Exceeded maximum number of retries. Aborting.")
    logger.error("Exceeded maximum number of retries. Aborting.")
    return None


In [20]:
# Load the question-to-intent mapping. Later cells read its 'question' and
# 'intent question' columns.
input_csv = 'bot_question_intent_map.csv'
df = pd.read_csv(input_csv)
len(df)  # row count; evaluated but not stored

# Empty dict; nothing in the cells shown here fills or reads it.
seen_questions = dict()

In [21]:
# Embed each original question (one get_embedding call per row) and store the
# result in a new 'question_embedding' column.
df['question_embedding'] = df['question'].apply(get_embedding)

In [22]:
# Embed each intent question (one get_embedding call per row) and store the
# result in a new 'intent_embedding' column.
df['intent_embedding'] = df['intent question'].apply(get_embedding)

# Step 2 - Calculate the similarity and persist it in a new file

In [23]:
# Calculate the cosine similarity between each question embedding and its
# intent embedding. NOTE: the column keeps the name 'embedding_distance', but
# the stored value is a similarity, not a distance.
def _pair_similarity(row):
    """Return the cosine similarity for one row, or NaN if an embedding is missing."""
    question_vec = row['question_embedding']
    intent_vec = row['intent_embedding']
    # get_embedding returns None once its retries are exhausted; record NaN for
    # that row instead of letting one failed request abort the whole column.
    if question_vec is None or intent_vec is None:
        return float('nan')
    return cosine_similarity(question_vec, intent_vec)


df['embedding_distance'] = df.apply(_pair_similarity, axis=1)

In [24]:
# Persist the frame, including the embedding and similarity columns, without
# the index. NOTE(review): list-valued cells are presumably written as their
# string form and would need parsing when read back; confirm.
output_csv = 'bot_questions_embeddigns_ada.csv'
df.to_csv(output_csv, index=False)