# Embedding-Based Retrieval with Deep Lake and OpenAI
Copyright 2024 Denis Rothman

**November 1, 2024:** OpenAI reasoning models such as `o1-preview` can be used for RAG-driven generative AI in the ecosystem of this chapter.





# 1. Installing the environment

*First, run the following cells and restart the Google Colab session if prompted. Then run the notebook again, cell by cell, to explore the code.*

In [1]:
try:
  import deeplake
except:
  !pip install deeplake==3.9.18
  import deeplake



In [2]:
!pip install openai==1.40.3



In [4]:
# For Google Colab and Activeloop while waiting for Activeloop (April 2024) pending new version
#This line writes the string "nameserver 8.8.8.8" to the file. This is specifying that the DNS server the system
#should use is at the IP address 8.8.8.8, which is one of Google's Public DNS servers.
#with open('/etc/resolv.conf', 'w') as file:
 #  file.write("nameserver 8.8.8.8")

In [3]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import os
import pickle

# Scopes for Google Drive API.
# Read-only access is sufficient: this notebook only lists files and downloads
# their content, so it should not request full read/write access to the Drive.
# NOTE: delete any cached token.pickle to re-consent with the narrower scope.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

def authenticate_google_drive():
    """Return a Google Drive v3 service object built from OAuth credentials.

    Credentials cached in ``token.pickle`` are reused while they are valid.
    Otherwise they are refreshed (when expired and a refresh token exists) or
    obtained through the installed-app flow configured by ``credentials.json``;
    the resulting credentials are then written back to ``token.pickle``.
    """
    token_path = 'token.pickle'
    creds = None

    # Reuse credentials saved by an earlier run, if any.
    # NOTE(review): unpickling executes arbitrary code if the file has been
    # tampered with; keep token.pickle in a location you trust.
    if os.path.exists(token_path):
        with open(token_path, 'rb') as cached_token:
            creds = pickle.load(cached_token)

    # Fast path: the cached credentials are still usable as they are.
    if creds and creds.valid:
        return build('drive', 'v3', credentials=creds)

    refreshable = creds and creds.expired and creds.refresh_token
    if refreshable:
        creds.refresh(Request())
    else:
        oauth_flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        creds = oauth_flow.run_local_server(port=0)

    # Persist the new or refreshed credentials for future runs.
    with open(token_path, 'wb') as cached_token:
        pickle.dump(creds, cached_token)

    return build('drive', 'v3', credentials=creds)

def list_files(service):
    """Print the name and ID of the files in one page of Drive results.

    Args:
        service: Drive service object; a page of at most 10 files is requested
            through ``service.files().list(...)``.
    """
    listing_request = service.files().list(
        pageSize=10, fields="files(id, name)")
    entries = listing_request.execute().get('files', [])

    if entries:
        print('Files:')
        for entry in entries:
            print(f"{entry['name']} ({entry['id']})")
        return

    print('No files found.')

# Authenticate once and list a page of Drive files. `drive_service` is a
# module-level name, so the download cells that follow can reuse it.
if __name__ == '__main__':
    drive_service = authenticate_google_drive()
    list_files(drive_service)




Files:
activeloop.txt (1eRoLD9eFtfsTjv_SPYYuNiMXKHe6HIDC)
activeloop.txt (19iifJ1d_Do5qVVwry0JijpGFBmI0Ox-V)
api_key.txt (1u4M-skJmfqO7Xp5QirzWQYu7WoExcuC4)
api_key.txt (1x055GOhnKBrqBNQPOn79JJ6ivEfh5FDsJqhSyfNo1tE)
openai_api_key.txt (1i15toAA1iEn7GXFD357wyvMT3vRl7kw4)
RAG_book (1JYnqwBSgAlTNJRwwuDz9bfHgupxoFMNW)
openai_api_key.txt (15eZSHywfoScnR9vJJr3zi5HKmpbfiJd0)
AdvancedCyber_RAG_v2.ipynb (1rb8k02TdNLxirA0w0naTKCl7IXVmnioG)
On Being Human - Reading Group.pdf (1BDK8x1LgH8ocn2qpEOwspQW0MlFHM6R-)
Finding_Ranking_v3.ipynb (1244BZvf_rcuKMz_-GeaKkr8B0_1_s3QX)


In [4]:

def download_file(service, file_id, output_path):
    """Download a Drive file's content and write it to ``output_path``.

    The content is fetched before the destination file is opened, so a failed
    request does not leave behind an empty file or truncate an existing one.

    Args:
        service: Drive service object exposing ``files().get_media(...)``.
        file_id: ID of the Drive file to download.
        output_path: Local path the downloaded bytes are written to.

    Raises:
        Whatever ``execute()`` or the file write raises; nothing is caught here.
    """
    content = service.files().get_media(fileId=file_id).execute()
    with open(output_path, 'wb') as out_file:
        out_file.write(content)
    print(f"File downloaded to {output_path}")


# Drive file ID of api_key.txt, the file that holds the OpenAI API key.
file_id = "1u4M-skJmfqO7Xp5QirzWQYu7WoExcuC4"
download_file(drive_service, file_id, 'api_key.txt')

# Read the API key from the file
with open('api_key.txt', 'r') as file:
    api_key = file.read().strip()
    OPENAI_API_KEY = api_key

# Never print the key itself: cell output is saved inside the notebook, so the
# secret would leak to anyone the notebook is shared with.
print("OpenAI API key loaded.")




File downloaded to api_key.txt
API Key: [REDACTED: the key printed here was exposed in saved output and must be revoked]


In [5]:
# Drive file ID of activeloop.txt, the file that holds the Activeloop token.
file_id = "1eRoLD9eFtfsTjv_SPYYuNiMXKHe6HIDC" 
download_file(drive_service, file_id, 'activeloop.txt')

# Read the Activeloop token from the downloaded file
with open('activeloop.txt', 'r') as file:
    activeloop_api_key = file.read().strip()


# Expose the token through the ACTIVELOOP_TOKEN environment variable
# (presumably where Deep Lake looks for it; confirm against the Deep Lake docs).
os.environ['ACTIVELOOP_TOKEN'] =activeloop_api_key


File downloaded to activeloop.txt


In [6]:
!pip install sentence-transformers==3.0.1

Collecting sentence-transformers==3.0.1
  Using cached sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers==3.0.1)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers==3.0.1)
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.15.1 (from sentence-transformers==3.0.1)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub>=0.15.1->sentence-transformers==3.0.1)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.15.1->sentence-transformers==3.0.1)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting pyyaml>=5.1 (from huggingface-hub>=0.15.1->sentence-transformers==3.0.1)
  Downloading PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_

# Retrieval Augmented Generation

### Initiating the query process

**Replace `hub://stoneygalatia/text_embedding` in the cell below with your own organization and dataset name.**

In [7]:
# Deep Lake path of the vector store, in the form hub://<organization>/<dataset>.
vector_store_path = "hub://stoneygalatia/text_embedding" 

In [8]:
from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore
import deeplake.util
# Load the existing dataset stored at vector_store_path.
ds = deeplake.load(vector_store_path)

\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/stoneygalatia/text_embedding



|

hub://stoneygalatia/text_embedding loaded successfully.



 

In [9]:
# Open the same path as a VectorStore; this is the object search_query() searches.
vector_store = VectorStore(path=vector_store_path)

Deep Lake Dataset in hub://stoneygalatia/text_embedding already exists, loading from the storage


## Input and Query Retrieval

## Input

### Retrieval query

In [10]:
def embedding_function(texts, model="text-embedding-3-small"):
    """Embed one or more texts with the OpenAI embeddings endpoint.

    Args:
        texts: A single string or a list of strings. A single string is
            treated as a one-element list.
        model: Name of the embedding model to request.

    Returns:
        A list with one embedding per input text, in the order returned by
        the API response.

    Relies on the module-level ``openai`` name being imported before the
    first call.
    """
    batch = [texts] if isinstance(texts, str) else texts
    # Newlines are flattened to spaces before the texts are sent.
    flattened = [entry.replace("\n", " ") for entry in batch]
    api_response = openai.embeddings.create(input=flattened, model=model)
    return [item.embedding for item in api_response.data]

In [12]:
def get_user_prompt():
    """Prompt on the console for a search query and return what was typed."""
    typed_query = input("Enter your search query: ")
    return typed_query

def search_query(prompt):
    """Search the module-level ``vector_store`` for entries matching ``prompt``.

    The prompt is embedded with ``embedding_function``; both names must be
    defined before this function is called.

    Returns:
        Whatever ``vector_store.search`` returns for the query.
    """
    return vector_store.search(
        embedding_data=prompt,
        embedding_function=embedding_function,
    )

# OpenAI setup: embedding_function() goes through the module-level `openai`
# API, so the key is set on the module as well as on an explicit client.
import openai
import os
from openai import OpenAI
openai.api_key = api_key
client = OpenAI(api_key=api_key)

# The query to run. For interactive use, enable the commented line instead:
#user_prompt = get_user_prompt()
user_prompt="Tell me about space exploration on the Moon and Mars."

# Run the search and show the raw result dictionary
search_results = search_query(user_prompt)
print(search_results)

{'id': ['9c2009e6-c7bd-11ef-9ec1-00155dbe8695', '9c202070-c7bd-11ef-9ec1-00155dbe8695', '9c201094-c7bd-11ef-9ec1-00155dbe8695', '9c2019f4-c7bd-11ef-9ec1-00155dbe8695'], 'metadata': [{'source': 'llm.txt'}, {'source': 'llm.txt'}, {'source': 'llm.txt'}, {'source': 'llm.txt'}], 'text': ['Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars \'s surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok

In [13]:
# Echo the query that was used for the vector store search.
print(user_prompt)

Tell me about space exploration on the Moon and Mars.


In [14]:
# Function to wrap text to a specified width
def wrap_text(text, width=80):
    lines = []
    while len(text) > width:
        split_index = text.rfind(' ', 0, width)
        if split_index == -1:
            split_index = width
        lines.append(text[:split_index])
        text = text[split_index:].strip()
    lines.append(text)
    return '\n'.join(lines)

In [15]:
import textwrap

# Assuming the search results are ordered with the top result first
# NOTE(review): the [0] lookups presumably fail with IndexError when the search
# returns no hits; confirm whether that can happen for this store.
top_score = search_results['score'][0]
top_text = search_results['text'][0].strip()
top_metadata = search_results['metadata'][0]['source']

# Print the top search result; the text is wrapped with wrap_text() for readability
print("Top Search Result:")
print(f"Score: {top_score}")
print(f"Source: {top_metadata}")
print("Text:")
print(wrap_text(top_text))

Top Search Result:
Score: 0.6017717719078064
Source: llm.txt
Text:
Exploration of space, planets, and moons "Space Exploration" redirects here.
For the company, see SpaceX . For broader coverage of this topic, see
Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11
mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on
Spaceflight History History of spaceflight Space Race Timeline of spaceflight
Space probes Lunar missions Mars missions Applications Communications Earth
observation Exploration Espionage Military Navigation Settlement Telescopes
Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft
Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space
stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and
reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight
types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of
space organizatio

## Augmented Input

In [16]:
# Augment the query: the user prompt followed by the top retrieved text, separated by one space.
augmented_input=user_prompt+" "+top_text

In [17]:
# Show the full augmented input that will be passed to the generation step.
print(augmented_input)

Tell me about space exploration on the Moon and Mars. Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of space organizations Space agen

# Generation and output with OpenAI reasoning models

The ecosystem of this chapter enables the pipeline to perform embedded retrieval, augment the prompt and generate an output with `o1-preview`.

In [19]:
import openai
from openai import OpenAI
import time

import openai
import os
from openai import OpenAI
# Client object used for the chat completion request in this cell.
client = OpenAI(api_key=api_key)


# Name of the model passed to the chat completions call.
gpt_model="o1-preview"
start_time = time.time()  # Start timing before the request

def call_gpt4_with_full_text(itext):
    """Ask the configured chat model to summarize or elaborate on ``itext``.

    Args:
        itext: Either a single string or an iterable of lines. A string is
            sent unchanged; an iterable of lines is joined with newlines.

    Returns:
        The content of the first completion choice, or ``str(e)`` when the
        request raises (errors are returned as text, not raised).

    Uses the module-level ``client`` and ``gpt_model``.
    """
    # A plain string must not go through '\n'.join(): iterating a str yields
    # single characters, so the prompt would hold one character per line.
    if isinstance(itext, str):
        text_input = itext
    else:
        text_input = '\n'.join(itext)
    prompt = f"Read the following text as a space exploration expert, then summarize or elaborate on the following content with as much explanation as possibl and different sections:\n{text_input}"

    try:
        response = client.chat.completions.create(
            model=gpt_model,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort: the caller receives the error message as the response text.
        return str(e)

# Send the augmented input to the model. On failure the returned string is the
# error message, because call_gpt4_with_full_text() returns str(e) instead of raising.
gpt4_response = call_gpt4_with_full_text(augmented_input)

# start_time was taken before the function definition earlier in this cell,
# so the figure covers everything in the cell up to this point.
response_time = time.time() - start_time  # Measure response time
print(f"Response Time: {response_time:.2f} seconds")  # Print response time

print(gpt_model, "Response:", gpt4_response)

Response Time: 31.50 seconds
o1-preview Response: # Space Exploration on the Moon and Mars

Space exploration has been a pinnacle of human achievement, pushing the boundaries of our scientific understanding and technological capabilities. Among the celestial bodies that have captivated our imagination, the Moon and Mars stand out as primary targets for exploration due to their relative proximity and potential for scientific discovery. This comprehensive overview will delve into the past, present, and future of lunar and Martian exploration, the spacecraft and missions involved, the applications of space exploration, and the organizations that drive these endeavors.

---

## Introduction

The quest to explore the Moon and Mars encapsulates humanity's desire to understand our place in the cosmos. The Moon, our closest celestial neighbor, has been the subject of fascination and study for millennia. Mars, often called the "Red Planet," has intrigued scientists with its similarities and dif

### Formatted response

In [20]:
import textwrap
import re
from IPython.display import display, Markdown, HTML
import markdown

def print_formatted_response(response):
    """Display ``response`` as rendered HTML or as wrapped plain text.

    The response is treated as markdown when any of the heuristic patterns
    below matches (searched with ``re.MULTILINE``). Markdown is converted to
    HTML and displayed; anything else is wrapped to 80 columns and printed
    between separator lines.
    """
    # Heuristic markers: headers, bullet points, bold, italics, links,
    # dashes used for lists, and code blocks.
    markdown_markers = (
        r"^#+\s",
        r"^\*+",
        r"\*\*",
        r"_",
        r"\[.+\]\(.+\)",
        r"-\s",
        r"\`\`\`",
    )

    looks_like_markdown = False
    for marker in markdown_markers:
        if re.search(marker, response, re.MULTILINE):
            looks_like_markdown = True
            break

    if not looks_like_markdown:
        # No markdown detected: wrap and print as plain text.
        wrapped_text = textwrap.fill(response, width=80)
        print("Text Response:")
        print("--------------------")
        print(wrapped_text)
        print("--------------------\n")
        return

    # Markdown detected: convert to HTML and render it in the notebook.
    display(HTML(markdown.markdown(response)))

# Render the model's response (as HTML if it looks like markdown, otherwise as wrapped text).
print_formatted_response(gpt4_response)

# Evaluating the output with Cosine Similarity

with initial user prompt

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    score depends only on the vocabulary they contain.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row, second_row = tfidf_matrix[0:1], tfidf_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]

# Compare the bare user prompt with the model's response (TF-IDF cosine similarity).
similarity_score = calculate_cosine_similarity(user_prompt, gpt4_response)

print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.466


with augmented user prompt

In [22]:
# Same TF-IDF measure, now against the augmented input (prompt plus retrieved text).
# Note that this overwrites the similarity_score computed for the bare prompt.
similarity_score = calculate_cosine_similarity(augmented_input, gpt4_response)

print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.573


In [23]:
%pip install sentence-transformers

from sentence_transformers import SentenceTransformer
# Sentence-embedding model; calculate_cosine_similarity_with_embeddings() reads it as a global.
model = SentenceTransformer('all-MiniLM-L6-v2')

Note: you may need to restart the kernel to use updated packages.


  from tqdm.autonotebook import tqdm, trange


In [24]:
def calculate_cosine_similarity_with_embeddings(text1, text2):
    """Return the cosine similarity of the two texts' sentence embeddings.

    Each text is encoded with the module-level ``model``; the two vectors are
    then compared with ``cosine_similarity``.
    """
    vector_a, vector_b = model.encode(text1), model.encode(text2)
    score_matrix = cosine_similarity([vector_a], [vector_b])
    return score_matrix[0][0]


# Embedding-based similarity between the augmented input and the model's response.
similarity_score = calculate_cosine_similarity_with_embeddings(augmented_input, gpt4_response)
print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.691
