In [29]:
import sys
from pathlib import Path
import numpy as np
from retrieval import load_passages_from_zip, embed_passages
import os
from dotenv import load_dotenv
from openai import OpenAI
# Read variables from a local .env file into the process environment first,
# so the key lookup below can see them.
load_dotenv()

# Build the OpenAI client from the OPENAI_API_KEY environment variable
# (the lookup yields None when the variable is not set).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [30]:
# Relative path to the zipped passage-level CLAPNQ corpus:
# directory first, archive file name second.
path = (
    "../data/corpora/passage_level/"
    "clapnq.jsonl.zip"
)

In [31]:
# Load a tiny sample (limit=2) from the archive for a quick sanity check.
passages = load_passages_from_zip(path, limit=2)
print(f"Loaded {len(passages)} passages\n")

# Show each passage's id, title, and only the first 150 characters of its text.
for p in passages:
    print(f"ID: {p['id']}")
    print(f"Title: {p['title']}")
    preview = p['text'][:150]
    print(f"Text: {preview}...\n")


Loaded 2 passages

ID: 837799097_6931-7548-0-617
Title: French Revolution
Text: French Revolution
After the Thermidorian Reaction , an executive council known as the Directory assumed control of the French state in 1795 . They sus...

ID: 837799097_7549-7959-0-406
Title: French Revolution
Text: French Revolution
The modern era has unfolded in the shadow of the French Revolution . Almost all future revolutionary movements looked back to the Re...



In [32]:
# Embed the sample passages loaded above.
embedded_passages = embed_passages(passages)

# Report how many embeddings came back ...
print(f"Embedded {len(embedded_passages)} passages.")
# ... and the length (dimensionality) of the first one.
print(len(embedded_passages[0]))

Embedded 2 passages.
1536


In [33]:
# Keep handles on the first two embeddings for inspection and comparison.
vec1, vec2 = embedded_passages[0], embedded_passages[1]

# Print only the leading 10 components; the full vectors are too long to read.
print("Embedding for passage 0 (first 10 dimensions: ", vec1[:10])
print("Embedding for passage 1 (first 10 dimensions: ", vec2[:10])

Embedding for passage 0 (first 10 dimensions:  [-0.0364680290222168, -0.0012849429622292519, 0.04229118674993515, -0.003382875816896558, -0.0010735131800174713, 0.021344585344195366, -0.0027152029797434807, -0.008310563862323761, -0.02869684249162674, 0.06292358785867691]
Embedding for passage 1 (first 10 dimensions:  [0.000643163628410548, -0.019829612225294113, 0.03963711857795715, -0.006830936297774315, 0.025068873539566994, 0.0143140172585845, 0.03921709209680557, -0.00244139670394361, -0.020492808893322945, 0.06335748732089996]


In [34]:
# Cosine similarity = dot(vec1, vec2) / (||vec1|| * ||vec2||).
dot_product = np.dot(vec1, vec2)
norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
cos_sim = dot_product / norm_product
print("Cosine similarity:", cos_sim)

Cosine similarity: 0.5553952146334091


In [35]:
# Hand-written passages for sanity-checking embeddings: the first two are
# paraphrases about the French Revolution, the last two cover unrelated topics.
# Each record is a dict with the keys "id", "title", "text" (in that order).
Example_passages = [
    {"id": passage_id, "title": passage_title, "text": passage_text}
    for passage_id, passage_title, passage_text in (
        (
            "synthetic_1",
            "French Revolution Overview",
            "The French Revolution began in 1789 as widespread frustration with monarchy and "
            "economic inequality grew. Citizens demanded political reform, eventually leading "
            "to the fall of the Bourbon king and the rise of the National Assembly.",
        ),
        (
            "synthetic_2",
            "French Revolution Causes and Outcomes",
            "In 1789, popular discontent with royal authority and financial crisis sparked the "
            "French Revolution. Reformers pushed for democratic changes, ultimately overthrowing "
            "the monarchy and establishing a new political order in France.",
        ),
        (
            "synthetic_3",
            "Cloud Load Balancing",
            "Cloud load balancers distribute incoming traffic across multiple servers to ensure "
            "application availability and reliability. They help maintain efficient resource "
            "utilization and prevent service outages under high demand.",
        ),
        (
            "synthetic_4",
            "Coral Reef Ecosystems",
            "Coral reefs are diverse underwater ecosystems formed by colonies of tiny marine "
            "organisms. They support a vast range of sea life and are vital to ocean health, "
            "but climate change poses a major risk to their survival.",
        ),
    )
]

In [36]:
# Embed the four synthetic passages with the same helper used for the corpus sample.
# NOTE(review): later cells index the result by position, so this presumably
# returns one embedding per input passage in input order — confirm in retrieval.embed_passages.
Example_embedded = embed_passages(Example_passages)

In [37]:
# Give each of the first four embeddings its own name (positions 0 through 3).
Example_vec1, Example_vec2, Example_vec3, Example_vec4 = (
    Example_embedded[position] for position in range(4)
)

In [38]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Computed as ``np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))``,
    i.e. the dot product divided by the product of the two norms.

    Parameters
    ----------
    a, b : array-like
        The two vectors to compare (the notebook passes embedding vectors).

    Returns
    -------
    The quotient described above. No special handling is applied for
    zero-norm inputs; the result is whatever NumPy's division produces.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# Pair 1: Example_vec1 vs Example_vec2 — presumably the two French Revolution
# paraphrases, assuming embed_passages preserves input order.
cos_sim1 = cosine_similarity(Example_vec1, Example_vec2)
# Pair 2: Example_vec3 vs Example_vec4 — presumably the load-balancing and
# coral-reef passages, under the same ordering assumption.
cos_sim2 = cosine_similarity(Example_vec3, Example_vec4)

# Both lines share the same label; the first is pair 1, the second is pair 2.
print("Cosine similarity:", cos_sim1)
print("Cosine similarity:", cos_sim2)


Cosine similarity: 0.784966798948737
Cosine similarity: 0.1523986586338686
