In [1]:
# Load the environment variables
from dotenv import load_dotenv # type: ignore
load_dotenv()
import os
from openai import OpenAI # type: ignore
# Pinecone for scalable data storage and efficient retrieval
from pinecone import Pinecone, ServerlessSpec # type: ignore

  from tqdm.autonotebook import tqdm


In [None]:
# Pinecone client, authenticated with the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet: calling create_index
# with a name that is already taken raises, which would break re-running this
# cell (or Restart & Run All) after the first successful run.
# The dimension must equal the output size of the embedding model used for the vectors.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
# Read the reviews file and parse it into `data`.
import json

# The context manager closes the file handle as soon as parsing finishes
# (json.load(open(...)) leaves it open until garbage collection).
# JSON text is UTF-8 by specification, so the encoding is set explicitly
# instead of relying on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Show the parsed list stored under the "reviews" key as the cell output.
data['reviews']

In [None]:
# Build one vector record per review: the embedding of the review text plus
# the metadata that is stored alongside it.
processed_data = []
client = OpenAI()  # presumably reads OPENAI_API_KEY from the environment -- confirm

# Embeddings capture the semantic meaning of a text in numerical form, so that
# semantically related texts end up with nearby vectors.
for review in data["reviews"]:
    response = client.embeddings.create(
        input=review["review"],
        model="text-embedding-3-small",
    )

    # A single input string is sent, so the embedding is read from the first result entry.
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        # NOTE(review): the professor name doubles as the vector id; if two
        # reviews share a professor they would share an id -- confirm that
        # names are unique in reviews.json.
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            # Key fixed from the typo "starts" so it matches the source field "stars".
            "stars": review["stars"],
        },
    })


In [None]:
# Preview the first record. The embedding is cut to its first five components
# so the cell output stays readable instead of listing the whole vector.
{**processed_data[0], "values": processed_data[0]["values"][:5]}

In [None]:
# Get a handle to the "rag" index (the index plays the role of a collection).
index = pc.Index("rag")

# Store every prepared record in namespace "ns1" (the namespace plays the role
# of a document); the upsert response is the cell output.
index.upsert(namespace="ns1", vectors=processed_data)

In [None]:
# Display the statistics Pinecone reports for the index as the cell output.
# NOTE(review): presumably used to check that the upserted vectors are present -- confirm.
index.describe_index_stats()