In [5]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [3]:
# Pinecone client; the API key is read from the environment (populated by load_dotenv above).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Only create the index when it does not exist yet, so this cell can be re-run
# (e.g. Restart Kernel -> Run All) without failing on an already-existing index.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must match the length of the embedding vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
import json

# Load the review records from disk. The context manager guarantees the file
# handle is closed, and the explicit encoding avoids depending on the platform
# default when the reviews contain non-ASCII characters.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data['reviews']

[{'professor': 'Dr. Alice Johnson',
  'subject': 'Introduction to Psychology',
  'stars': 5,
  'review': 'Dr. Johnson is an amazing professor! Her lectures are engaging and she really cares about her students.'},
 {'professor': 'Dr. Mark Thompson',
  'subject': 'Calculus I',
  'stars': 3,
  'review': "Dr. Thompson's lectures are clear, but his exams are extremely difficult. Be prepared to study hard."},
 {'professor': 'Dr. Emily Clark',
  'subject': 'Modern Literature',
  'stars': 4,
  'review': 'Dr. Clark has a deep knowledge of the subject, but sometimes her lectures can be a bit dry.'},
 {'professor': 'Dr. Robert Lee',
  'subject': 'Physics II',
  'stars': 2,
  'review': "Dr. Lee knows his stuff, but he isn't very good at explaining concepts. Office hours are a must."},
 {'professor': 'Dr. Maria Garcia',
  'subject': 'Organic Chemistry',
  'stars': 1,
  'review': "Dr. Garcia's course is very challenging, and she doesn't offer much help outside of class."},
 {'professor': 'Dr. John M

In [6]:
# Build one record per review: embedding values, an id, and the review metadata.
processed_data = []
client = OpenAI()

# Maximum number of reviews sent per embeddings request. The endpoint accepts a
# list of inputs, so batching avoids one network round-trip per review.
EMBED_BATCH_SIZE = 100

reviews = data['reviews']
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input; sort on it so the
    # embeddings line up with `batch` regardless of response ordering.
    ordered_items = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(batch, ordered_items):
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): the professor name doubles as the record id, so two
            # reviews for the same professor would share an id - confirm that
            # this is intended for the source data.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })

In [7]:
# Sanity check: display the first prepared record (embedding values, id, metadata).
processed_data[0]

{'values': [0.02203972265124321,
  -0.010445554740726948,
  0.01776369847357273,
  0.05536087229847908,
  0.008495187386870384,
  -0.014795499853789806,
  0.04114536568522453,
  0.04826449230313301,
  -0.010189675725996494,
  0.005953454412519932,
  0.029454531148076057,
  -0.009911051951348782,
  -0.04412493482232094,
  0.047855082899332047,
  -0.0013113805325701833,
  0.02102757804095745,
  -0.011429267935454845,
  -0.01667194627225399,
  0.03111490234732628,
  0.04046301916241646,
  0.02738475427031517,
  -0.017149588093161583,
  0.018537022173404694,
  -0.025496933609247208,
  -0.030955689027905464,
  -0.045944519340991974,
  0.017172332853078842,
  0.010582023300230503,
  0.041372813284397125,
  0.0010633199708536267,
  0.08033467829227448,
  0.008961455896496773,
  -0.02938629686832428,
  -0.03302546590566635,
  -0.023040493950247765,
  0.015022948384284973,
  0.0018608099780976772,
  0.0217667855322361,
  0.004523374605923891,
  0.008677145466208458,
  -0.0024180577602237463,
  

In [8]:
# Handle to the 'rag' index through the Pinecone client created earlier.
index = pc.Index('rag')

# Send every prepared record to the "ns1" namespace in a single upsert call;
# the call's response is displayed as the cell output.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [9]:
# Display the index statistics (dimension, per-namespace vector counts) to verify the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}