In [37]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec, PineconeException

In [38]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [39]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Dr. Johnson is passionate about biology and makes complex concepts easy to understand. Her lectures are engaging and well-structured.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Prof. Chen is an excellent teacher who goes above and beyond to help students. His coding assignments are challenging but rewarding.'},
 {'professor': 'Dr. Sarah Williams',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Dr. Williams knows her subject well, but her lectures can be a bit dry. More interactive elements would greatly improve the class.'},
 {'professor': 'Prof. David Martinez',
  'subject': 'Mathematics',
  'stars': 4,
  'review': "Prof. Martinez explains complex mathematical concepts clearly. He's always willing to provide extra help during office hours."},
 {'professor': 'Dr. Lisa Thompson',
  'subject': 'English Literature',
  'stars': 5,
  'review'

In [42]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'], 
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding, 
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
            }
    })

In [44]:
processed_data[0]

{'values': [0.026227903,
  0.0044832146,
  0.040132456,
  0.047862787,
  -0.008759367,
  0.00033863328,
  0.012599433,
  0.0361167,
  -0.008985253,
  0.011677064,
  0.041111298,
  0.00077491533,
  -0.02610241,
  0.017004214,
  0.009681736,
  0.025048275,
  -0.041337185,
  -0.0152222235,
  0.04078502,
  0.040308148,
  0.038224973,
  -0.029691493,
  0.03792379,
  -0.016853625,
  -0.025876526,
  -0.053760927,
  0.013390035,
  0.015448109,
  0.030444447,
  -0.021948613,
  0.07012513,
  0.00056197046,
  0.0060173585,
  -0.012938263,
  -0.02693066,
  0.010415866,
  -0.005270679,
  -0.007259733,
  0.021095267,
  0.026830267,
  0.0092676105,
  0.005220482,
  -0.03697005,
  0.004018893,
  0.03220134,
  0.013277092,
  -0.0026306335,
  -0.02005368,
  0.041487776,
  0.051276177,
  -0.026428692,
  0.015184576,
  0.024747094,
  0.0060957912,
  -0.0345104,
  0.01574929,
  0.015962629,
  0.046105895,
  -0.027658517,
  -0.014055144,
  0.04040854,
  0.0038432037,
  -0.0038620273,
  -0.008797014,
  -0.02

In [46]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [48]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}