In [8]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
import pinecone
from pinecone import Pinecone, ServerlessSpec

In [10]:
# Pinecone client; the API key is read from the environment (populated by load_dotenv above).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so this cell can be
# re-run (Restart & Run All) without failing on an already-existing index.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the embedding vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [16]:
import json

# Read the reviews file inside a context manager so the handle is closed
# deterministically instead of being left to the garbage collector.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Show the list of review records as the cell output.
data['reviews']

[{'professor': 'Dr. Emily Thompson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Engaging lectures and challenging assignments. Dr. Thompson makes complex concepts easier to understand.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Brilliant instructor! Prof. Chen's passion for coding is contagious. Highly recommend his classes."},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Knowledgeable but sometimes hard to follow. Office hours were helpful.'},
 {'professor': 'Prof. David Williams',
  'subject': 'Literature',
  'stars': 4,
  'review': 'Insightful analysis of classic works. Prof. Williams encourages critical thinking.'},
 {'professor': 'Dr. Rachel Lee',
  'subject': 'Chemistry',
  'stars': 5,
  'review': "Outstanding teacher! Dr. Lee's lab experiments are both educational and fun."},
 {'professor': 'Prof. Robert Taylor',
  'subject': 'History',
  'stars': 2,
  'review': 'Dry lectur

In [22]:
# Build one upsert-ready record (id, embedding values, metadata) per review.
processed_data = []
client = OpenAI()

reviews = data['reviews']
# Embed several reviews per request instead of one network round-trip per review.
# The chunk size keeps each request small; tune if reviews become long.
batch_size = 100

for start in range(0, len(reviews), batch_size):
    batch = reviews[start:start + batch_size]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model="text-embedding-3-small",
    )
    # Every returned item carries the position of its input text; sort on it so
    # each embedding is paired with the review it was computed from.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        embedding = item.embedding
        processed_data.append({
            "values": embedding,
            # NOTE(review): the professor name is the vector id, so two reviews
            # for the same professor would share an id — confirm names are unique.
            "id": review["professor"],
            "metadata": {
                "review": review['review'],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })

In [23]:
# Inspect the first prepared record (id, embedding values, metadata) as a sanity check.
# NOTE(review): this prints the entire embedding vector; consider slicing "values" for a shorter preview.
processed_data[0]

{'values': [-0.004566982,
  -0.007491067,
  -0.0163884,
  0.016861662,
  0.03650881,
  0.010702491,
  0.026610866,
  0.012561736,
  0.017564794,
  0.021188632,
  0.049408592,
  0.057278268,
  -0.026259301,
  -0.034264196,
  0.025326297,
  0.02147259,
  0.0012997813,
  -0.022716593,
  0.044135097,
  0.05016581,
  0.03818551,
  -0.014819873,
  0.026516214,
  0.0024203989,
  -0.05684557,
  -0.07491067,
  0.011182514,
  0.019092755,
  -0.02177007,
  -0.0054188543,
  0.06101028,
  -0.024920644,
  0.00516194,
  -0.02928818,
  -0.008937898,
  0.04134961,
  -0.022243332,
  0.03845595,
  0.0114664715,
  0.029423397,
  0.049679026,
  0.02239207,
  -0.050355114,
  0.00065580645,
  0.0008582106,
  -0.0067879343,
  -0.02473134,
  -0.04045717,
  0.040484216,
  0.03285793,
  -0.022067549,
  0.03453463,
  0.040862825,
  -0.011987059,
  -0.086377144,
  0.004546699,
  -0.004005828,
  0.066310816,
  -0.04751554,
  -0.012122277,
  0.012014103,
  -0.015225527,
  0.00440134,
  0.02815235,
  -0.047434412,
  

In [24]:
# Get a handle to the "rag" index.
index = pc.Index('rag')

# Write every prepared record into namespace "ns1"; the call's response is
# the cell output.
index.upsert(vectors=processed_data, namespace='ns1')

{'upserted_count': 20}

In [25]:
# Display the index statistics (dimension, per-namespace vector counts) to check the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}