In [11]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI 
from pinecone import Pinecone, ServerlessSpec

In [12]:
# Connect to Pinecone using the API key taken from the PINECONE_API_KEY
# environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Only create the "rag" index when it does not exist yet. Calling
# create_index for an existing name fails, so the guard keeps this cell
# safe to re-run (e.g. Restart Kernel -> Run All).
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # vectors upserted into this index must have this length
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [13]:
import json

# Load the review dataset from disk.
# - The context manager closes the file handle as soon as parsing is done.
# - The explicit encoding keeps non-ASCII characters in the comments
#   (such as curly apostrophes) from depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records.
data['reviews']

[{'professor': 'Dr. Emily Carter',
  'course': 'Computer Science',
  'rating': 4.5,
  'comment': 'Dr. Carter is an excellent professor. Her lectures are clear, and she’s always willing to help students during office hours.'},
 {'professor': 'Dr. John Anderson',
  'course': 'English Literature',
  'rating': 3.8,
  'comment': "Dr. Anderson's classes are interesting, but the grading is tough. Be prepared to work hard."},
 {'professor': 'Dr. Sophia Martinez',
  'course': 'Mathematics',
  'rating': 4.9,
  'comment': 'One of the best math professors I’ve had. She makes complex topics easy to understand.'},
 {'professor': 'Dr. Michael Nguyen',
  'course': 'Physics',
  'rating': 4.2,
  'comment': 'Dr. Nguyen is very knowledgeable, but his lectures can be a bit fast-paced.'},
 {'professor': 'Dr. Linda Zhang',
  'course': 'Biology',
  'rating': 4.7,
  'comment': 'Dr. Zhang is passionate about biology and makes the subject interesting. Her labs are well-organized.'},
 {'professor': 'Dr. Robert Ki

In [14]:
# Embed every review comment and shape the results into records that carry
# "values", "id" and "metadata" keys, ready to be upserted into the index.
EMBEDDING_MODEL = "text-embedding-3-small"
# Number of comments sent per embeddings request. Batching avoids one network
# round-trip per review while keeping each request to a bounded size.
EMBEDDING_BATCH_SIZE = 100

processed_data = []
client = OpenAI()  # no key passed here; credentials are expected to come from the environment

reviews = data['reviews']
for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start:start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['comment'] for review in batch],
        model=EMBEDDING_MODEL,
    )

    # Each returned item reports the position of its input within the request;
    # order by it so every vector is paired with the review it was built from.
    embeddings = sorted(response.data, key=lambda item: item.index)
    assert len(embeddings) == len(batch), "embedding count does not match review count"

    for review, item in zip(batch, embeddings):
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): the professor name doubles as the vector id, so two
            # reviews of the same professor would share an id - confirm intended.
            "id": review["professor"],
            "metadata": {
                "review": review["comment"],
                "course": review["course"],
                "rating": review["rating"],
            },
        })

In [15]:
# Inspect the first prepared record (its "values", "id" and "metadata")
# as a sanity check before uploading.
first_record = processed_data[0]
first_record

{'values': [-0.01879041,
  -0.005179855,
  -0.010266745,
  0.054848943,
  0.019301714,
  0.019754916,
  0.0064203474,
  0.03425735,
  -0.02221847,
  -0.014839428,
  0.01727974,
  0.00024403131,
  0.021219103,
  0.004642405,
  0.013665753,
  0.029306998,
  0.012062119,
  0.0055778585,
  0.01919713,
  0.022904081,
  0.011882001,
  -0.01315445,
  0.04294951,
  -0.0015353636,
  -0.040183824,
  -0.001981302,
  0.050386656,
  0.04594761,
  -0.007472006,
  0.007721848,
  0.086828664,
  -0.015850415,
  0.0034222486,
  -0.01724488,
  -0.055127837,
  0.052571315,
  0.022753015,
  -0.0021599676,
  0.020359183,
  -0.0145489145,
  -0.004299599,
  0.062936835,
  0.0015774881,
  0.0032101737,
  0.025611667,
  -0.01750053,
  -0.04076485,
  0.009447498,
  0.03432707,
  0.016338477,
  -0.064191855,
  -0.01574583,
  0.04039299,
  -0.002353159,
  -0.04039299,
  0.04959646,
  0.016245512,
  -0.002088792,
  0.04229876,
  -0.042600896,
  0.043228403,
  0.017581875,
  -0.013456584,
  -0.045343343,
  -0.013317

In [16]:
# Open a handle to the "rag" index and write every prepared record into the
# "ns1" namespace; the cell displays the response returned by the upsert call.
index = pc.Index('rag')
upsert_response = index.upsert(vectors=processed_data, namespace="ns1")
upsert_response

{'upserted_count': 20}

In [17]:
# Fetch and display the index statistics to confirm the upload landed
# (dimension, per-namespace vector counts, total vector count).
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}