In [4]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [11]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [12]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. Sarah Thompson',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Thompson's lectures are engaging and informative. She's always willing to help during office hours."},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Brilliant instructor! Prof. Chen makes complex topics easy to understand and provides great real-world examples.'},
 {'professor': 'Dr. Emily Rodriguez',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Decent teacher, but assignments can be unclear at times. Lectures are interesting though.'},
 {'professor': 'Prof. James Wilson',
  'subject': 'History',
  'stars': 2,
  'review': "Dry lectures and heavy workload. Prof. Wilson seems knowledgeable but isn't great at explaining concepts."},
 {'professor': 'Dr. Lisa Patel',
  'subject': 'Chemistry',
  'stars': 5,
  'review': "Dr. Patel is passionate about chemistry and it shows. Her labs are well-organized and she's always patient with questions."}

In [13]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [14]:
processed_data[0]

{'values': [0.020015849,
  -0.00041438028,
  0.023268092,
  0.047831796,
  0.005651767,
  0.028714942,
  0.03125328,
  0.029455291,
  -0.01562664,
  0.009697242,
  -0.006801951,
  0.017609715,
  0.02055789,
  -0.03670013,
  0.003349746,
  0.03469061,
  0.01981754,
  -0.025092524,
  0.023254873,
  0.061924856,
  0.040745605,
  -0.0070134792,
  0.02609728,
  -0.033183474,
  -0.043125294,
  -0.05206236,
  0.008679263,
  0.031702776,
  -0.014000517,
  -0.005182439,
  0.06805917,
  -0.029984111,
  0.007112633,
  -0.020809079,
  -0.04069272,
  0.050105724,
  0.0027184668,
  0.022236893,
  0.026150163,
  -0.0013121354,
  0.033527207,
  0.014212046,
  -0.017622937,
  0.016882587,
  0.0071324636,
  -0.031702776,
  -0.023545723,
  -0.017794803,
  0.03389738,
  0.02187994,
  -0.029666819,
  0.021073489,
  0.013425426,
  -0.029296644,
  -0.077102,
  0.020425683,
  -0.0028721553,
  0.0053510005,
  0.015613419,
  -0.03915914,
  0.040296108,
  -0.0074167047,
  -0.001167536,
  -0.023334196,
  -0.02934

In [16]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [17]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}