In [10]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [7]:
# Connect to Pinecone with the key read from the environment (.env is loaded
# by load_dotenv() in the import cell). The client's keyword is `api_key`;
# the previous `api=` spelling did not pass the key under that name.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it is missing, so this cell can be re-run
# (e.g. Restart Kernel -> Run All) without failing on an already-existing index.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # NOTE(review): must equal the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [9]:
import json

# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes (the bare open() call left it to the garbage collector),
# and the explicit encoding avoids depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list stored under the "reviews" key as the cell output.
data['reviews']

[{'professor': 'Dr. Jane Smith',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Dr. Smith is very knowledgeable, and her lectures are engaging. However, her grading can be tough.'},
 {'professor': 'Dr. Robert Johnson',
  'subject': 'Calculus I',
  'stars': 3,
  'review': "The material is challenging, and Dr. Johnson's explanations can be a bit unclear at times."},
 {'professor': 'Dr. Emily Davis',
  'subject': 'World History',
  'stars': 5,
  'review': 'Dr. Davis is passionate about history and makes every class interesting. Highly recommended!'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Organic Chemistry',
  'stars': 2,
  'review': "Dr. Brown's lectures are difficult to follow, and the exams are extremely hard."},
 {'professor': 'Dr. Sarah Wilson',
  'subject': 'English Literature',
  'stars': 4,
  'review': 'Dr. Wilson is a great professor, but she assigns a lot of reading and essays.'},
 {'professor': 'Dr. John Lee',
  'subject': 'Computer Science 101

In [14]:
# Embed every review text and shape the results into records with the
# "values" / "id" / "metadata" keys that are upserted into Pinecone later.
client = OpenAI()
reviews = data['reviews']

processed_data = []
if reviews:  # skip the API call entirely when there is nothing to embed
    # One batched request for all review texts instead of one request per review.
    response = client.embeddings.create(
        input=[review['review'] for review in reviews],
        model="text-embedding-3-small",
    )
    # Each returned item carries the `index` of the input it belongs to;
    # sort on it so embeddings line up with `reviews` before pairing them.
    embeddings = sorted(response.data, key=lambda item: item.index)

    # NOTE(review): the professor name is used as the vector id, so two reviews
    # for the same professor would share an id — confirm names are unique.
    processed_data = [
        {
            "values": item.embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        }
        for review, item in zip(reviews, embeddings)
    ]


In [15]:
# Spot-check the first record: a dict with the "values", "id" and "metadata"
# keys assembled in the embedding cell.
processed_data[0]

{'values': [-0.009643,
  0.02278261,
  0.009567877,
  0.0029212416,
  0.037124183,
  0.01838453,
  0.018944534,
  -0.0015903779,
  0.018889898,
  -0.0084615275,
  -0.003544417,
  -0.015543532,
  -0.021512358,
  -0.0051458925,
  0.036987595,
  0.012285946,
  -0.023847984,
  -0.038517363,
  -0.017442083,
  0.028082162,
  0.03832614,
  -0.012449849,
  0.04947159,
  -0.047859874,
  -0.028000211,
  -0.039582737,
  0.0395281,
  0.021703579,
  0.035512462,
  0.021840164,
  0.06441414,
  -0.013487906,
  0.015912315,
  -0.0135288825,
  -0.034365136,
  0.050072573,
  -0.014860599,
  0.04597498,
  0.034173917,
  0.008447869,
  0.0034334406,
  0.028246066,
  -0.046521325,
  -0.022017727,
  0.03496612,
  -0.031496823,
  -0.00088098226,
  -0.033682205,
  0.018029405,
  0.028000211,
  -0.06501512,
  0.063813165,
  0.03152414,
  -0.044964243,
  -0.033982694,
  -0.013569858,
  0.008516162,
  0.056273594,
  0.012859609,
  -0.03704223,
  0.04578376,
  -0.009854709,
  -0.005733214,
  -0.031442188,
  -0.03

In [16]:
# Open a handle to the 'rag' index, then write every prepared record into the
# "ns1" namespace; the upsert response is shown as the cell output.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [17]:
# Request the index statistics from Pinecone and display them as the cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}