In [14]:
from dotenv import load_dotenv
load_dotenv()
import os
import openai as OPENAI
from pinecone import Pinecone, ServerlessSpec

In [15]:
# Initialize the Pinecone client; the API key is read from the environment
# (load_dotenv() in the imports cell loads it from a .env file when present).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet. Calling
# create_index for a name that is already taken raises an error, which would
# make this cell fail on every re-run (e.g. Restart Kernel -> Run All).
# The dimension must equal the length of the embedding vectors upserted later.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [16]:
import json

# Load the review records. The context manager closes the file handle as soon
# as parsing finishes (the bare open() call left it to the garbage collector),
# and the explicit encoding keeps decoding independent of the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review dicts stored under the "reviews" key.
data['reviews']

[{'professor': 'Jingyu Liu',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Dr. Liu's ability to simplify complex material for student is outstanding, making her classes more accessible and enjoyable."},
 {'professor': 'Abdullah Bal',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Highly regarded by students for his ability to make complex topics, like digital image processing, easy to understand and engaging. Students consistently describe his classes as well-organized, with effective use of lectures, reviews, and summaries. He is praised for being approachable, providing clear grading criteria, and offering helpful feedback. While his courses require attendance, those who participate actively tend to find his classes enjoyable and relatively easy to succeed in.'},
 {'professor': 'Pavel Skums',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Well-regarded for his engaging teaching style, especially in challenging subjects like algorithms. Students

In [17]:
# Build one vector record per review: the review text is embedded with
# OpenAI, the professor's name is used as the record id, and the remaining
# review fields are carried along as metadata.
processed_data = []

for review in data["reviews"]:
    review_text = review["review"]

    # One embeddings request per review; the single input yields one vector.
    embedding = (
        OPENAI.embeddings
        .create(input=review_text, model="text-embedding-3-small")
        .data[0]
        .embedding
    )

    metadata = {
        "review": review_text,
        "subject": review["subject"],
        "stars": review["stars"],
    }
    record = {"values": embedding, "id": review["professor"], "metadata": metadata}
    processed_data.append(record)

In [18]:
# Spot-check the first record built above; note that this displays the
# record's entire embedding vector under "values".
processed_data[0]

{'values': [-0.009408313,
  -0.044443846,
  -0.05305804,
  0.026986647,
  0.050608378,
  -0.03518359,
  -0.022423819,
  0.05841499,
  0.0074633895,
  -0.0248331,
  -0.0074701197,
  0.011878163,
  -0.023540972,
  0.009906321,
  0.010370679,
  0.01983956,
  -0.047324218,
  -0.03550662,
  0.032518573,
  0.009966889,
  0.013311618,
  -0.031226445,
  0.07278992,
  -0.023944762,
  -0.02484656,
  -0.07015183,
  0.04390546,
  -0.016313126,
  -0.00952272,
  -0.0065245773,
  0.033083882,
  -0.023837084,
  0.0037956282,
  -0.01827824,
  -0.024806181,
  0.054807797,
  -0.034510605,
  0.012860719,
  0.018237859,
  -0.024631204,
  0.015667062,
  0.04377086,
  -0.05138904,
  0.0033026678,
  0.034456767,
  -0.0055251964,
  -0.026569398,
  0.0182244,
  0.009677506,
  0.045789815,
  -0.07171315,
  0.02196619,
  0.027538495,
  -0.04508991,
  -0.040675137,
  0.008943954,
  0.020499084,
  0.06606008,
  0.00022145371,
  -0.061914504,
  0.039786797,
  0.0040816464,
  -0.020027997,
  -0.00085553055,
  -0.0671

In [19]:
# Get a handle to the "rag" index and write every prepared record into the
# "ns1" namespace in a single upsert request; the response is the cell output.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 32}

In [20]:
# Display the index statistics after the upsert.
# NOTE(review): the output recorded below reports total_vector_count 0 and no
# namespaces, even though the preceding upsert returned upserted_count 32 --
# presumably the stats lag behind recent writes; re-run this cell after a
# short wait and confirm the count before relying on it.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}