In [None]:
# Env imports.
from os import getenv
from dotenv import load_dotenv

In [None]:
# Retrieve the API keys from environment variables.
# load_dotenv() is called first so values from a local .env file (if any) are
# visible to getenv().
load_dotenv()
openai_api_key: str = getenv('OPENAI_API_KEY')
pinecone_api_key: str = getenv('PINECONE_API_KEY')
pinecone_index_name: str = getenv('PINECONE_INDEX_NAME')

# getenv() returns None for unset variables; fail fast here with a clear message
# instead of an opaque client error several cells later.
_required_settings = {
    'OPENAI_API_KEY': openai_api_key,
    'PINECONE_API_KEY': pinecone_api_key,
    'PINECONE_INDEX_NAME': pinecone_index_name,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        f'Missing required environment variables: {", ".join(_missing_settings)}'
    )

# Never echo secret values: cell output is stored inside the .ipynb file and is
# leaked as soon as the notebook is committed or shared. Only confirm presence.
print('openai_api_key = <set>')
print('pinecone_api_key = <set>')
print(f'pinecone_index_name = {pinecone_index_name}')

In [None]:
import json

In [None]:
# Load the existing scraped data for cse.osu.edu as a json.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (which is not UTF-8 on every system).
with open('domain_data.json', 'r', encoding='utf-8') as data_file:
    domain_data = json.load(data_file)

In [None]:
# Show the record count and a short preview only; echoing the entire scrape
# floods the output and bloats the saved notebook file.
print(f'{len(domain_data)} records loaded')
domain_data[:3]

In [None]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone

In [None]:
# Build the Pinecone-backed vector store.
# The embedding client and the Pinecone client are each configured with the
# key read from the environment earlier in the notebook.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
pc = Pinecone(api_key=pinecone_api_key)

# Handle to the index whose name was read from PINECONE_INDEX_NAME.
index = pc.Index(pinecone_index_name)

# Tie the index and the embedding model together behind one store object.
vector_store = PineconeVectorStore(
    index=index,
    embedding=embeddings,
)

In [None]:
from langchain_core.documents import Document
from hashlib import sha256

In [None]:
# Prepare documents for embedding and upload.
# Each document id is the SHA-256 hex digest of the page URL ('loc'), so the same
# URL always produces the same id.
documents = [
    Document(
        id=sha256(datapoint['loc'].encode('utf-8')).hexdigest(),
        page_content=datapoint['text_content'],
        metadata={
            # Prefer the correctly spelled key, but fall back to the misspelled
            # 'scrape_timestimp' this cell originally read, so data files using
            # either spelling load. A KeyError is still raised if both are absent.
            'scrape_timestamp': (
                datapoint['scrape_timestamp']
                if 'scrape_timestamp' in datapoint
                else datapoint['scrape_timestimp']
            ),
            'lastmod_timestamp': datapoint['lastmod'],
            'page_priority': datapoint['priority'],
            'url': datapoint['loc'],
        },
    )
    for datapoint in domain_data
]

In [None]:
# Show the document count and a short preview only; each Document carries the
# full page text, so echoing the whole list makes the output unreadable.
print(f'{len(documents)} documents prepared')
documents[:3]

In [None]:
# Embed and upload the prepared documents. add_documents returns the list of
# stored ids; capture it so the cell reports a count instead of echoing one
# hash per document as its output.
uploaded_ids = vector_store.add_documents(documents=documents)
print(f'Uploaded {len(uploaded_ids)} documents')

In [None]:
# Smoke-test the index: run one similarity query and print the top matches,
# each with its page text followed by its metadata dict.
query = 'ai research'
top_k = 5
results = vector_store.similarity_search(query, k=top_k, filter={})
for match in results:
    print(f'* {match.page_content} [{match.metadata}]\n')