In [None]:
# Env imports.
from os import getenv
from dotenv import load_dotenv

# Retrieve the API keys from environment variables
load_dotenv()
openai_api_key: str = getenv('OPENAI_API_KEY')
pinecone_api_key: str = getenv('PINECONE_API_KEY')
pinecone_index_name: str = getenv('PINECONE_INDEX_NAME')

# getenv returns None for an unset variable; fail here with a clear message
# instead of letting a None key surface later as an opaque client error.
if not all((openai_api_key, pinecone_api_key, pinecone_index_name)):
    raise EnvironmentError(
        'Missing one or more required environment variables: '
        'OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_INDEX_NAME '
        '(check your .env file).'
    )

In [None]:
import json

# Load the existing scraped data for cse.osu.edu as a json.
# Decode explicitly as UTF-8 (the standard JSON encoding) so non-ASCII page
# text does not depend on the platform's default locale encoding.
with open('cse_osu_edu_domain_data.json', 'r', encoding='utf-8') as f:
    domain_data = json.load(f)

print(len(domain_data))

In [None]:
from pprint import pprint

# Inspect the first scraped datapoint to see which fields it carries.
pprint(domain_data[0])

In [None]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone

# Init the Pinecone vectorstore object.
# Embedding client handed to the vector store; authenticated with the OpenAI
# key read from the environment.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
# Pinecone client and a handle to the index named by PINECONE_INDEX_NAME.
# NOTE(review): presumably the index must already exist — confirm.
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(pinecone_index_name)

# LangChain wrapper tying the Pinecone index to the embedding client.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitter built through the tiktoken-based constructor, configured
# for the "gpt-4" model name with chunk_size=300 and chunk_overlap=50.
# NOTE(review): presumably both sizes are counted in tokens rather than
# characters — confirm against the installed langchain-text-splitters version.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4",
    chunk_size=300,
    chunk_overlap=50
)

In [None]:
# test_texts = text_splitter.split_text(domain_data[0]['text_content'])

In [None]:
from typing import List, Dict
from langchain_core.documents import Document
from uuid import uuid4
from datetime import datetime

class EmptySplitTextException(Exception):
    '''Raised when the text splitter produces no chunks for a datapoint.'''

def datapoint_to_documents(text_splitter: RecursiveCharacterTextSplitter, datapoint: Dict[str, str]) -> List[Document]:
    '''Split one scraped datapoint into chunk-level Documents.

    Args:
        text_splitter: Splitter used to chunk the datapoint's text.
        datapoint: Scraped page record; must provide the keys 'text_content',
            'src', 'url', 'lastmod_timestamp' and 'scrape_timestamp'.

    Returns:
        One Document per chunk. Every chunk carries the same page-level
        metadata plus a 'type' of 'split' (page produced several chunks) or
        'whole' (page fit in a single chunk), and a fresh random UUID string
        as its id.

    Raises:
        EmptySplitTextException: If the splitter returns no chunks.
        KeyError: If a required key is missing from the datapoint.
    '''
    texts: List[str] = text_splitter.split_text(datapoint['text_content'])
    if not texts:
        # Name the offending page so a failed ingestion run is debuggable.
        raise EmptySplitTextException(
            f"No text returned by the text splitter for url={datapoint.get('url')!r}"
        )

    lastmod_timestamp: str = datapoint['lastmod_timestamp']
    # A trailing 'Z' (UTC designator) is stripped because
    # datetime.fromisoformat rejects it before Python 3.11.
    lastmod_year: int = datetime.fromisoformat(lastmod_timestamp.rstrip('Z')).year

    # Page-level metadata is identical for every chunk, so build it once.
    page_metadata = {
        'src': datapoint['src'],
        'url': datapoint['url'],
        'type': 'split' if len(texts) > 1 else 'whole',
        'lastmod_year': lastmod_year,
        'lastmod_timestamp': lastmod_timestamp,
        'scrape_timestamp': datapoint['scrape_timestamp'],
    }

    return [
        Document(
            # Document ids are strings; a raw UUID object is not a valid id.
            id=str(uuid4()),
            page_content=text,
            # Copy so that mutating one Document's metadata cannot leak into
            # its sibling chunks.
            metadata=dict(page_metadata),
        )
        for text in texts
    ]

def domain_data_to_documents(text_splitter: RecursiveCharacterTextSplitter, domain_data: List[Dict[str, str]]) -> List[Document]:
    '''Convert every datapoint in domain_data to Documents and flatten the result.

    Datapoints are processed in order with datapoint_to_documents; any
    exception it raises propagates to the caller unchanged.
    '''
    return [
        chunk_document
        for record in domain_data
        for chunk_document in datapoint_to_documents(text_splitter, record)
    ]

In [None]:
# Chunk every scraped datapoint into Documents for the vector store.
documents: List[Document] = domain_data_to_documents(text_splitter, domain_data)

In [None]:
# Spot-check the first three generated Documents.
pprint(documents[:3])

In [None]:
# WARNING: destructive. delete_all=True asks the store to drop its existing
# vectors before the freshly generated documents are added, so re-running this
# cell rebuilds the stored contents from scratch.
# NOTE(review): confirm the delete call does not error on a brand-new, empty
# index, and which namespace it targets.
vector_store.delete(delete_all=True)
vector_store.add_documents(documents)

In [None]:
# Test index by similarity search and retrieve top k matching documents.
# The metadata filter keeps only chunks whose 'lastmod_year' is the previous
# calendar year or later.
current_year = datetime.now().year
results = vector_store.similarity_search(
    'Professor computer vision research looking for grad student',
    k=3,
    filter={
        'lastmod_year': {'$gte': current_year - 1}
    },
)
# Print each hit's text followed by its metadata.
for res in results:
    print(f'* {res.page_content} [{res.metadata}]\n')