In [None]:
import scrapy
import chromadb
import requests
import os
from bs4 import SoupStrainer, BeautifulSoup
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader,  WikipediaLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

from dotenv import load_dotenv
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast when the OpenAI key is missing, but never leave the key itself as the
# cell's last expression: Jupyter would render the secret into the saved output.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or a .env file."
    )

In [None]:
# Wikipedia search terms: the musical itself, Henry VIII's six wives, and Henry VIII.
queries = [
    "Six (musical)",
    "Catherine of Aragon",
    "Anne Boleyn",
    "Jane Seymour",
    "Anne of Cleves",
    "Catherine Howard",
    "Catherine Parr",
    "Henry VIII",
]

# Pool the documents returned for every search term (at most one per term,
# each capped at 80000 characters of content).
all_docs = []
for query in queries:
    loader = WikipediaLoader(query=query, load_max_docs=1, doc_content_chars_max=80000)
    all_docs += loader.load()

# Spot-check the document at index 6: its length in characters, then its full text.
sample_doc = all_docs[6]
print(len(sample_doc.page_content))
print(sample_doc.page_content)

In [None]:
import json

# Flatten every loaded document into a plain dict (title + body text) so the
# collection can be stored as JSON; a missing title falls back to "".
serializable_docs = [
    {
        "title": doc.metadata.get("title", ""),
        "content": doc.page_content,
    }
    for doc in all_docs
]

# Write the flattened documents to documents.json in the working directory.
with open('documents.json', 'w') as json_file:
    json_file.write(json.dumps(serializable_docs))

In [None]:
# Alternative ingestion path that does not use WikipediaLoader: fetch the article
# pages directly and parse only paragraph and heading tags.
bs4_strainer = SoupStrainer(
    ["p", "h1", "h2", "h3", "h4", "h5", "h6"]
)
loader = WebBaseLoader(
    web_paths=(
        "https://en.wikipedia.org/wiki/Six_(musical)",
        "https://en.wikipedia.org/wiki/Catherine_of_Aragon",
        "https://en.wikipedia.org/wiki/Anne_Boleyn",
        # Fixed: this URL previously carried a trailing space inside the string.
        "https://en.wikipedia.org/wiki/Jane_Seymour",
        "https://en.wikipedia.org/wiki/Anne_of_Cleves",
        "https://en.wikipedia.org/wiki/Catherine_Howard",
        "https://en.wikipedia.org/wiki/Catherine_Parr",
        "https://en.wikipedia.org/wiki/Henry_VIII",
    ),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()


# Spot-check the page at index 6: its length in characters, then its full text.
print(len(docs[6].page_content))
print(docs[6].page_content)

In [None]:
import json
from langchain.docstore.document import Document

# Read the title/content records back from documents.json.
with open('documents.json', 'r') as f:
    loaded_docs = json.load(f)

print(loaded_docs)

# Rebuild one Document per record, keeping the title as its only metadata field.
json_docs = [
    Document(metadata={"title": record["title"]}, page_content=record["content"])
    for record in loaded_docs
]

# Split the rebuilt documents into overlapping chunks; add_start_index records
# each chunk's offset within its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1300,
    chunk_overlap=200,
    add_start_index=True,
)
json_documents = text_splitter.split_documents(json_docs)

# Report the number of chunks, then the size and text of the first one.
print(len(json_documents))
first_chunk = json_documents[0]
print(len(first_chunk.page_content))
print(first_chunk.page_content)



In [None]:
# Embed and index the JSON-derived chunks under an explicit collection name.
# Without one, Chroma falls back to its default collection name, so any other
# unnamed index built in this session would be mixed into the same collection.
# NOTE(review): re-running this cell in the same kernel likely adds the chunks
# again to this collection - restart the kernel for a clean index.
vectorstore = Chroma.from_documents(
    documents=json_documents,
    embedding=OpenAIEmbeddings(model="text-embedding-ada-002"),
    collection_name="six_json_chunks",
)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
retrieved_docs = retriever.invoke("Anne Boleyn's relationship with Henry VIII")

# Print metadata followed by text for every hit, in rank order. Iterating over the
# result (rather than indexing 0..5) stays safe if fewer than k chunks come back.
for hit in retrieved_docs:
    print(hit.metadata)
    print(hit.page_content)



In [None]:
retrieved_docs = retriever.invoke("Who is Anne Boleyn")

# Print metadata followed by text for each of the six hits, in rank order.
for rank in range(6):
    print(retrieved_docs[rank].metadata)
    print(retrieved_docs[rank].page_content)

In [None]:
print(all_docs)

# Chunk the WikipediaLoader documents; add_start_index records each chunk's
# offset within its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1300,
    chunk_overlap=100,
    add_start_index=True,
)
documents = text_splitter.split_documents(all_docs)

# Report the number of chunks, then inspect the second chunk (index 1):
# its size, metadata and text.
print(len(documents))
second_chunk = documents[1]
print(len(second_chunk.page_content))
print(second_chunk.metadata)
print(second_chunk.page_content)

In [None]:
# Embed and index the WikipediaLoader chunks under an explicit collection name,
# which keeps them apart from any index built earlier in the session under
# Chroma's default collection name.
# NOTE(review): re-running this cell in the same kernel likely adds the chunks
# again to this collection - restart the kernel for a clean index.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(model="text-embedding-ada-002"),
    collection_name="six_wikipedia_chunks",
)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [None]:
retrieved_docs = retriever.invoke("Who was Anne Boleyn?")

# Print metadata followed by text for each of the six hits, in rank order.
for rank in range(6):
    print(retrieved_docs[rank].metadata)
    print(retrieved_docs[rank].page_content)

In [None]:
# Smoke-test the chat model with a trivial prompt.
# temperature is set on the model rather than passed per call.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
prompt = "Hello, World!"

# Use invoke(): calling the model object directly is the deprecated __call__
# path, which expects message objects rather than raw role/content dicts.
response = llm.invoke(prompt)

print(response.content)