In [37]:
import scrapy
import chromadb
import requests
import os
from bs4 import SoupStrainer, BeautifulSoup
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader,  WikipediaLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

from dotenv import load_dotenv
load_dotenv()

os.environ["OPENAI_API_KEY"]

queries = ["Six (musical)", "Catherine of Aragon", "Anne Boleyn", "Jane Seymour", "Anne of Cleves", "Catherine Howard", "Catherine Parr", "Henry VIII"]

# Initialize an empty list to hold all documents
all_docs = []
# Loop through the queries and load documents
for query in queries:
    loader = WikipediaLoader(
        query=query,
        load_max_docs=1,
        doc_content_chars_max=80000
    )
    all_docs.extend(loader.load())

print(len(all_docs[6].page_content))
# Optionally print the text content to verify
print(all_docs[6].page_content)

29126
Catherine Parr (she signed her letters as Kateryn; 1512 – 5 September 1548) was Queen of England and Ireland as the last of the six wives of King Henry VIII from their marriage on 12 July 1543 until Henry's death on 28 January 1547. Catherine was the final queen consort of the House of Tudor, and outlived Henry by a year and eight months. With four husbands, she is the most-married English queen. She was the first woman to publish in print an original work under her own name in England in the English language.
Catherine enjoyed a close relationship with Henry's three children, Mary, Elizabeth and Edward. She was personally involved in the education of Elizabeth and Edward. She was influential in Henry's passing of the Third Succession Act in 1543 that restored his daughters Mary and Elizabeth to the line of succession to the throne. Catherine was appointed regent from July to September 1544 while Henry was on a military campaign in France; in the event that he lost his life, she 

In [31]:
#not using WikipediaLoader
bs4_strainer = SoupStrainer(
    ["p", "h1", "h2", "h3", "h4", "h5", "h6"]
) 
loader = WebBaseLoader(
    web_paths=(
        "https://en.wikipedia.org/wiki/Six_(musical)",
        "https://en.wikipedia.org/wiki/Catherine_of_Aragon",
        "https://en.wikipedia.org/wiki/Anne_Boleyn",
        "https://en.wikipedia.org/wiki/Jane_Seymour ",
        "https://en.wikipedia.org/wiki/Anne_of_Cleves",
        "https://en.wikipedia.org/wiki/Catherine_Howard",
        "https://en.wikipedia.org/wiki/Catherine_Parr",
        "https://en.wikipedia.org/wiki/Henry_VIII",
    ),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()


print(len(docs[6].page_content))
# Optionally print the text content to verify
print(docs[6].page_content)

28220
ContentsCatherine Parr
Catherine Parr (she signed her letters as Kateryn; 1512 – 5 September 1548[2][4]) was Queen of England and Ireland as the last of the six wives of King Henry VIII from their marriage on 12 July 1543 until Henry's death on 28 January 1547. Catherine was the final queen consort of the House of Tudor, and outlived Henry by a year and eight months. With four husbands, she is the most-married English queen. She was the first woman to publish in print an original work under her own name in England in the English language.[a][6]
Catherine enjoyed a close relationship with Henry's three children, Mary, Elizabeth and Edward. She was personally involved in the education of Elizabeth and Edward. She was influential in Henry's passing of the Third Succession Act in 1543 that restored his daughters Mary and Elizabeth to the line of succession to the throne.[7] Catherine was appointed regent from July to September 1544 while Henry was on a military campaign in France; in

In [94]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1300, chunk_overlap=100, add_start_index=True)
documents = text_splitter.split_documents(docs)

print(len(documents))
print(len(documents[99].page_content))
print(documents[99].page_content)

397
769
Catherine and Henry married in 1509 but eventually he became dubious about the marriage's validity, claiming that Catherine's inability to provide an heir was a sign of God's displeasure. His feelings for Anne, and her refusals to become his mistress, probably contributed to Henry's decision that no pope had a right to overrule the Bible. This meant that he had been living in sin with Catherine, although Catherine hotly contested this and refused to concede that her marriage to Arthur had been consummated.[60] It also meant that his daughter Mary was a bastard, and that the new pope (Clement VII) would have to admit the previous pope's mistake and annul the marriage. Henry's quest for an annulment became euphemistically known as the "King's Great Matter".[61]


In [95]:
vectorstore = Chroma.from_documents(documents=documents, embedding=OpenAIEmbeddings(model="text-embedding-ada-002"))
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [103]:
retrieved_docs = retriever.invoke("Who was Anne Boleyn?")
print(retrieved_docs[5].page_content)

ContentsAnne Boleyn
Anne Boleyn (/ˈbʊlɪn, bʊˈlɪn/;[7][8][9] c. 1501 or 1507 – 19 May 1536) was Queen of England from 1533 to 1536, as the second wife of King Henry VIII. The circumstances of her marriage and execution, by beheading for treason, made her a key figure in the political and religious upheaval that marked the start of the English Reformation.
Anne was the daughter of Thomas Boleyn (later Earl of Wiltshire), and his wife, Elizabeth Howard, and was educated in the Netherlands and France. Anne returned to England in early 1522, to marry her cousin James Butler, 9th Earl of Ormond; the marriage plans were broken off, and instead, she secured a post at court as maid of honour to Henry VIII's wife, Catherine of Aragon. Early in 1523, Anne was secretly betrothed to Henry Percy, son of Henry Percy, 5th Earl of Northumberland, but the betrothal was broken off when the Earl refused to support it. Cardinal Thomas Wolsey refused the match in January 1524.


In [None]:
llm = ChatOpenAI(model="gpt-4o-mini")