# IR24 Project Testing Notebook

#### Import Libraries

In [1]:
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from IPython.display import Markdown, display
import IR24_data

import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
import getpass
import os
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import pipeline

llm = HuggingFacePipeline(
      pipeline=pipeline(
        model="Qwen/Qwen2.5-0.5B-Instruct",
        task="text-generation",
        temperature=0.2,
        do_sample=True,
        repetition_penalty=1.1,
        max_new_tokens=400,
        device_map="auto"
      )
    )

Some parameters are on the meta device because they were offloaded to the cpu.


## Testing Data
### Wikipedia
https://en.wikipedia.org/wiki/History_of_Singapore

https://en.wikipedia.org/wiki/Early_history_of_Singapore 

https://en.wikipedia.org/wiki/Kingdom_of_Singapura 

https://en.wikipedia.org/wiki/Founding_years_of_modern_Singapore 

https://en.wikipedia.org/wiki/Singapore_in_the_Straits_Settlements 

https://en.wikipedia.org/wiki/Fall_of_Singapore

https://en.wikipedia.org/wiki/Japanese_occupation_of_Singapore 

https://en.wikipedia.org/wiki/Operation_Tiderace 

https://en.wikipedia.org/wiki/Colony_of_Singapore 

https://en.wikipedia.org/wiki/Self-governance_of_Singapore 

https://en.wikipedia.org/wiki/Singapore_in_Malaysia 

https://en.wikipedia.org/wiki/History_of_the_Republic_of_Singapore 

## Create docs

In [3]:
#urls = ["https://en.wikipedia.org/wiki/History_of_Singapore","https://en.wikipedia.org/wiki/Early_history_of_Singapore","https://en.wikipedia.org/wiki/Kingdom_of_Singapura","https://en.wikipedia.org/wiki/Founding_years_of_modern_Singapore","https://en.wikipedia.org/wiki/Singapore_in_the_Straits_Settlements","https://en.wikipedia.org/wiki/Fall_of_Singapore","https://en.wikipedia.org/wiki/Japanese_occupation_of_Singapore","https://en.wikipedia.org/wiki/Operation_Tiderace","https://en.wikipedia.org/wiki/Colony_of_Singapore","https://en.wikipedia.org/wiki/Self-governance_of_Singapore","https://en.wikipedia.org/wiki/Singapore_in_Malaysia", "https://en.wikipedia.org/wiki/History_of_the_Republic_of_Singapore"] 
articles = ["History_of_Singapore",
            "Early_history_of_Singapore",
            "Kingdom_of_Singapura",
            "Founding_years_of_modern_Singapore",
            "Singapore_in_the_Straits_Settlements",
            "Fall_of_Singapore",
            "Japanese_occupation_of_Singapore",
            "Operation_Tiderace",
            "Colony_of_Singapore",
            "Self-governance_of_Singapore",
            "Singapore_in_Malaysia", 
            "History_of_the_Republic_of_Singapore"] 

for article in articles:
    IR24_data.wiki_to_text(article, "wikidata/" + article + ".txt")
    #IR24_data.html_to_markdown(url, "wikidata/"+url[30:]+".md")

History_of_Singapore's content saved successfully!
Early_history_of_Singapore's content saved successfully!
Kingdom_of_Singapura's content saved successfully!
Founding_years_of_modern_Singapore's content saved successfully!
Singapore_in_the_Straits_Settlements's content saved successfully!
Fall_of_Singapore's content saved successfully!
Japanese_occupation_of_Singapore's content saved successfully!
Operation_Tiderace's content saved successfully!
Colony_of_Singapore's content saved successfully!
Self-governance_of_Singapore's content saved successfully!
Singapore_in_Malaysia's content saved successfully!
History_of_the_Republic_of_Singapore's content saved successfully!


In [4]:
# Split docs into chunks
# https://python.langchain.com/docs/how_to/recursive_text_splitter/
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
    is_separator_regex=False,
)

splits = []

for article in articles:
    path = "wikidata/" + article + ".txt"
    with open(path, encoding='UTF-8') as f:
        article = f.read()
        print("Splitting: ", path)
        doc_texts = text_splitter.create_documents([article])

        # Adding source information
        for doc_text in doc_texts:
            doc_text.metadata['source'] = path
            splits.append(doc_text)
    

print(len(splits))

"""
print(splits[1000].metadata)
print(splits[1000].page_content)
"""

Splitting:  wikidata/History_of_Singapore.txt
Splitting:  wikidata/Early_history_of_Singapore.txt
Splitting:  wikidata/Kingdom_of_Singapura.txt
Splitting:  wikidata/Founding_years_of_modern_Singapore.txt
Splitting:  wikidata/Singapore_in_the_Straits_Settlements.txt
Splitting:  wikidata/Fall_of_Singapore.txt
Splitting:  wikidata/Japanese_occupation_of_Singapore.txt
Splitting:  wikidata/Operation_Tiderace.txt
Splitting:  wikidata/Colony_of_Singapore.txt
Splitting:  wikidata/Self-governance_of_Singapore.txt
Splitting:  wikidata/Singapore_in_Malaysia.txt
Splitting:  wikidata/History_of_the_Republic_of_Singapore.txt
304


'\nprint(splits[1000].metadata)\nprint(splits[1000].page_content)\n'

### Embedding and Storing Data

In [5]:
embedding_model = HuggingFaceEmbeddings(
    model_name="thenlper/gte-small",
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)




In [6]:
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

: 

In [None]:
vectorstore.embeddings

In [None]:
# Adapted from: https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://en.wikipedia.org/wiki/History_of_Singapore"]
)
print(documents[0])

index = SummaryIndex.from_documents(documents)
print(index)

Doc ID: https://en.wikipedia.org/wiki/History_of_Singapore
Text: Jump to content  Main menu  Main menu  move to sidebar hide
Navigation    * [Main page](/wiki/Main_Page "Visit the main page
\[z\]")   * [Contents](/wiki/Wikipedia:Contents "Guides to browsing
Wikipedia")   * [Current events](/wiki/Portal:Current_events "Articles
related to current events")   * [Random article](/wiki/Special:Random
"Visit a ran...
<llama_index.core.indices.list.base.SummaryIndex object at 0x00000188176FA410>


In [None]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://en.wikipedia.org/wiki/History_of_Singapore",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

In [None]:
print(docs)

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/History_of_Singapore'}, page_content='')]
