## Initialization

In [34]:
import sys
import re
sys.path.append('../..')

## Articles loader

In [35]:
from langchain.document_loaders import PyPDFLoader
loaders = [
    PyPDFLoader("./articles/0dacb0b566cb38438b84aa092176c6d42bf4402d.pdf")
]

articles = []
for loader in loaders:
    article = loader.load()
    articles.append(article)

In [36]:
print(articles[0])

[Document(page_content='\xa0\n\xa0\xa0\xa0\xa0\xa0Death,\xa0Depression, \xa0Disability, \xa0and\xa0Dementia \xa0Associated \xa0With\xa0Self‐reported\xa0Hearing\xa0Problems: \xa0A\xa0\n25‐Year\xa0Study\xa0\n\xa0With\xa0approximately \xa030\xa0per\xa0cent\xa0of\xa0elders\xa0aged\xa065\xa0and\xa0older\xa0having\xa0some\xa0degree\xa0of\xa0hearing\xa0loss,\xa0and\xa0\n70\xa0per\xa0cent\xa0to\xa090\xa0per\xa0cent\xa0over\xa0the\xa0age\xa0of\xa085,\xa0hearing\xa0loss\xa0is\xa0the\xa0third\xa0\nmost\xa0prevalent \xa0chronic\xa0\xa0health\xa0\ncondition \xa0affecting\xa0older\xa0adults\xa0(1).\xa0A\xa0wide\xa0range\xa0of\xa0implications \xa0for\xa0health\xa0and\xa0general\xa0well‐being\xa0\nhave\xa0been\xa0reported, \xa0among\xa0which\xa0social\xa0isolation\xa0and\xa0depression \xa0(2,3),\xa0altered\xa0physical\xa0function\xa0(4),\xa0\nreduced\xa0activity\xa0participation \xa0(5),\xa0lower\xa0quality\xa0of\xa0life\xa0(6),\xa0falls\xa0(7),\xa0greater\xa0cognitive\xa0decline\xa0(8),\xa0or\xa0\nhi

## Articles Splitting

In [37]:
text = "\n".join([page.page_content for page in articles[0]])

In [38]:
seen_sources = set()
metadata_lines = []

for page in articles[0]:
    source = page.metadata.get('source')
    if source and source not in seen_sources:
        metadata_lines.append(source)
        seen_sources.add(source)

metadata = "\n".join(metadata_lines)

In [39]:
headers = [
    r"^\s*Abstract\s*$",
    r"^\s*Introduction\s*$",
    r"^\s*Methods\s*$",
    r"^\s*Methodology\s*$",
    r"^\s*Results\s*$",
    r"^\s*Discussion\s*$",
    r"^\s*Conclusion\s*$",
    r"^\s*References\s*$"
]

headers_to_split_on = [
    ("#", "Header"),
]

In [40]:
def convert_to_markdown(text, headers):
    lines = text.split("\n")
    markdown_text = []
    
    for line in lines:
        header_found = False
        for header in headers:
            if re.match(header, line.strip(), re.IGNORECASE):
                markdown_text.append(f"# {line.strip()}")
                header_found = True
                break
        if not header_found:
            markdown_text.append(line)
    
    return "\n".join(markdown_text)

# Convertendo o texto para formato Markdown
markdown_text = convert_to_markdown(text, headers)

In [41]:
# Dividindo o texto Markdown usando MarkdownHeaderTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_text)
print(md_header_splits)

[Document(page_content='Death,\xa0Depression, \xa0Disability, \xa0and\xa0Dementia \xa0Associated \xa0With\xa0Self‐reported\xa0Hearing\xa0Problems: \xa0A\n25‐Year\xa0Study\nWith\xa0approximately \xa030\xa0per\xa0cent\xa0of\xa0elders\xa0aged\xa065\xa0and\xa0older\xa0having\xa0some\xa0degree\xa0of\xa0hearing\xa0loss,\xa0and\n70\xa0per\xa0cent\xa0to\xa090\xa0per\xa0cent\xa0over\xa0the\xa0age\xa0of\xa085,\xa0hearing\xa0loss\xa0is\xa0the\xa0third\nmost\xa0prevalent \xa0chronic\xa0\xa0health\ncondition \xa0affecting\xa0older\xa0adults\xa0(1).\xa0A\xa0wide\xa0range\xa0of\xa0implications \xa0for\xa0health\xa0and\xa0general\xa0well‐being\nhave\xa0been\xa0reported, \xa0among\xa0which\xa0social\xa0isolation\xa0and\xa0depression \xa0(2,3),\xa0altered\xa0physical\xa0function\xa0(4),\nreduced\xa0activity\xa0participation \xa0(5),\xa0lower\xa0quality\xa0of\xa0life\xa0(6),\xa0falls\xa0(7),\xa0greater\xa0cognitive\xa0decline\xa0(8),\xa0or\nhigher\xa0risk\xa0of\xa0dementia \xa0(9).\xa0Despite\xa0such\xa0

In [42]:
for doc in md_header_splits:
    doc.metadata['source'] = metadata

In [43]:
len(md_header_splits)

5

In [44]:
print(md_header_splits)

[Document(page_content='Death,\xa0Depression, \xa0Disability, \xa0and\xa0Dementia \xa0Associated \xa0With\xa0Self‐reported\xa0Hearing\xa0Problems: \xa0A\n25‐Year\xa0Study\nWith\xa0approximately \xa030\xa0per\xa0cent\xa0of\xa0elders\xa0aged\xa065\xa0and\xa0older\xa0having\xa0some\xa0degree\xa0of\xa0hearing\xa0loss,\xa0and\n70\xa0per\xa0cent\xa0to\xa090\xa0per\xa0cent\xa0over\xa0the\xa0age\xa0of\xa085,\xa0hearing\xa0loss\xa0is\xa0the\xa0third\nmost\xa0prevalent \xa0chronic\xa0\xa0health\ncondition \xa0affecting\xa0older\xa0adults\xa0(1).\xa0A\xa0wide\xa0range\xa0of\xa0implications \xa0for\xa0health\xa0and\xa0general\xa0well‐being\nhave\xa0been\xa0reported, \xa0among\xa0which\xa0social\xa0isolation\xa0and\xa0depression \xa0(2,3),\xa0altered\xa0physical\xa0function\xa0(4),\nreduced\xa0activity\xa0participation \xa0(5),\xa0lower\xa0quality\xa0of\xa0life\xa0(6),\xa0falls\xa0(7),\xa0greater\xa0cognitive\xa0decline\xa0(8),\xa0or\nhigher\xa0risk\xa0of\xa0dementia \xa0(9).\xa0Despite\xa0such\xa0

In [45]:
# TODO: implement small sections

## Vectorstores and Embedding

### Embedding

In [46]:
from langchain.embeddings.ollama import OllamaEmbeddings
embedding = OllamaEmbeddings(model="llama3")

### Vectorstores

In [47]:
from langchain.vectorstores import Chroma

In [48]:
persist_directory = './chroma/'

In [49]:
vectordb = Chroma.from_documents(
    documents=md_header_splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [50]:
# vectordb = Chroma(persist_directory="./chroma_db", embedding_function=embedding)

In [51]:
vectordb.persist()

In [52]:
print(vectordb._collection.count())

10


## Retrival

In [53]:
from langchain_community.llms import Ollama
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [54]:
metadata_field_info = [
    AttributeInfo(
        name="Header",
        description="This is the header of the section from which this text originates",
        type="string",
    ),
    AttributeInfo(
        name="source",
        description="Path of the PDF file from which this chunk is derived. It is possible to observe the hash that symbolizes the name of the PDF, in addition to the number of questions it contains. The PDFs are medical scientific articles in the field of medicine. The path format is 'articles/hash(n).pdf', where 'hash' represents the PDF name, 'n' represents the number of related questions, and '.pdf' represents the file extension.",
        type="string",
    ),
]

document_content_description = "Articles"

In [55]:
llm = Ollama(
    model="llama3",
    temperature = 0
)

retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [65]:
question = "what did they say about depression?"
docs_ss = vectordb.similarity_search(question,k=3)

In [66]:
docs_ss[0].page_content[:100]

'Participants\nMean age of the whole sample at baseline was 75.3 (SD\xa0=\xa06.8), 57.8 per cent were women.'

In [71]:
article_name = "0dacb0b566cb38438b84aa092176c6d42bf4402d"

question = f'what did they say about Depression in the article "{article_name}"?'
docs = retriever.invoke("What are some movies about dinosaurs")

## Usar retriver para pegar os splits de texto para as perguntas.

In [72]:
print(docs)

[Document(page_content='1. Chien W, Lin FR. Prevalence of hearing aid use among older adults in the United States. Arch Intern Med. 2012;172:292–293. doi: 10.1001/archinternmed.2011.1408.\n2. Li C-M, Zhang X, Hoffman HJ, Cotch MF, Themann CL, Wilson MR.\nHearing impairment associated with depression in US adults, national\nhealth and nutrition examination survey 2005–2010. JAMA Otolaryngol\nHead Neck Surg. 2014;140:293–302. doi:10.1001/jamaoto.2014.42.\n3. Kiely KM, Anstey KJ, Luszcz MA. Dual sensory loss and depressive symp-toms: the importance of hearing, daily functioning, and activity engage-ment. Front Hum Neurosci. 2013;7:837. doi: 10.3389/fnhum.2013.00837.\n4. Chen DS, Betz J, Yaffe K, et\xa0al.; Health ABC study. Association of hearing\nimpairment with declines in physical functioning and the risk of disabil-\nity in older adults. J Gerontol A\xa0Biol Sci Med Sci. 2015;70:654–661. doi: 10.1093/gerona/glu207.\n5. Resnick HE, Fries BE, Verbrugge LM. Windows to their world: the ef

## Question Answering

## Chat