In [1]:
import sys
import re
sys.path.append('../..')

In [2]:
from langchain.document_loaders import PyPDFLoader
import glob

pdf_files = glob.glob("./articles/*.pdf")

loaders = [PyPDFLoader(file_path) for file_path in pdf_files]

articles = []
for loader in loaders:
    article = loader.load()
    articles.append(article)


In [3]:
len(articles)

10

In [4]:
headers = [
    r"^\s*Abstract\s*$",
    r"^\s*Introduction\s*$",
    r"^\s*Methods\s*$",
    r"^\s*Methodology\s*$",
    r"^\s*Results\s*$",
    r"^\s*Discussion\s*$",
    r"^\s*Conclusion\s*$",
    r"^\s*References\s*$"
]

headers_to_split_on = [
    ("#", "Header"),
]

In [5]:
def convert_to_markdown(text, headers):
        lines = text.split("\n")
        markdown_text = []
        
        for line in lines:
            header_found = False
            for header in headers:
                if re.match(header, line.strip(), re.IGNORECASE):
                    markdown_text.append(f"# {line.strip()}")
                    header_found = True
                    break
            if not header_found:
                markdown_text.append(line)
        
        return "\n".join(markdown_text)

In [6]:
md_header_splits_list = []

for article in articles:
    print(article)
    text = "\n".join([page.page_content for page in article])

    seen_sources = set()
    metadata_lines = []

    for page in articles[0]:
        source = page.metadata.get('source')
        if source and source not in seen_sources:
            metadata_lines.append(source)
            seen_sources.add(source)

    metadata = "\n".join(metadata_lines)

    # Convertendo o texto para formato Markdown
    markdown_text = convert_to_markdown(text, headers)

    # Dividindo o texto Markdown usando MarkdownHeaderTextSplitter
    from langchain.text_splitter import MarkdownHeaderTextSplitter
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(markdown_text)

    for doc in md_header_splits:
        doc.metadata['source'] = metadata

    md_header_splits_list.append(md_header_splits)


[Document(page_content='The new england journal of medicine\nn engl j med 372;26 nejm.org june 25, 20152521\noriginal articlePembrolizumab versus Ipilimumab  \nin Advanced Melanoma\nCaroline Robert, M.D., Ph.D., Jacob Schachter, M.D., Georgina V. Long, M.D., Ph.D., \nAna Arance, M.D., Ph.D., Jean Jacques Grob, M.D., Ph.D., Laurent Mortier, M.D., Ph.D., \nAdil Daud, M.D., Matteo S. Carlino, M.B., B.S., Catriona McNeil, M.D., Ph.D., \nMichal Lotem, M.D., James Larkin, M.D., Ph.D., Paul Lorigan, M.D.,  \nBart Neyns, M.D., Ph.D., Christian U. Blank, M.D., Ph.D., Omid Hamid, M.D., \nChristine Mateus, M.D., Ronnie Shapira-Frommer, M.D., Michele Kosh, R.N., B.S.N., \nHonghong Zhou, Ph.D., Nageatte Ibrahim, M.D., Scot Ebbinghaus, M.D.,  \nand Antoni Ribas, M.D., Ph.D., for the KEYNOTE-006 investigators*\nThe authors’ affiliations are listed in the \nAppendix. Address reprint requests to Dr. Robert at Gustave Roussy and Paris-Sud University, 114 Rue Edouard Vaillant, 94805 Villejuif Paris-Sud, 

In [7]:
len(md_header_splits_list)

10

In [8]:
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
embedding = OllamaEmbeddings(model="llama3")

In [9]:
persist_directory = './chroma/'

In [10]:
for header_split in md_header_splits_list:
    vectordb = Chroma.from_documents(
        documents=header_split,
        embedding=embedding,
        persist_directory=persist_directory
    )



In [11]:
vectordb.persist()

In [12]:
print(vectordb._collection.count())

53
