## Initialization

In [None]:
import sys
import re
sys.path.append('../..')

## Articles loader

In [None]:
from langchain.document_loaders import PyPDFLoader
import glob

# Collect every PDF under ./articles. glob returns matches in arbitrary
# order, so sort them: later cells index `articles[0]`, and without a stable
# order that could refer to a different file from one run to the next.
pdf_files = sorted(glob.glob("./articles/*.pdf"))

# One loader per PDF file, in the same order as pdf_files.
loaders = [PyPDFLoader(file_path) for file_path in pdf_files]

# articles[i] is whatever loaders[i].load() returns for pdf_files[i]; later
# cells iterate it as a sequence of pages exposing `page_content` and
# `metadata`.
articles = [loader.load() for loader in loaders]


In [None]:
len(articles)

In [None]:
print(articles[0])

## Articles Splitting

In [None]:
text = "\n".join([page.page_content for page in articles[0]])

In [None]:
# Gather the distinct, non-empty 'source' values found on the pages of the
# first article, keeping first-seen order, then join them one per line.
page_sources = (page.metadata.get('source') for page in articles[0])
metadata_lines = list(dict.fromkeys(src for src in page_sources if src))
seen_sources = set(metadata_lines)

metadata = "\n".join(metadata_lines)

In [None]:
# Section titles recognised as headings. Each one is turned into a pattern
# matching a line that contains only that title, optionally padded with
# whitespace.
section_titles = (
    "Abstract",
    "Introduction",
    "Methods",
    "Methodology",
    "Results",
    "Discussion",
    "Conclusion",
    "References",
)
headers = [rf"^\s*{title}\s*$" for title in section_titles]

# (markdown prefix, metadata name) pairs handed to MarkdownHeaderTextSplitter.
headers_to_split_on = [("#", "Header")]

In [None]:
def convert_to_markdown(text, headers):
    """Mark section-title lines in *text* as level-1 Markdown headings.

    Args:
        text: Plain text; it is processed line by line (split on "\\n").
        headers: Regular-expression patterns. A line whose stripped form
            matches any of them (case-insensitively) is treated as a heading.

    Returns:
        The text with every heading line replaced by "# " followed by the
        stripped line; all other lines are returned unchanged.
    """

    def _render(raw_line):
        stripped = raw_line.strip()
        is_heading = any(
            re.match(pattern, stripped, re.IGNORECASE) for pattern in headers
        )
        return f"# {stripped}" if is_heading else raw_line

    return "\n".join(_render(raw_line) for raw_line in text.split("\n"))

# Convert the text to Markdown format
markdown_text = convert_to_markdown(text, headers)

In [None]:
# Split the Markdown text using MarkdownHeaderTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_text)
print(md_header_splits)

In [None]:
for doc in md_header_splits:
    doc.metadata['source'] = metadata

In [None]:
len(md_header_splits)

In [None]:
# Show the metadata of every split. Iterating the list directly (rather than
# range(len(...) - 1)) makes sure the last split is printed too.
for doc in md_header_splits:
    print(doc.metadata)

In [None]:
print(md_header_splits[0].page_content)

In [None]:
# TODO: implement small sections

## Vectorstores and Embedding

### Embedding

In [None]:
from langchain.embeddings.ollama import OllamaEmbeddings
embedding = OllamaEmbeddings(model="llama3")

### Vectorstores

In [None]:
from langchain.vectorstores import Chroma

In [None]:
persist_directory = './chroma/'

In [None]:
# vectordb = Chroma.from_documents(
#     documents=md_header_splits,
#     embedding=embedding,
#     persist_directory=persist_directory
# )

In [None]:
vectordb = Chroma(persist_directory="./chroma", embedding_function=embedding)

In [None]:
vectordb.persist()

In [None]:
print(vectordb._collection.count())

## Retrieval

In [None]:
from langchain_community.llms import Ollama
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
metadata_field_info = [
    AttributeInfo(
        name="Header",
        description="This is the header of the section from which this text originates",
        type="string",
    ),
    AttributeInfo(
        name="source",
        description="Path of the PDF file from which this chunk is derived. It is possible to observe the hash that symbolizes the name of the PDF, in addition to the number of questions it contains. The PDFs are medical scientific articles in the field of medicine. The path format is 'articles/hash(n).pdf', where 'hash' represents the PDF name, 'n' represents the number of related questions, and '.pdf' represents the file extension.",
        type="string",
    ),
]

document_content_description = "Articles"

In [None]:
llm = Ollama(
    model="llama3",
    temperature = 0
)

retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [None]:
question = "What is the main subject of the article?"
docs_ss = vectordb.similarity_search(question,k=3)

In [None]:
len(docs_ss)

In [None]:
# Print the text of each document returned by the similarity search.
for hit in docs_ss:
    print(hit.page_content)

In [None]:
# Print the metadata of each document returned by the similarity search.
for hit in docs_ss:
    print(hit.metadata)

In [None]:
# article_name = "1f90a31355e180e376a2a4f420ca51970a772882"

# question = f'what did they say about Melanoma in the article "{article_name}"?'
# docs = retriever.invoke(question)

In [None]:
# print(docs)

## Question Answering

In [None]:
question = "What are major topics for this article?"
docs = vectordb.similarity_search(question,k=3)
len(docs)
print(docs)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [None]:
# result = qa_chain({"query": question})

In [None]:
# result["result"]

In [None]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)


In [None]:
# question = "Is probability a class topic?"
# result = qa_chain({"query": question})

In [None]:
# result["result"]

## Chat

In [None]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [None]:
from langchain.chains import ConversationalRetrievalChain
# Build the QA chain for the chat section: a retriever over the vector store,
# the conversation memory created in the previous cell, and the custom prompt.
# NOTE(review): this cell imports ConversationalRetrievalChain but builds a
# RetrievalQA chain — confirm which one is intended.
# NOTE(review): with return_source_documents=True the chain presumably returns
# both the answer and the source documents, while the memory was created
# without an output_key; saving the context may therefore fail on the first
# call — TODO confirm against the LangChain memory documentation.
retriever=vectordb.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}

)

In [None]:
# question = "Is immunity a article topic?"
# result = qa_chain({"query": question})
# result['result']

In [None]:
# question = "why are those prerequesites needed to understand this topic?"
# result = qa_chain({"query": question})
# result['result']

## Analyser test

In [None]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [None]:
# Inputs for the section-analysis prompt; fill these in before running.
article = ""
quizz_question = ""
possible_answers = ""
answer_key = ""

# Prompt skeleton with named placeholders for the values above.
question_template = """The following question is based on the article: {article}. 
The question statement, possible answers, and the correct answer are defined as follows:
Question: {quizz_question}
Possible answers: {possible_answers}
Answer key: {answer_key}

Please analyze the entire article {article} and determine as accurately as possible which section of the article this question pertains to. Also, justify your choice."""

# Substitute the placeholders now: `question` is handed to the chain as the
# plain query text, so unformatted "{article}"-style markers would otherwise
# be sent literally.
question = question_template.format(
    article=article,
    quizz_question=quizz_question,
    possible_answers=possible_answers,
    answer_key=answer_key,
)

In [None]:
# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

In [None]:
retriever=vectordb.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=False,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}

)

In [None]:
result = qa({"query": question}, return_only_outputs=True)
result['result']

In [None]:
section_counts = {
        "Introduction": 0,
        "Background": 0,
        "Methods": 0,
        "Results": 0,
        "Discussion": 0,
        "Conclusion": 0
    }

In [None]:
# result['result'] is expected to be a single answer string. Looping over a
# string walks it character by character, so no section name could ever be
# found in an item; wrap a lone string in a list so each answer is inspected
# as a whole.
outputs = result['result']
if isinstance(outputs, str):
    outputs = [outputs]

for output in outputs:
    # Introduction and Background are counted together: a mention of either
    # one increments both counters.
    if "Introduction" in output or "Background" in output:
        section_counts["Introduction"] += 1
        section_counts["Background"] += 1
    if "Methods" in output:
        section_counts["Methods"] += 1
    if "Results" in output:
        section_counts["Results"] += 1
    if "Discussion" in output:
        section_counts["Discussion"] += 1
    if "Conclusion" in output:
        section_counts["Conclusion"] += 1

In [None]:
print(section_counts)

In [None]:
from transformers import pipeline

# Candidate sections the answer can be attributed to.
labels = ["Introduction", "Methods", "Results", "Discussion", "Conclusion"]

# Load the zero-shot text classification pipeline.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Classify the chain's answer against the candidate sections.
text = result['result']
classification = classifier(text, labels)

# The first entry of the returned labels is taken as the predicted section.
predicted_section = classification['labels'][0]

# Start a fresh tally with every section at zero, then count the prediction.
section_counts = dict.fromkeys(labels, 0)
section_counts[predicted_section] += 1

# Show the predicted section and the updated counts.
print(f"Predicted section: {predicted_section}")
print("Section counts:", section_counts)


## Analyser

In [None]:
def transform_data(
    questions_data,
    answers_data,
    article_id="1f90a31355e180e376a2a4f420ca51970a772882.pdf",
):
    """Build one prompt-ready dict per quiz question.

    Args:
        questions_data: Iterable of mappings with at least the keys "id" and
            "questiontext"; an optional "answer_key" entry is passed through.
        answers_data: Iterable of mappings with the keys "question" (the id
            of the question the answer belongs to), "fraction" (anything
            ``float()`` accepts) and "answer".
        article_id: File name of the article the questions refer to. Defaults
            to the article this notebook was written for.

    Returns:
        A list with one dict per question, in input order, holding the keys
        "article", "quizz_question", "possible_answers" (answers joined with
        ";") and "answer_key" ("" when the question has none).

    Raises:
        KeyError: If a required key is missing from a question or answer.
        ValueError: If a "fraction" value cannot be converted to float.
    """
    # Group answer texts by question id.
    # NOTE(review): only answers with a positive fraction are kept, so
    # "possible_answers" lists just those — confirm this is intended.
    answer_map = {}
    for answer in answers_data:
        kept_answers = answer_map.setdefault(answer["question"], [])
        if float(answer["fraction"]) > 0:
            kept_answers.append(answer["answer"])

    return [
        {
            "article": article_id,
            "quizz_question": question["questiontext"],
            "possible_answers": ";".join(answer_map.get(question["id"], [])),
            "answer_key": question.get("answer_key", ""),
        }
        for question in questions_data
    ]

# Uso da função para transformar os dados
questions = transform_data(questions_data, answers_data)

print(questions)

In [None]:
ch_question = questions[5]
question = ch_question['quizz_question']
respostas_possiveis = ch_question['possible_answers']

article = "1f90a31355e180e376a2a4f420ca51970a772882.pdf"
quizz_question = ch_question['quizz_question']
possible_answers = ch_question['possible_answers']

In [61]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# Fill the placeholders before sending: `question` is handed to the chain as
# the plain query text, so unformatted "{article}"-style markers would
# otherwise be sent literally. The values come from the selected quiz
# question prepared in the previous cell.
question = """The following question is based on the article: {article}. 
The question statement, possible answers, and the correct answer are defined as follows:
Question: {quizz_question}
Possible answers: {possible_answers}

Please analyze the entire article {article} and determine as accurately as possible which section of the article this question pertains to. Also, justify your choice.""".format(
    article=article,
    quizz_question=quizz_question,
    possible_answers=possible_answers,
)

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

retriever=vectordb.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=False,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}

)

result = qa({"query": question}, return_only_outputs=True)
result['result']

from transformers import pipeline

# Candidate sections the answer can be attributed to.
labels = ["Introduction", "Methods", "Results", "Discussion", "Conclusion"]

# Load the zero-shot text classification pipeline.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Classify the chain's answer against the candidate sections.
text = result['result']
classification = classifier(text, labels)

# The first entry of the returned labels is taken as the predicted section.
predicted_section = classification['labels'][0]

# Start a fresh tally with every section at zero, then count the prediction.
section_counts = dict.fromkeys(labels, 0)
section_counts[predicted_section] += 1

# Show the predicted section and the updated counts.
print(f"Predicted section: {predicted_section}")
print("Section counts:", section_counts)


: 