## Initialization

In [None]:
import sys
import re
sys.path.append('../..')

## Articles loader

In [None]:
from langchain.document_loaders import PyPDFLoader
import glob

# Sort the matches: glob returns paths in arbitrary, OS-dependent order, and
# later cells index into `articles` positionally (articles[0]).
pdf_files = sorted(glob.glob("./articles/*.pdf"))

loaders = [PyPDFLoader(file_path) for file_path in pdf_files]

# One entry per PDF: whatever loader.load() returns for that file (later cells
# iterate it as a sequence of pages with .page_content and .metadata).
articles = [loader.load() for loader in loaders]


In [None]:
len(articles)

In [None]:
print(articles[0])

## Articles Splitting

In [None]:
# Join the text of every page of the first article, separated by newlines.
first_article_pages = articles[0]
text = "\n".join(page.page_content for page in first_article_pages)

In [None]:
# Collect the distinct, non-empty 'source' values of the first article's pages,
# keeping the order in which they first appear.
page_sources = (page.metadata.get('source') for page in articles[0])
metadata_lines = list(dict.fromkeys(src for src in page_sources if src))
seen_sources = set(metadata_lines)

# One source per line
metadata = "\n".join(metadata_lines)

In [None]:
# Regex patterns for lines that consist solely of a standard section title
# (optionally surrounded by whitespace). They are matched case-insensitively
# by convert_to_markdown.
headers = [
    r"^\s*Abstract\s*$",
    r"^\s*Introduction\s*$",
    r"^\s*Methods\s*$",
    r"^\s*Methodology\s*$",
    r"^\s*Results\s*$",
    r"^\s*Discussion\s*$",
    r"^\s*Conclusion\s*$",
    r"^\s*References\s*$"
]

# (markdown marker, metadata key) pairs handed to MarkdownHeaderTextSplitter.
# The "Header" key is the same name declared later as a self-query attribute.
headers_to_split_on = [
    ("#", "Header"),
]

In [None]:
def convert_to_markdown(text, headers):
    """Turn recognised section-title lines of *text* into Markdown headers.

    Each line is stripped and tested against every regex in *headers*
    (case-insensitive, anchored at the start of the line). A matching line
    is replaced by ``"# "`` followed by the stripped line; all other lines
    are kept exactly as they were.

    Args:
        text: Plain text with lines separated by ``"\\n"``.
        headers: Iterable of regex pattern strings identifying title lines.

    Returns:
        The converted text, with lines re-joined by ``"\\n"``.
    """
    converted = []
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        is_header = any(
            re.match(pattern, stripped, re.IGNORECASE) for pattern in headers
        )
        converted.append(f"# {stripped}" if is_header else raw_line)
    return "\n".join(converted)

# Convert the article text to Markdown so section titles become "# " headers
markdown_text = convert_to_markdown(text, headers)

In [None]:
# Split the Markdown text into per-header sections using MarkdownHeaderTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_text)
print(md_header_splits)

In [None]:
# Tag every section with the source string collected from the article's pages
for doc in md_header_splits:
    doc.metadata['source'] = metadata

In [None]:
len(md_header_splits)

In [None]:
# Show the metadata of every section. The previous range(len(...) - 1) loop
# stopped one element early and never printed the last section.
for section in md_header_splits:
    print(section.metadata)

In [None]:
print(md_header_splits[0].page_content)

In [None]:
# TODO: implement small sections

## Vectorstores and Embedding

### Embedding

In [None]:
from langchain.embeddings.ollama import OllamaEmbeddings
embedding = OllamaEmbeddings(model="llama3")

### Vectorstores

In [None]:
from langchain.vectorstores import Chroma

In [None]:
persist_directory = './chroma/'

In [None]:
# vectordb = Chroma.from_documents(
#     documents=md_header_splits,
#     embedding=embedding,
#     persist_directory=persist_directory
# )

In [None]:
# Instantiate Chroma on the ./chroma directory with the Ollama embedding
# function. The path is given as a literal here rather than through the
# persist_directory variable defined in an earlier cell.
vectordb = Chroma(persist_directory="./chroma", embedding_function=embedding)

In [None]:
vectordb.persist()

In [None]:
print(vectordb._collection.count())

## Retrieval

In [None]:
from langchain_community.llms import Ollama
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
# Descriptions of the metadata fields exposed to the self-query retriever.
# "Header" and "source" are the metadata keys set on the sections when the
# article is split and tagged.
metadata_field_info = [
    AttributeInfo(
        name="Header",
        description="This is the header of the section from which this text originates",
        type="string",
    ),
    AttributeInfo(
        name="source",
        description="Path of the PDF file from which this chunk is derived. It is possible to observe the hash that symbolizes the name of the PDF, in addition to the number of questions it contains. The PDFs are medical scientific articles in the field of medicine. The path format is 'articles/hash(n).pdf', where 'hash' represents the PDF name, 'n' represents the number of related questions, and '.pdf' represents the file extension.",
        type="string",
    ),
]

# Short description of the document contents passed to the self-query retriever
document_content_description = "Articles"

In [None]:
# Local llama3 model served through Ollama; temperature is set to 0.
llm = Ollama(
    model="llama3",
    temperature = 0
)

# Self-query retriever built from the LLM, the vector store, the content
# description and the metadata field descriptions declared above.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [None]:
question = "What is the main subject of the article?"
docs_ss = vectordb.similarity_search(question,k=3)

In [None]:
len(docs_ss)

In [None]:
# Print the text of each retrieved chunk
for retrieved in docs_ss:
    print(retrieved.page_content)

In [None]:
# Print the metadata of each retrieved chunk
for retrieved in docs_ss:
    print(retrieved.metadata)

In [None]:
# article_name = "1f90a31355e180e376a2a4f420ca51970a772882"

# question = f'what did they say about Melanoma in the article "{article_name}"?'
# docs = retriever.invoke(question)

In [None]:
# print(docs)

## Question Answering

In [None]:
question = "What are major topics for this article?"
# Fetch the 3 chunks most similar to the question
docs = vectordb.similarity_search(question,k=3)
# NOTE: bare expression in the middle of the cell; its value is not used
len(docs)
print(docs)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [None]:
# result = qa_chain({"query": question})

In [None]:
# result["result"]

In [None]:
from langchain.prompts import PromptTemplate

# Build the QA prompt: {context} and {question} are the two template
# variables declared in input_variables below.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)


In [None]:
# question = "Is probability a class topic?"
# result = qa_chain({"query": question})

In [None]:
# result["result"]

## Chat

In [None]:
from langchain.memory import ConversationBufferMemory
# Conversation memory stored under the "chat_history" key, returning
# message objects (return_messages=True).
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [None]:
# NOTE(review): ConversationalRetrievalChain is imported but the chain built
# below is a RetrievalQA - confirm which one is intended.
from langchain.chains import ConversationalRetrievalChain
# Replaces the self-query retriever with the vector store's default retriever
retriever=vectordb.as_retriever()
# NOTE(review): with return_source_documents=True the chain returns more than
# one output key; the memory object may need an explicit output_key to know
# which one to store - confirm before invoking this chain.
qa = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}

)

In [None]:
# question = "Is immunity a article topic?"
# result = qa_chain({"query": question})
# result['result']

In [None]:
# question = "why are those prerequesites needed to understand this topic?"
# result = qa_chain({"query": question})
# result['result']

## Analyser test

In [None]:
from langchain.memory import ConversationBufferMemory
# New ConversationBufferMemory instance for the analyser test, rebinding
# `memory`; same settings as the one created in the Chat section.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [None]:
# Sample quiz item used to exercise the analyser prompt
article = "1f90a31355e180e376a2a4f420ca51970a772882.pdf"
quizz_question = "<p>Le risque alpha attribué à la seconde analyse intermédiaire pour la survie globale est de 0,005 en unilatéral. Il correspond à un risque alpha bilatéral de  (une seule réponse exacte) :</p>"
possible_answers = "<p>0,01</p>; <p>0,0025</p>; <p>0,05</p>;<p>0,1</p>;<p>0,005</p>"
answer_key = "0,01"

# Prompt template; the {placeholders} are filled in just below.
question_template = """The following question is based on the article: {article}. 
The question statement, possible answers, and the correct answer are defined as follows:
Question: {quizz_question}
Possible answers: {possible_answers}
Answer key: {answer_key}

Please analyze the entire article {article} and determine as accurately as possible which section of the article this question pertains to. Also, justify your choice."""

# A plain triple-quoted string does not interpolate anything, so the literal
# "{article}" etc. used to be sent to the model. Substitute the values explicitly.
question = question_template.format(
    article=article,
    quizz_question=quizz_question,
    possible_answers=possible_answers,
    answer_key=answer_key,
)

In [None]:
# Build the QA prompt (same text as the template defined in the Question
# Answering section); {context} and {question} are its two variables.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

In [None]:
# Plain vector-store retriever and a RetrievalQA chain using the custom
# prompt; source documents are not returned here, only the answer.
retriever=vectordb.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=False,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}

)

In [None]:
result = qa({"query": question}, return_only_outputs=True)
result['result']

In [None]:
# Tally of how often each article section is mentioned, all starting at zero.
section_counts = dict.fromkeys(
    (
        "Introduction",
        "Background",
        "Methods",
        "Results",
        "Discussion",
        "Conclusion",
    ),
    0,
)

In [None]:
# result['result'] is used as a single answer string elsewhere in this
# notebook; looping over it directly walks it character by character, so no
# section name could ever match. Wrap a lone string in a list so each complete
# answer is inspected once.
raw_output = result['result']
outputs = [raw_output] if isinstance(raw_output, str) else raw_output

for output in outputs:
    # "Introduction" and "Background" are counted together
    if "Introduction" in output or "Background" in output:
        section_counts["Introduction"] += 1
        section_counts["Background"] += 1
    if "Methods" in output:
        section_counts["Methods"] += 1
    if "Results" in output:
        section_counts["Results"] += 1
    if "Discussion" in output:
        section_counts["Discussion"] += 1
    if "Conclusion" in output:
        section_counts["Conclusion"] += 1

In [None]:
print(section_counts)

In [None]:
from transformers import pipeline

# Zero-shot text-classification pipeline used to map the answer onto a section
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate article sections, used as the classification labels
labels = ["Introduction", "Methods", "Results", "Discussion", "Conclusion"]

# Text to classify: the answer produced by the QA chain
text = result['result']

classification = classifier(text, labels)

# Take the first entry of the returned labels as the predicted section
predicted_section = classification['labels'][0]

# Fresh tally with one zeroed entry per label, then count the prediction
section_counts = {label: 0 for label in labels}
section_counts[predicted_section] += 1

# Show the predicted section and the updated counts
print(f"Predicted section: {predicted_section}")
print("Section counts:", section_counts)


## Analyser

In [None]:
questions_data = [
    {"id": "112338", "questiontext": "<p>Cette étude vise à démontrer la supériorité du pembrolizumab (une ou plusieurs réponses possibles) :</p>"},
    {"id": "112339", "questiontext": "<p>L'allocation aléatoire (randomisation) du traitement à l'étude vise à (une seule réponse possible) :</p>"},
    {"id": "112340", "questiontext": "<p>L’absence de double insu (une ou plusieurs réponses possibles) :</p>"},
    {"id": "112341", "questiontext": "<p>Parmi les critères de jugement étudiés dans cet essai, lequel est le plus cliniquement pertinent ?</p>"},
    {"id": "112342", "questiontext": "<p>Dans cette étude, les sources de multiplicité de comparaisons statistiques sont (une ou plusieurs réponses possibles) :</p>"},
    {"id": "112343", "questiontext": "<p>Le risque alpha attribué à la seconde analyse intermédiaire pour la survie globale est de 0,005 en unilatéral. Il correspond à un risque alpha bilatéral de (une seule réponse exacte) :</p>"},
    {"id": "112344", "questiontext": "<p>Parmi les propositions suivantes concernant la première analyse intermédiaire, laquelle est exacte ?</p>"},
    {"id": "112345", "questiontext": "<p>La deuxième analyse intermédiaire devait avoir lieu (une ou plusieurs réponses possibles) :</p>"},
    {"id": "112346", "questiontext": "<p>Dans la figure 1, il existe un nombre important de censures (identifiées par la zone grisée au départ de la courbe) dans le groupe contrôle « ipilimumab » au J0. Parmi les propositions suivantes concernant ces censures, laquelle (lesquelles) est (sont) exacte(s) ?</p>"},
    {"id": "112347", "questiontext": "<p>Le résultat obtenu sur la survie globale avec l’administration du pembrolizumab toutes les 2 semaines (une ou plusieurs réponses possibles) :</p>"},
    {"id": "112348", "questiontext": "<p>A l’issue de la 2<sup>ème</sup> analyse intermédiaire (une ou plusieurs réponses possibles) :</p>"},
    {"id": "112349", "questiontext": "<p>Parmi les propositions suivantes concernant la figure 1 panneau B, laquelle (lesquelles) est (sont) exacte(s) ?</p>"},
    {"id": "112350", "questiontext": "<p>Parmi les propositions suivantes concernant les analyses en sous-groupes sur la survie globale (figure 2 panneau b), laquelle (lesquelles) est (sont) exacte(s) ?</p>"},
    {"id": "112351", "questiontext": "<p>La limite la plus importante de cette étude est (une seule réponse) :</p>"},
    {"id": "112352", "questiontext": "<p>Les résultats de cet essai incitent à proposer chez les patients présentant un mélanome au stade avancé (une seule réponse possible) :</p>"}
]

# Answer data
answers_data = [
    {"id": "502997", "question": "112338", "answer": "<p>toutes les 2 semaines sur la survie globale comparativement à l’ipilimumab</p>", "fraction": "0.5000000"},
    {"id": "502998", "question": "112338", "answer": "<p>toutes les 3 semaines sur la survie globale comparativement à l’ipilimumab</p>", "fraction": "0.5000000"},
    {"id": "502999", "question": "112338", "answer": "<p>toutes les 2 semaines sur la survie sans progression comparativement à l’ipilimumab</p>", "fraction": "0.5000000"},
    {"id": "503000", "question": "112338", "answer": "<p>toutes les 3 semaines sur la survie sans progression comparativement à l’ipilimumab</p>", "fraction": "0.5000000"},
    {"id": "503001", "question": "112338", "answer": "<p>toutes les 2 semaines sur la survie sans progression comparativement au pembrolizumab toutes les 3 semaines</p>", "fraction": "0.0000000"},
    {"id": "503002", "question": "112339", "answer": "<p>limiter les fluctuations d'échantillonnage</p>", "fraction": "0.0000000"},
    {"id": "503003", "question": "112339", "answer": "<p>augmenter la puissance statistique de l'essai</p>", "fraction": "0.0000000"},
    {"id": "503004", "question": "112339", "answer": "<p>rendre l’attribution du traitement indépendante des caractéristiques des patients</p>", "fraction": "1.0000000"},
    {"id": "503005", "question": "112339", "answer": "<p>contrôler le risque d'erreur statistique de 1ère espèce (alpha)</p>", "fraction": "0.0000000"},
    {"id": "503006", "question": "112339", "answer": "<p>maintenir la comparabilité des trois groupes tout au long de l'essai</p>", "fraction": "0.0000000"},
    {"id": "503007", "question": "112340", "answer": "<p>pouvait influencer la mesure du critère de survie globale</p>", "fraction": "0.0000000"},
    {"id": "503008", "question": "112340", "answer": "<p>pouvait influencer la mesure du critère de survie sans progression</p>", "fraction": "0.5000000"},
    {"id": "503009", "question": "112340", "answer": "<p>pouvait influencer la prise en charge des patients </p>", "fraction": "0.5000000"},
    {"id": "503010", "question": "112340", "answer": "<p>pouvait diminuer la puissance de l’étude</p>", "fraction": "0.0000000"},
    {"id": "503011", "question": "112340", "answer": "<p>remet en cause la validité interne de cet essai</p>", "fraction": "0.5000000"},
    {"id": "503012", "question": "112341", "answer": "<p>taux de réponse au traitement</p>", "fraction": "0.0000000"},
    {"id": "503013", "question": "112341", "answer": "<p>survie globale</p>", "fraction": "1.0000000"},
    {"id": "503014", "question": "112341", "answer": "<p>survie sans progression</p>", "fraction": "0.0000000"},
    {"id": "503015", "question": "112341", "answer": "<p>durée de la réponse</p>", "fraction": "0.0000000"},
    {"id": "503016", "question": "112341", "answer": "<p>taux de survie sans progression à 6 mois</p>", "fraction": "0.0000000"},
    {"id": "503017", "question": "112342", "answer": "<p>duplicité des critères de jugement principaux : survie sans progression et survie globale </p>", "fraction": "0.5000000"},
    {"id": "503018", "question": "112342", "answer": "<p>réalisation d’analyses intermédiaires</p>", "fraction": "0.5000000"},
    {"id": "503019", "question": "112342", "answer": "<p>utilisation d’un critère composite</p>", "fraction": "0.0000000"},
    {"id": "503020", "question": "112342", "answer": "<p>évaluation de 2 posologies du pembrolizumab</p>", "fraction": "0.5000000"},
    {"id": "503021", "question": "112342", "answer": "<p>participation à l’étude de plusieurs centres</p>", "fraction": "0.0000000"},
    {"id": "503022", "question": "112343", "answer": "<p>0,01</p>", "fraction": "1.0000000"},
    {"id": "503023", "question": "112343", "answer": "<p>0,0025</p>", "fraction": "0.0000000"},
    {"id": "503024", "question": "112343", "answer": "<p>0,05</p>", "fraction": "0.0000000"},
    {"id": "503025", "question": "112343", "answer": "<p>0,1</p>", "fraction": "0.0000000"},
    {"id": "503026", "question": "112343", "answer": "<p>0,005</p>", "fraction": "0.0000000"},
    {"id": "503027", "question": "112344", "answer": "<p>la supériorité du pembrolizumab sur l’ipilimumab est observée pour la survie globale</p>", "fraction": "1.0000000"},
    {"id": "503028", "question": "112344", "answer": "<p>la non-infériorité du pembrolizumab sur l’ipilimumab est observée pour la survie globale</p>", "fraction": "0.0000000"},
    {"id": "503029", "question": "112344", "answer": "<p>la supériorité du pembrolizumab sur l’ipilimumab est observée pour la survie sans progression</p>", "fraction": "0.0000000"},
    {"id": "503030", "question": "112344", "answer": "<p>la non-infériorité du pembrolizumab sur l’ipilimumab est observée pour la survie sans progression</p>", "fraction": "0.0000000"},
    {"id": "503031", "question": "112344", "answer": "<p>aucune de ces réponses</p>", "fraction": "0.0000000"},
    {"id": "503032", "question": "112345", "answer": "<p>lorsque 80 % des événements de survie globale avaient été observés </p>", "fraction": "0.5000000"},
    {"id": "503033", "question": "112345", "answer": "<p>lorsque 80 % des événements de survie sans progression avaient été observés</p>", "fraction": "0.0000000"},
    {"id": "503034", "question": "112345", "answer": "<p>lorsque 90 % des événements de survie globale avaient été observés</p>", "fraction": "0.0000000"},
    {"id": "503035", "question": "112345", "answer": "<p>lorsque 90 % des événements de survie sans progression avaient été observés</p>", "fraction": "0.5000000"},
    {"id": "503036", "question": "112345", "answer": "<p>lorsque 100 % des événements de survie globale avaient été observés</p>", "fraction": "0.0000000"},
    {"id": "503037", "question": "112346", "answer": "<p>elles correspondent aux sorties d’étude avant la première administration du traitement </p>", "fraction": "1.0000000"},
    {"id": "503038", "question": "112346", "answer": "<p>elles correspondent à des sorties d’étude à l’initiative des patients</p>", "fraction": "0.0000000"},
    {"id": "503039", "question": "112346", "answer": "<p>elles correspondent à des sorties d’étude en rapport avec un effet indésirable du traitement </p>", "fraction": "0.0000000"},
    {"id": "503040", "question": "112346", "answer": "<p>elles correspondent à des événements de progression de la maladie</p>", "fraction": "0.0000000"},
    {"id": "503041", "question": "112346", "answer": "<p>elles ne remettent pas en cause la validité des résultats observés dans cet essai</p>", "fraction": "0.5000000"},
    {"id": "503042", "question": "112347", "answer": "<p>était en faveur du pembrolizumab comparativement à l’ipilimumab </p>", "fraction": "1.0000000"},
    {"id": "503043", "question": "112347", "answer": "<p>ne permettait pas de conclure à une différence avec l’ipilimumab</p>", "fraction": "0.0000000"},
    {"id": "503044", "question": "112347", "answer": "<p>était en faveur de l’ipilimumab comparativement au pembrolizumab</p>", "fraction": "0.0000000"},
    {"id": "503045", "question": "112347", "answer": "<p>était en faveur du pembrolizumab toutes les 3 semaines comparativement à l’ipilimumab</p>", "fraction": "0.0000000"},
    {"id": "503046", "question": "112347", "answer": "<p>était en faveur du pembrolizumab toutes les 3 semaines comparativement au pembrolizumab toutes les 2 semaines</p>", "fraction": "0.0000000"},
    {"id": "503047", "question": "112348", "answer": "<p>l’intervalle de confiance de la survie globale du pembrolizumab toutes les 3 semaines était compris entre 9,4 et 17,3 mois </p>", "fraction": "1.0000000"},
    {"id": "503048", "question": "112348", "answer": "<p>le nombre de décès observés sous pembrolizumab toutes les 3 semaines était de 70</p>", "fraction": "1.0000000"},
    {"id": "503049", "question": "112348", "answer": "<p>le nombre de décès observés sous pembrolizumab toutes les 2 semaines était de 70</p>", "fraction": "0.0000000"},
    {"id": "503050", "question": "112348", "answer": "<p>le nombre de décès observés sous ipilimumab était de 74</p>", "fraction": "1.0000000"},
    {"id": "503051", "question": "112348", "answer": "<p>le rapport de risque (HR) était en faveur de l’ipilimumab comparativement au pembrolizumab toutes les 3 semaines</p>", "fraction": "0.0000000"},
    {"id": "503052", "question": "112349", "answer": "<p>la différence de survie globale entre pembrolizumab toutes les 3 semaines et ipilimumab était statistiquement significative</p>", "fraction": "1.0000000"},
    {"id": "503053", "question": "112349", "answer": "<p>la différence de survie globale entre pembrolizumab toutes les 3 semaines et ipilimumab n’était pas statistiquement significative</p>", "fraction": "0.0000000"},
    {"id": "503054", "question": "112349", "answer": "<p>la différence de survie globale entre pembrolizumab toutes les 3 semaines et ipilimumab ne peut pas être estimée</p>", "fraction": "0.0000000"},
    {"id": "503055", "question": "112349", "answer": "<p>la différence de survie sans progression entre pembrolizumab toutes les 3 semaines et ipilimumab était statistiquement significative</p>", "fraction": "0.0000000"},
    {"id": "503056", "question": "112349", "answer": "<p>la différence de survie sans progression entre pembrolizumab toutes les 3 semaines et ipilimumab n’était pas statistiquement significative</p>", "fraction": "0.0000000"},
    {"id": "503057", "question": "112350", "answer": "<p>le sexe féminin est associé à un avantage significatif en faveur du pembrolizumab </p>", "fraction": "1.0000000"},
    {"id": "503058", "question": "112350", "answer": "<p>les patients de plus de 65 ans sont associés à un avantage significatif en faveur du pembrolizumab</p>", "fraction": "0.0000000"},
    {"id": "503059", "question": "112350", "answer": "<p>le sexe masculin est associé à un avantage significatif en faveur de l’ipilimumab</p>", "fraction": "0.0000000"},
    {"id": "503060", "question": "112350", "answer": "<p>les patients ayant un mauvais état général (PS &gt; 1) sont associés à un avantage significatif en faveur de l’ipilimumab</p>", "fraction": "0.0000000"},
    {"id": "503061", "question": "112350", "answer": "<p>les patients ayant un bon état général (PS = 0) sont associés à un avantage significatif en faveur du pembrolizumab</p>", "fraction": "1.0000000"},
    {"id": "503062", "question": "112351", "answer": "<p>l’absence de double insu</p>", "fraction": "1.0000000"},
    {"id": "503063", "question": "112351", "answer": "<p>l’absence d’évaluation de la qualité de vie</p>", "fraction": "0.0000000"},
    {"id": "503064", "question": "112351", "answer": "<p>l’absence de cécité des évaluateurs</p>", "fraction": "0.0000000"},
    {"id": "503065", "question": "112351", "answer": "<p>l’absence de stratification des patients selon l’IMC</p>", "fraction": "0.0000000"},
    {"id": "503066", "question": "112351", "answer": "<p>l’absence de comité de surveillance indépendant</p>", "fraction": "0.0000000"},
    {"id": "503067", "question": "112352", "answer": "<p>une chimiothérapie systémique d’emblée dans tous les mélanomes métastatiques</p>", "fraction": "0.0000000"},
    {"id": "503068", "question": "112352", "answer": "<p>une stratégie de combinaison anti-PD-1 et anti-CTLA-4 chez les patients naïfs de traitement</p>", "fraction": "1.0000000"},
    {"id": "503069", "question": "112352", "answer": "<p>l’ajout d’un traitement adjuvant par l’ipilimumab après résection d’un mélanome de stade III</p>", "fraction": "0.0000000"},
    {"id": "503070", "question": "112352", "answer": "<p>une chirurgie systématique des métastases pulmonaires uniques</p>", "fraction": "0.0000000"},
    {"id": "503071", "question": "112352", "answer": "<p>l’ajout d’une thérapie ciblée BRAF/MEK dans tous les mélanomes métastatiques</p>", "fraction": "0.0000000"}
]

In [None]:
def transform_data(
    questions_data,
    answers_data,
    article_id="1f90a31355e180e376a2a4f420ca51970a772882.pdf",
):
    """Combine question and answer records into one prompt-ready dict per question.

    Fixes two defects of the previous version: ``possible_answers`` used to
    contain only the answers with a positive fraction (i.e. the correct
    ones), and ``answer_key`` was read from a key the question records do
    not carry, so it was always empty.

    Args:
        questions_data: List of dicts with at least ``"id"`` and
            ``"questiontext"``. An optional non-empty ``"answer_key"`` entry
            takes precedence over the derived key.
        answers_data: List of dicts with ``"question"`` (the id of the
            question they belong to), ``"answer"`` and ``"fraction"`` (a
            numeric string; a value greater than 0 marks a correct answer).
        article_id: File name of the article the questions refer to.

    Returns:
        A list with one dict per question, in input order, with the keys
        ``"article"``, ``"quizz_question"``, ``"possible_answers"`` (all
        answers joined by ``";"``) and ``"answer_key"`` (correct answers
        joined by ``";"``).
    """
    # Group the answer records by the id of the question they belong to
    answers_by_question = {}
    for answer in answers_data:
        answers_by_question.setdefault(answer["question"], []).append(answer)

    questions = []
    for question in questions_data:
        answers = answers_by_question.get(question["id"], [])

        # Every proposed answer is a possible answer ...
        possible_answers_str = ";".join(a["answer"] for a in answers)
        # ... while only those with a positive fraction form the answer key
        correct_answers_str = ";".join(
            a["answer"] for a in answers if float(a["fraction"]) > 0
        )

        questions.append({
            "article": article_id,
            "quizz_question": question["questiontext"],
            "possible_answers": possible_answers_str,
            "answer_key": question.get("answer_key") or correct_answers_str,
        })

    return questions

# Use the function to transform the raw question/answer records
questions = transform_data(questions_data, answers_data)

print(questions)

In [None]:
# Pick one transformed question (index 5) to run through the analyser
ch_question = questions[5]
# NOTE: `question` and `respostas_possiveis` duplicate the values bound to
# quizz_question / possible_answers below.
question = ch_question['quizz_question']
respostas_possiveis = ch_question['possible_answers']

# Values referenced by the placeholders of the analyser prompt
article = "1f90a31355e180e376a2a4f420ca51970a772882.pdf"
quizz_question = ch_question['quizz_question']
possible_answers = ch_question['possible_answers']

In [61]:
from langchain.memory import ConversationBufferMemory
# New ConversationBufferMemory instance for this run, rebinding `memory`
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# Prompt template; the {placeholders} are filled in just below.
question_template = """The following question is based on the article: {article}. 
The question statement, possible answers, and the correct answer are defined as follows:
Question: {quizz_question}
Possible answers: {possible_answers}

Please analyze the entire article {article} and determine as accurately as possible which section of the article this question pertains to. Also, justify your choice."""

# A plain triple-quoted string does not interpolate anything, so the literal
# "{article}" etc. used to be sent to the model. Substitute the values explicitly.
question = question_template.format(
    article=article,
    quizz_question=quizz_question,
    possible_answers=possible_answers,
)

# Build the QA prompt; {context} and {question} are its two template variables
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Plain vector-store retriever and a RetrievalQA chain using the custom prompt
retriever=vectordb.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=False,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}

)

# Run the chain on the analyser question
result = qa({"query": question}, return_only_outputs=True)
# NOTE: bare expression in the middle of the cell; its value is not used
result['result']

from transformers import pipeline

# Zero-shot text-classification pipeline used to map the answer onto a section
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate article sections, used as the classification labels
labels = ["Introduction", "Methods", "Results", "Discussion", "Conclusion"]

# Text to classify: the answer produced by the QA chain
text = result['result']

classification = classifier(text, labels)

# Take the first entry of the returned labels as the predicted section
predicted_section = classification['labels'][0]

# Fresh tally with one zeroed entry per label, then count the prediction
section_counts = {label: 0 for label in labels}
section_counts[predicted_section] += 1

# Show the predicted section and the updated counts
print(f"Predicted section: {predicted_section}")
print("Section counts:", section_counts)


: 