In [2]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('Breast_cancer_prediction_research_paper.pdf')
docs = loader.load()
docs

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'PyPDF', 'creationdate': '2021-09-04T13:45:22+05:30', 'authoritativedomain[1]': 'sciencedirect.com', 'authoritativedomain[2]': 'elsevier.com', 'crossmarkdomains[1]': 'sciencedirect.com', 'crossmarkdomains[2]': 'elsevier.com', 'crossmarkdomainexclusive': '2010-04-23', 'crossmarkmajorversiondate': '2010-04-23', 'elsevierwebpdfspecifications': '7.0', 'moddate': '2021-09-07T18:54:41+05:30', 'doi': '10.1016/j.procs.2021.07.062', 'robots': 'noindex', 'source': 'Breast_cancer_prediction_research_paper.pdf', 'total_pages': 6, 'page': 0, 'page_label': '487'}, page_content='ScienceDirect\nAvailable online at www.sciencedirect.com\nProcedia Computer Science 191 (2021) 487–492\n1877-0509 © 2021 The Authors. Published by Elsevier B.V .\nThis is an open access article under the CC BY-NC-ND license ( https://creativecommons.org/licenses/by-nc-nd/4.0 )\nPeer-review under responsibility of the Conference Program Chair.\n10.1016/j.proc

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
text_docs = text_splitter.split_documents(docs)

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(text_docs, embedding)

In [12]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 15}
)

In [13]:
from langchain_community.llms import Ollama
llm = Ollama(model="llama3.2", temperature=0.2)

In [14]:
from langchain_core.prompts import ChatPromptTemplate
system_prompt = (
    "You are a helpful assistant answering questions using the provided context. "
    "If the answer is not contained in the context, respond with \"I don't know.\" "
    "Keep answers concise, max three sentences.\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [15]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [14]:
response = rag_chain.invoke({"input": "who all are the authors of this research paper?"})
print(response["answer"])

The authors of this research paper are:

1. Mohammed Amine Naji
2. Sanaa El Filali
3. Kawtar Aarika
4. EL Habib Benlahmard
5. Rachida Ait Abdelouhahide
6. Olivier Debauchef


In [15]:
response = rag_chain.invoke({"input": "How many breast cancer cases were in the dataset?"})
print(response["answer"])

The Breast Cancer Wisconsin Diagnostic dataset contains 569 instances (357 Benign and 212 Malignant).


In [16]:
response = rag_chain.invoke({"input": "summarize the document?"})
print(response["answer"])

The document discusses the use of machine learning algorithms in predicting breast cancer diagnosis. The authors evaluate the performance of several classifiers, including k-NN, SVM, Random Forest, Logistic regression, and C4.5, using accuracy, precision, sensitivity, and F1 score as metrics. They report that the SVM algorithm achieved the highest accuracy (99.10%) in predicting breast cancer diagnosis.
