# Implementing CRAGS #

 - Load URLS
 - Retriever 
 - Get relevant data
 - Check top data, if not ideal perform a web search


In [2]:
### API Keys ###

import getpass
import os

# (optional) LangSmith to inspect inside your chain or agent.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "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"

In [4]:
### Load Model ###
import os
from dotenv import load_dotenv, find_dotenv
from langchain_together import ChatTogether

load_dotenv()

# client = Together(api_key = os.getenv("TOGETHER_API_KEY"))

#load model
model = ChatTogether(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    temperature=0,
    max_tokens=320,
    # top_k=50,
    together_api_key= os.getenv("TOGETHER_API_KEY")
)

In [5]:
from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_together import TogetherEmbeddings


### Create Index using an the data folder:
file_url = open('/data/urls/links.txt')
url_txt = file_url.read()
urls = url_txt.split("\n")

docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, chunk_overlap=50
)

doc_splits = text_splitter.split_documents(docs_list)
# check if docs loaded
#print(docs[2]) 

vectorstore = FAISS.from_documents(documents=doc_splits, 
                                    embedding = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval")
                                    )

retriever = vectorstore.as_retriever()


USER_AGENT environment variable not set, consider setting it to identify your requests.


### Grades the document until relevant ###

 - choose question 
 - retrieve relevant docs, choose the top one
 - check relevancy of top n
 - pass rel_docs to generate

In [10]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Data model
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# LLM with function call
llm = ChatTogether(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    temperature=0,
)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

retrieval_grader = grade_prompt | structured_llm_grader

# total 443
# question = "What are the two common types of skin cancer? What is Non Melanoma"

# testing websearch , question has no relevant docs
question = "is Cryosurgery invasive"

docs = retriever.get_relevant_documents(question)

rel_docs = ''

for i in range(min(3, len(docs))):
    doc_txt = docs[i].page_content
    final = retrieval_grader.invoke({"question": question, "document": doc_txt}) 
    print("final " + str(final)) 
    if(str(final) == "binary_score='yes'" ):
        rel_docs=doc_txt
        # print(final)
        break


# no relevant docs then perform a websearch


final binary_score='no'
final binary_score='no'
final binary_score='no'


### Reduced question: ###

**Input**   

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: What are the two common types of skin cancer? What is Non Melanoma 
Context: lump is often tender to touch and may bleed. It may develop into an ulcer.Pre-cancerous growthsThere are pre-cancerous growths that may develop into non-melanoma skin cancer if they are not treated.These include:Bowen's diseaseActinic keratosesBowen's diseaseBowen's disease is also called squamous cell carcinoma in situ. It develops slowly and is easy to treat.The main sign is a red, scaly patch on the skin. It may be itchy. It can appear on any area of the skin. In women, it is often found on the lower legs.It can develop into SCC.Actinic keratosesActinic keratoses are dry, scaly patches of skin. They are also known as solar keratoses. They are caused by the skin being exposed to the sun for a long time.You may have a lot of these patches. They can be pink, red or brown. They can vary in size from a few millimetres across to a few centimetres.They often appear on parts of your body that have been in the sun a lot. For example, the back of your hands or your scalp.The affected skin can sometimes become very thick. The patches can look like small horns or spikes.There is a small risk of them developing into SCC.Diagnosing non-melanoma skin cancerYour GP can examine your skin for signs of skin   

**Output**   

The two common types of skin cancer are Melanoma and Non-Melanoma. Non-Melanoma skin cancer is further divided into two main types: Basal Cell Carcinoma and Squamous Cell Carcinoma (SCC).

 - Is the doc relevant, 
 - else perform a web search


In [15]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from websearch import WebSearch
import io
import requests
from PyPDF2  import PdfReader


# convert pdf to text
def pdf_2_text(url):
    print(url.page_content)
    r = requests.get(url.page_content)
    f = io.BytesIO(r.content)

    reader = PdfReader(f)
    contents = reader.getPage(0).extractText().split('\n')
    return contents

# define web-search
def web_search(question):
    web = WebSearch(question)
    pdfs = web.pdf
    search_urls = [pdf for pdf in pdfs[:5]]
    # print("urls: " + urls[0])

    search_docs = [pdf_2_text(WebBaseLoader(url).load())for url in search_urls]

    search_docs_list = [item for sublist in search_docs for item in sublist]

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=300, chunk_overlap=50
    )

    search_doc_splits = text_splitter.split_documents(search_docs_list)
    ## print(doc_splits)
    return search_doc_splits


prompt = hub.pull("rlm/rag-prompt")

# Chain
rag_chain = (
    prompt
    | model
    | StrOutputParser()
)


In [16]:
### generates the prompt

# If there is no docs then perform web search
if(rel_docs != ''):
    #print("here")
    output = rag_chain.invoke({"context": rel_docs, "question": question})
    print(output)
else:
    print("here now")
    rag_in = web_search(question)
    print(rag_in)
    #indexing 
    #vectorstore = FAISS.from_documents(documents=rag_in, 
    #                                embedding = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval")
    #                                )
#
    #retriever = vectorstore.as_retriever()

    #output = rag_chain.invoke({"context": retriever, "question": question})
    #print(output)

here now


AttributeError: 'list' object has no attribute 'source'

### TESTING ###

In [34]:
question

'is Cryosurgery invasive'

In [None]:
web = WebSearch(question)
pdfs = web.pdf
search_urls = [pdf for pdf in pdfs[:5]]

check = search_urls[0]
pdf_loaded = WebBaseLoader(check).load()
print(pdf_loaded[0].page_content)


In [1]:
print(pdf_loaded[0].page_content)

#r = requests.get(pdf_loaded)
#f = io.BytesIO(r.content)
#
#reader = PdfReader(f)
#contents = reader.getPage(0).extractText().split('\n')

# search_docs = [pdf_2_text(WebBaseLoader(url).load())for url in search_urls]
# search_docs_list = [item for sublist in search_docs for item in sublist]
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=300, chunk_overlap=50
# )
# search_doc_splits = text_splitter.split_documents(search_docs_list)

NameError: name 'pdf_loaded' is not defined