# Implementing CRAGS #

 - Load URLS
 - Retriever 
 - Get relevant data
 - Check top data, if not ideal perform a web search


In [35]:
### API Keys ###

import getpass
import os

# (optional) LangSmith to inspect inside your chain or agent.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "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"

In [36]:
### Load Model ###
import os
from dotenv import load_dotenv, find_dotenv
from langchain_together import ChatTogether

load_dotenv()

# client = Together(api_key = os.getenv("TOGETHER_API_KEY"))

#load model
model = ChatTogether(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    temperature=0,
    max_tokens=320,
    # top_k=50,
    together_api_key= os.getenv("TOGETHER_API_KEY")
)

In [38]:
from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_together import TogetherEmbeddings


### Create Index using an the data folder:
file_url = open('data/urls/links.txt')
url_txt = file_url.read()
urls = url_txt.split("\n")

docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=200, chunk_overlap=50
)

doc_splits = text_splitter.split_documents(docs_list)

vectorstore = FAISS.from_documents(documents=doc_splits, 
                                    embedding = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval")
                                    )

retriever = vectorstore.as_retriever()


In [35]:
## add in commonly asked question folder:
file_questions = "data/text_docs/likes_question.txt"

with open(file_questions, "r") as file:
    lines = file.readlines()

ques_docs=[]

for line in lines:
    # l_split = line.split(", ")
    # # number of times used and number of likes
    # no_used = int(l_split[0].split("=")[1])
    # no_likes = int(l_split[1].split("=")[1])
    prompt = line.split(", response=")[0].split("prompt=")[1]
    response = line.split(", response=")[1]

    ques_docs.append({'prompt':prompt, 'response':response})

retriever.add_documents(ques_docs)

{'prompt': '"What are the two common types of skin cancer? What is Non Melanoma"', 'response': '"The two common types of skin cancer are Melanoma and Non-Melanoma.  Non-Melanoma skin cancer is further divided into two main types:  Basal Cell Carcinoma and Squamous Cell Carcinoma (SCC)."'}


### Grades the document until relevant ###

 - choose question 
 - retrieve relevant docs, choose the top one
 - check relevancy of top n
 - pass rel_docs to generate

In [12]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Data model
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# LLM with function call
llm = ChatTogether(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    temperature=0,
)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

retrieval_grader = grade_prompt | structured_llm_grader

rel_docs = ''

# question = "What are the two common types of skin cancer? What is Non Melanoma"

# testing websearch , question has no relevant docs
question = "Is Cryosurgery invasive?"


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
### Check if relevant docs are necessary

docs = retriever.get_relevant_documents(question)

for i in range(min(3, len(docs))):
    doc_txt = docs[i].page_content
    final = retrieval_grader.invoke({"question": question, "document": doc_txt}) 
    print("final " + str(final)) 
    if(str(final) == "binary_score='yes'" ):
        rel_docs=doc_txt
        # print(final)
        break


# no relevant docs then perform a websearch

  docs = retriever.get_relevant_documents(question)


final binary_score='no'
final binary_score='no'


KeyboardInterrupt: 

### Reduced question: ###

**Input**   

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: What are the two common types of skin cancer? What is Non Melanoma 
Context: lump is often tender to touch and may bleed. It may develop into an ulcer.Pre-cancerous growthsThere are pre-cancerous growths that may develop into non-melanoma skin cancer if they are not treated.These include:Bowen's diseaseActinic keratosesBowen's diseaseBowen's disease is also called squamous cell carcinoma in situ. It develops slowly and is easy to treat.The main sign is a red, scaly patch on the skin. It may be itchy. It can appear on any area of the skin. In women, it is often found on the lower legs.It can develop into SCC.Actinic keratosesActinic keratoses are dry, scaly patches of skin. They are also known as solar keratoses. They are caused by the skin being exposed to the sun for a long time.You may have a lot of these patches. They can be pink, red or brown. They can vary in size from a few millimetres across to a few centimetres.They often appear on parts of your body that have been in the sun a lot. For example, the back of your hands or your scalp.The affected skin can sometimes become very thick. The patches can look like small horns or spikes.There is a small risk of them developing into SCC.Diagnosing non-melanoma skin cancerYour GP can examine your skin for signs of skin   

**Output**   

The two common types of skin cancer are Melanoma and Non-Melanoma. Non-Melanoma skin cancer is further divided into two main types: Basal Cell Carcinoma and Squamous Cell Carcinoma (SCC).

 - Is the doc relevant, 
 - else perform a web search


In [34]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from websearch import WebSearch
import io
import requests
from PyPDF2  import PdfReader



# define web-search
def web_search(question):
    web = WebSearch(question)
    search_urls = web.pages[:4]

    ### convert to texts 
    search_docs =[WebBaseLoader(url).load() for url in search_urls]
    search_docs_list =[item for sublist in search_docs for item in sublist]

    text_splitter =RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=200, chunk_overlap=40
    )

    rag_in =text_splitter.split_documents(search_docs_list)

    search_vectorstore = FAISS.from_documents(documents=rag_in, 
                                    embedding = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval")
                                    )

    search_retriever = search_vectorstore.as_retriever()

    return search_retriever


prompt = hub.pull("rlm/rag-prompt")

# Chain
rag_chain = (
    prompt
    | model
    | StrOutputParser()
)


In [35]:
from sklearn.metrics.pairwise import cosine_similarity

### gets the cosine similarities
def calculate_cosine_similarity(question, chunk):
    score = cosine_similarity([question], chunk)    
    return  score[0]


### generates the prompt

# If there is no docs then perform web search
if(rel_docs != ''):
    #print("here")
    output = rag_chain.invoke({"context": rel_docs, "question": question})
    print(output)
else:
    print("here now")
    search_retriever = web_search(question)
    search_documents = search_retriever.get_relevant_documents(question)

    print(search_documents)


    ### works but you need to remake the retriver, wastes too much time

    # indexing
    # vectorstore = FAISS.from_documents(documents=rag_in, 
    #                                 embedding = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval")
    #                                 )

    # retriever = vectorstore.as_retriever()
    # documents = retriever.get_relevant_documents(question)
    
    
    #output = rag_chain.invoke({"context": documents,"question": question})
    #print(output)

here now
[Document(metadata={'source': 'https://dermnetnz.org/topics/cryotherapy', 'title': 'Cryotherapy: Uses, Cautions, and Aftercare — DermNet', 'description': 'Cryotherapy is a minimally-invasive treatment that freezes skin surface lesions using extremely cold liquid or instruments (eg, liquid nitrogen).', 'language': 'en'}, page_content='Actinic keratoses often only require one freeze-thaw cycle with complete cure rates varying from 39%– 83%.\n\nSeborrheic keratoses may require longer treatment times and multiple freeze-thaw cycles if lesions are thicker.\n\nViral warts\nClearance rates of verrucous lesions can vary depending on the degree of hyperkeratosis and size of the wart.\nSeveral treatment sessions may be needed and the overall cure rate varies from 39% to 84% at three months.\nFavourable response rates have been reported with keratolytic pre-treatment.\nMalignant lesions: BCCs and SCCs\nCryosurgery is not a first-line treatment for cancerous lesions such as BCCs and SCCs,

In [40]:
urls = [ docs.metadata['source']  for docs in search_documents]
urls = list(dict.fromkeys(urls))

docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]
# print(docs_list)

doc_splits = text_splitter.split_documents(docs_list)

retriever.add_documents(doc_splits)
retriever.get_relevant_documents(question)

[Document(metadata={'source': 'https://dermnetnz.org/topics/cryotherapy', 'title': 'Cryotherapy: Uses, Cautions, and Aftercare — DermNet', 'description': 'Cryotherapy is a minimally-invasive treatment that freezes skin surface lesions using extremely cold liquid or instruments (eg, liquid nitrogen).', 'language': 'en'}, page_content="\n\n\n\n\n\n\n\nCryotherapy: Uses, Cautions, and Aftercare — DermNet\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch DermNet\n\nCtrlK\n\n\n\n\n\n\n\n\n\n\nAre you a healthcare professional\nGO TO DERMNET PRO\n\n\n\n\n\n\n\n\n\n\n\n                            Home\n                        \n\n\n\n                            Topics A-Z\n                        \n\n\n\n                            Images\n                        \n\n\n\n                            Cases\n                        \n\n\n\n                            Skin checker\n                      

KeyboardInterrupt: 

In [31]:
search_documents

[Document(metadata={'source': 'https://www2.hse.ie/conditions/skin-cancer-melanoma/', 'title': '\n            \n                Skin cancer (melanoma) overview\n            \n            \n                \n                    - HSE.ie\n                \n            \n        ', 'description': 'Melanoma skin cancer starts in cells in skin called melanocytes. It can spread to other organs in the body. Find out about melanoma', 'language': 'en'}, page_content='stand out among other moles.Most melanomas are new moles.Check your skin every few months for new moles or changes in existing moles. Spot any changes using the ABCDE of moles.Symptoms of melanomaTypes of melanomaThere are many different types of melanoma. Sometimes melanoma skin cancer is called malignant melanoma.'),
 Document(metadata={'source': 'https://www2.hse.ie/conditions/non-melanoma-skin-cancer/', 'title': '\n            \n                Skin cancer (non-melanoma) symptoms and diagnosis\n            \n            \n     

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_together import TogetherEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Define function to calculate cosine similarity
def calculate_cosine_similarity(question_embedding, chunk_embeddings):
    similarities = cosine_similarity([question_embedding], chunk_embeddings)
    return similarities[0]

# Main function to get the most relevant chunks
def get_relevant_chunks(question):
    # Perform web search and get document splits
    rag_in = web_search(question)
    
    # Initialize embedding model
    embedding_model = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval")
    
    # Calculate embeddings for the question and chunks
    question_embedding = embedding_model.embed_query(question)
    chunk_embeddings = [embedding_model.embed_query(chunk.page_content) for chunk in rag_in]
    
    # Calculate cosine similarities
    similarities = calculate_cosine_similarity(question_embedding, chunk_embeddings)
    
    # Rank chunks based on similarity scores
    ranked_chunks = sorted(zip(rag_in, similarities), key=lambda x: x[1], reverse=True)
    
    # Select top N chunks (e.g., top 5)
    top_chunks = [chunk for chunk, score in ranked_chunks[:5]]
    
    return top_chunks

# Example usage
question = "What are the symptoms of melanoma?"
top_chunks = get_relevant_chunks(question)
for chunk in top_chunks:
    print(chunk.page_content)

### TESTING ###

In [34]:
question

'is Cryosurgery invasive'

In [None]:
web = WebSearch(question)
pdfs = web.pdf
search_urls = [pdf for pdf in pdfs[:5]]

check = search_urls[0]
pdf_loaded = WebBaseLoader(check).load()
print(pdf_loaded[0].page_content)


In [29]:
### alternative method using a different websearch

from bs4 import BeautifulSoup
import requests
import os
from urllib.robotparser import RobotFileParser

def google_search(query, API_KEY, SEAERCH_ID):
    url = f"https://www.googleapis.com/customsearch/v1?key={API_KEY}&cx={SEAERCH_ID}&q={query}&start={1}"
    print("URLS:", url)
    response = requests.get(url)
    if response.status_code == 200:
        results = response.json()
        print(results)

        searches =  results['items']
        urls = []
        for items in searches[:3]:
            urls.append(items['link'])
        return urls
    else:
        return None
    
query = "What is cryosurgery?"

urls = google_search(query,api_key,search_engine)
print(urls)





URLS: https://www.googleapis.com/customsearch/v1?key=AIzaSyCMpJIU5d6I-L3jZgEvvzudsPFETXeTT6I&cx=d2361eda72e7a43f9&q=What is cryosurgery?&start=1
{'kind': 'customsearch#search', 'url': {'type': 'application/json', 'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'}, 'queries': {'request': [{'title': 'Google Custom Search - What is cryosurgery?', 'totalResults': '440000', 'search

In [31]:

def google_search(query, API_KEY, SEAERCH_ID):
    url = f"https://www.googleapis.com/customsearch/v1?key={API_KEY}&cx={SEAERCH_ID}&q={query}&start={1}"
    response = requests.get(url)
    if response.status_code == 200:
        urls = []
        for items in searches[:3]:
            urls.append(items['link'])
        return urls
    else:
        return None

In [39]:
### Create Index using an the data folder:
# Open and read the URL file
with open('data/urls/links.txt') as file_url:
    url_txt = file_url.read()

# Split the content into a list of URLs and filter out empty lines
urls = [url for url in url_txt.split("\n") if url.strip()]


docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

In [1]:
print(pdf_loaded[0].page_content)

#r = requests.get(pdf_loaded)
#f = io.BytesIO(r.content)
#
#reader = PdfReader(f)
#contents = reader.getPage(0).extractText().split('\n')

# search_docs = [pdf_2_text(WebBaseLoader(url).load())for url in search_urls]
# search_docs_list = [item for sublist in search_docs for item in sublist]
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=300, chunk_overlap=50
# )
# search_doc_splits = text_splitter.split_documents(search_docs_list)

NameError: name 'pdf_loaded' is not defined