## EXTRACT TEXT FROM PDF

In [39]:
import pdfplumber
import re

def clean_text_file(text):
    # Replace newline characters with spaces
    text = text.replace('\n', ' ')
    
    # Define a regex pattern to keep only letters, numbers, and spaces
    pattern = r'[^a-zA-Z0-9 -]'
    
    # Remove unwanted characters based on the pattern
    cleaned_text = re.sub(pattern, '', text)
    
    return cleaned_text

def save_text_to_file(cleaned_text, output_text_file = "TextFromPDF.txt"):
    """Save the extracted text to a text file."""
    with open(output_text_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)
    print(f"Text extracted from the PDF has been saved to '{output_text_file}'.")


def extract_text_from_pdf(pdf_path, output_text_file = "TextFromPDF.txt"):
    """Extract text from the given PDF file using pdfplumber."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    
    cleaned_text = clean_text_file(text)
    save_text_to_file(cleaned_text, output_text_file)
    return cleaned_text


In [40]:
Text_from_PDF = extract_text_from_pdf("uploads\colon-patient.pdf")

Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


Text extracted from the PDF has been saved to 'TextFromPDF.txt'.


## TEXT FROM PDF PAGES

In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from tqdm import tqdm
import re

In [2]:
def process_page(extracted_page):
   content = []


   # Get a sorted list of elements based on 
                # their Y-coordinate in reverse order
   elements = [element for element in extracted_page._objs]
   elements.sort(key=lambda a: a.y1, reverse=True)


   for i, element in enumerate(elements):
       # Extract text if the element is a text container 
       # and text extraction is enabled
       if isinstance(element, LTTextContainer):
           line_text = extract_text_and_normalize(element)
           content.append(line_text)

   # Combine and clean up the extracted content
   content = re.sub('\n+', '\n', ''.join(content))
   return content

def process_document(pdf_path, page_ids=None):
   extracted_pages = extract_pages(pdf_path, page_numbers=page_ids)

   page2content = {}
   # Process each extracted page
   for extracted_page in tqdm(extracted_pages):
       page_id = extracted_page.pageid
       content = process_page(extracted_page)
       page2content[page_id] = content
   return page2content


def extract_text_and_normalize(element):
   # Extract text from line and split it with new lines
   line_texts = element.get_text().split('\n')
   norm_text = ''
   for line_text in line_texts:
       line_text=line_text.strip()
       # empty strings after striping convert to newline character
       if not line_text:
           line_text = '\n'
       else:
           line_text = re.sub('\s+', ' ', line_text)
           # if the last character is not a letter or number,
                                # add newline character to a line
           if not re.search('[\w\d\,\-]', line_text[-1]):
               line_text+='\n'
           else:
               line_text+=' '
       # concatenate into single string
       norm_text+=line_text
   return norm_text

In [9]:
pdf_path = 'colon-patient.pdf'
page2content = process_document(pdf_path, page_ids=[19,20,23,12,26])
print(page2content)

0it [00:00, ?it/s]The PDF <_io.BufferedReader name='colon-patient.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
5it [00:00, 16.82it/s]


## NER 1 BY FAISAL BHAI

In [19]:
import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
 
# Load a scispaCy model
nlp = spacy.load("en_core_sci_md")

#nlp = spacy.load("en_core_sci_md", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
# Add components for linking entities to medical vocabularies
#nlp.add_pipe("abbreviation_detector")
#linker = nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [21]:
from spacy import displacy

text = page2content[3]
doc = nlp(text)

# Visualize the entities with displacy
displacy.render(doc, style="ent")


In [22]:
print(doc.ents)

(Treatment, non-metastatic cancer, » Chemotherapy, surgery, Chemotherapy, surgery, Side effects, Chemotherapy, medicine, kill, cancer cells, systemic therapy, Chemotherapy, intravenously, medicine, infused, bloodstream, vein, bloodstream, cells, body, Systemic therapy, kills, cancer cells, healthy cells, damage, healthy cells, hair loss, cracked skin, mouth sores, side effects, Managing, side effects, your care team, bothersome, side effects, nausea, vomiting, options, managing, effects of, treatment, Chemotherapy, cycles, treatment days, days, rest, your body, cycles, Cycles, length, drugs, Stage, information, supportive care, NCCN.org/patientguidelines, NCCN, Patient Guides for Cancer app, Chemotherapy, colectomy, stage 1 cancers, Stage, Chemotherapy, colectomy, stage 2A, 2B dMMR/MSI-H cancers, stage 2C, dMMR/MSI-H cancers, Chemotherapy, colectomy, stage, pMMR/MSS cancers, cancers, high risk, recurrence, treatment, features, risk factors, cancer, recurrence, chemotherapy, planned, re

## NER 2

### Split into 512 chunks for bert

In [5]:
def split_into_chunks(text, chunk_size=512):
    # Ensure the chunk_size is positive and non-zero
    if chunk_size <= 0:
        raise ValueError("Chunk size must be a positive integer")
    
    # Create a list of chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    
    return chunks

### Load Model

In [6]:
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          pipeline,
                          )

model_checkpoint = "SCISPACY_Models/scibert2"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=5,
                                                        id2label={0: 'O', 1: 'B-DRUG', 2: 'I-DRUG', 3: 'B-EFFECT', 4: 'I-EFFECT'} 
                                                        )                                                        
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model_pipeline = pipeline(task="ner", model=model, tokenizer=tokenizer)


  from .autonotebook import tqdm as notebook_tqdm


### NER label enitities

In [19]:
def Extract_drugs(chunks):
    all_ner_results = []

    for chunk in chunks:
        # Get NER results
        ner_results = model_pipeline(chunk)
        all_ner_results.extend(ner_results)
    
    # Combine sub-tokens into full words with a single label
    word_entities = []
    current_word = ""
    current_label = ""
    start_idx, end_idx = -1, -1

    # print(f"NER results: {all_ner_results}")

    for entity in all_ner_results:
        # if entity['word'].startswith("##") and entity['entity'] == "I-DRUG" and current_label == "B-DRUG":
        if entity['entity'] == "I-DRUG" and current_label == "B-DRUG":
            if entity['word'].startswith("##"):
                current_word += entity['word'][2:]  # Combine sub-token with the previous word
            else:
                current_word += entity['word']
            end_idx = entity['end']

        elif entity['entity'] == "I-EFFECT" and current_label == "B-EFFECT":
            if entity['word'].startswith("##"):
                current_word += entity['word'][2:]  # Combine sub-token with the previous word
            else:
                current_word += entity['word']
            end_idx = entity['end']

        else:
            if current_word:  # Save the previous word and its label
                word_entities.append({
                    "word": current_word,
                    "label": current_label,
                    "start": start_idx,
                    "end": end_idx
                })
            # Start a new word
            current_word = entity['word']
            current_label = entity['entity']
            start_idx = entity['start']
            end_idx = entity['end']

    # Append the last word
    if current_word:
        word_entities.append({
            "word": current_word,
            "label": current_label,
            "start": start_idx,
            "end": end_idx
        })

    # print(f"Entities found: {word_entities}")
    Drugs = []
    current_drug = ""
    current_start = None

    for entity in word_entities:
        if entity['label'] == 'B-DRUG':  # Beginning of a new drug
            # Append the previous drug if it exists
            if current_drug:
                Drugs.append({
                    "word": current_drug,
                    "label": 'DRUG',
                    "start": current_start,
                    "end": entity['start'] - 1
                })
            # Start a new drug
            current_drug = entity['word']
            current_start = entity['start']
        elif entity['label'] == 'I-DRUG':  # Continuation of the current drug
            current_drug += entity['word']
        else:  # Not a drug entity, reset current drug
            if current_drug:
                Drugs.append({
                    "word": current_drug,
                    "label": 'DRUG',
                    "start": current_start,
                    "end": entity['start'] - 1
                })
            current_drug = ""
            current_start = None

    # Append the last drug if there is one
    if current_drug:
        Drugs.append({
            "word": current_drug,
            "label": 'DRUG',
            "start": current_start,
            "end": end_idx
        })
    
    filtered_drugs = []

    for drug in Drugs:
        if drug['word'].lower() not in filtered_drugs:
            filtered_drugs.append(drug['word'].lower())
 
    return filtered_drugs


In [20]:
## TESTING
# text = page2content[3]
# text_file_path = "TextFromPDF.txt"
# text_content = read_text_file(file_path)
# chunks = split_into_chunks(text_content)
# Drugs = Extract_drugs(chunks)

# for drug in Drugs:
#     print(drug + " (DRUG)")


NER results: [{'entity': 'I-EFFECT', 'score': 0.58434415, 'index': 76, 'word': '-', 'start': 332, 'end': 333}, {'entity': 'I-EFFECT', 'score': 0.7736949, 'index': 78, 'word': '##ous', 'start': 339, 'end': 342}, {'entity': 'I-EFFECT', 'score': 0.772688, 'index': 79, 'word': 'growth', 'start': 343, 'end': 349}, {'entity': 'I-EFFECT', 'score': 0.7710059, 'index': 42, 'word': 'and', 'start': 243, 'end': 246}, {'entity': 'I-EFFECT', 'score': 0.5393977, 'index': 43, 'word': 'ovarian', 'start': 247, 'end': 254}, {'entity': 'I-EFFECT', 'score': 0.6736732, 'index': 44, 'word': 'cancers', 'start': 255, 'end': 262}, {'entity': 'B-DRUG', 'score': 0.8813997, 'index': 5, 'word': 'fa', 'start': 22, 'end': 24}, {'entity': 'B-DRUG', 'score': 0.68887854, 'index': 73, 'word': 'fa', 'start': 397, 'end': 399}, {'entity': 'B-DRUG', 'score': 0.9032278, 'index': 80, 'word': 'fa', 'start': 426, 'end': 428}, {'entity': 'I-EFFECT', 'score': 0.574778, 'index': 42, 'word': 'blood', 'start': 213, 'end': 218}, {'ent

### Results

In [21]:
Drugs

['fa',
 'pembrolizumab',
 'nivolumab',
 'cape',
 '5-fluorouracil',
 '5-fuleucovorin',
 'capecitabine',
 '5-fu',
 'peox',
 'ige',
 'oth',
 '##in',
 'folfir',
 'folfirino',
 'fol',
 'cet',
 'erb',
 'folfox',
 'capeox',
 'dmmrmsi',
 'polepold1',
 'fruquintinib',
 'tipira',
 'regorafenib',
 'stivarga',
 'opdiv',
 'bevacizumab',
 'dostarlimab',
 'encorafenib',
 'cetuximab',
 'trast',
 'pert',
 'lapatinib',
 'tu',
 'deruxan',
 'ada',
 'entrectinib',
 'rozlytrek',
 'larotrectin',
 'selpercat',
 'folfiri',
 'irinotecan',
 'ziv-aflibercept',
 'ramucirumab',
 'panitumumab',
 'oxaliplatin',
 'capeoxcovatin',
 '##atin',
 'duloxetine',
 'crosatellite',
 'leucovorin',
 'racil',
 'leuc',
 'fluorouracil',
 'oxal',
 'calciumfluorouracil',
 'foxchase',
 'hip',
 '##orgdonate']

### check the results with golden results


In [24]:
# Define the text you want to put into a list
text = """
Oxaliplatin
Pertuzumab
Trastuzumab
Tucatinib
Capecitabine
5-Fluorouracil
Bevacizumab
Fruquintinib
Famtrastuzumabderuxtecannxk
Ipilimumab
Dostarlimabgxly
Nivolumab
Pembrolizumab
Cetuximab
Panitumumab
Encorafenib
Adagrasib
Sotorasib
Lapatinib
Regorafenib
Entrectinib
Larotrectinib
Selpercatinib
Irinotecanhydrochloride
Leucovorin
Trifluridine
Tipiracil
"""
 
# Split the text into a list using newline characters
FaisalBHAI_list = text.lower().strip().split('\n')

In [25]:
matched_drugs = []
for drug in Drugs:
    if drug in FaisalBHAI_list:
        matched_drugs.append(drug)


for drug in matched_drugs:
    print(drug)

pembrolizumab
nivolumab
5-fluorouracil
capecitabine
fruquintinib
regorafenib
bevacizumab
encorafenib
cetuximab
lapatinib
entrectinib
panitumumab
oxaliplatin
leucovorin


## LLMs (gemma2:2b/llama3.1) 

In [5]:
import requests

def call_api(prompt, model="gemma"):
    """
    Calls the API at http://172.16.8.56:5000/query, sends a prompt, and returns the output.

    Args:
        prompt (str): The prompt to send to the API.
        model (str): The LLM model to use.
    Returns:
        dict: The response from the API, parsed as a JSON object.
    """

    if model == "llama":
        port = 5001
    else:
        port = 5000
    # Define the API endpoint and headers
    api_url = f"http://172.16.8.56:{port}/query"
    headers = {
        "Content-Type": "application/json"
    }

    # Define the payload with the prompt
    data = {
        "prompt": prompt
    }

    try:
        # Send the POST request to the API
        response = requests.post(api_url, json=data, headers=headers)

        # Raise an exception if the request was not successful
        response.raise_for_status()

        # Return the JSON response
        return response.json()

    except requests.exceptions.RequestException as e:
        # Handle any exceptions that occur during the request
        print(f"An error occurred: {e}")
        return None


In [7]:

# Example usage
for i, chunk in enumerate(text_chunks):
    prompt = f"You are an exxpert oncologist. Using your expertise find and extract Drug names from the given text. OUTPUT FORMAT: In case you do not find any drugs in the text then just respond with an empty python list. If you find drugs then respond with a pythonic list of strings that has the each drug name as an element. Do not includes anything else in your response. Text to process: {chunk}"
    output = call_api(prompt,"llama")
    print(f"Output for Chunk {i+1}:\n", output)


Output for Chunk 1:
 {'response': '[]'}
Output for Chunk 2:
 {'response': "['CEA', 'ctDNA']"}
Output for Chunk 3:
 {'response': "['Sedative', 'Carcinoembryonic antigen (CEA)', 'Contrast']"}
Output for Chunk 4:
 {'response': '[]'}
Output for Chunk 5:
 {'response': "['Pembrolizumab', 'Keytruda', 'Nivolumab', 'Opdivo', 'ipilimumab', 'Yervoy', 'CAPEOX', 'FOLFOX', 'capecitabine', '5-FUleucovorin', 'CEA']"}


## LLM FROM Sohaib Bhai

In [4]:
def chunk_text(Text_from_PDF, max_words=1000):
    # Split the text into words
    words = Text_from_PDF.split()
    
    # Create chunks of words with a maximum length of `max_words`
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
    
    return chunks

In [5]:
import torch
import requests

def get_response(prompt): # Method to pass in prompt and pipe and return response generated from the pipeline
    with torch.no_grad(), torch.inference_mode(): # inference mode disables gradient saving
        #return pipe(prompt)[0]['generated_text'].split(prompt)[1]
        response = requests.post(
            url='http://172.16.101.171:8002/notes/',
            json={
                'prompt': prompt
            }
        )
        return response.json()['response']
   

In [6]:
def prepare_prompt(Text_chunk):
    my_prompt = [
        {
            'role': 'system',
            'content': """
            You are an expert Data Analyst with 20 years of experinance.
            Your task is to find and extract Drug names from the given text in form of a Python list.            
            OUTPUT FORMAT: In case you do not find any drugs in the text, then just return an empty python list. Do not include anything else in your output.
            IMPORTANT:YOU ARE NOT VERBOSE.
            """.strip().replace('\t', '')
        },
        {
            'role': 'user',
            'content': Text_chunk
        },
    ]
    
    return my_prompt


In [7]:
chunks = chunk_text(Text_from_PDF)

In [8]:
chunks[7]

'more information see Chapter 5 Survivorship Guide 3 Surveillance after treatment for stage 4 colon cancer First 2 years Every 3 to 6 months Physical exam and CEA blood test Next 3 years Every 6 months First 2 years Every 3 to 6 months CT of chest abdomen and pelvis Next 3 years Every 6 to 12 months You did not have a total colonoscopy at diagnosis Colonoscopy is recommended 3 to 6 months after surgery You had a total colonoscopy at diagnosis Colonoscopy Colonoscopy is recommended 1 year after surgery If no advanced adenomas are found repeat in 3 years After that repeat every 5 years NCCN Guidelines for Patients Colon Cancer 2024 29 Treatment for metastatic cancer Distant recurrence Distant recurrence For cancers without biomarkers chemotherapy with FOLFIRI or irinotecan is a recommended After treatment for non-metastatic colon first-line option A biologic may be given with cancer the cancer may return and spread to chemotherapy Biologics include bevacizumab the liver lungs or other ar

In [18]:
All_responses = []
for i , chunk in enumerate(chunks):
    prompt = prepare_prompt(chunk)
    response = get_response(prompt)
    All_responses.append(response)
    print(f"Response for chunk {i+1}:\n\t{response}")



Response for chunk 1:
	['colon cancer', 'adenoma', 'adenocarcinoma','mismatch repair deficiency', 'dMMR', 'hyperplastic polyps', 'inflammatory polyps', 'polypectomy', 'Colectomy']
Response for chunk 2:
	['Chemotherapy', 'Cancer', 'Metastatic cancer', 'Systemic therapy', 'Mismatch repair', 'Lynch syndrome', 'FAP', 'Colon cancer', 'Colonoscopy', 'Endometrial cancer', 'Ovarian cancer', 'Microsatellite instability', 'MSI-H', 'dMMR', 'pMMR', 'Mismatch repair deficient', 'MSS', 'Mismatch repair proficient', 'Carcinoembryonic antigen', 'ctDNA', 'Liquid biopsy']
Response for chunk 3:
	['colon cancer','sedative','medicine', 'contrast', 'CT scan', 'MRI', 'chemotherapy']
Response for chunk 4:
	['Colectomy', 'Colostomy', 'Chemotherapy', 'Immunotherapy', 'Stent']
Response for chunk 5:
	['Pembrolizumab', 'Keytruda', 'Nivolumab', 'Opdivo', 'Yervoy', '5-fluorouracil', '5-FU', 'leucovorin', 'capecitabine', 'FOLFOX', 'CAPEOX']
Response for chunk 6:
	['Chemotherapy', 'RAS', 'KRAS', 'NRAS', 'BRAF', 'HER2'

In [73]:
import re

def extract_drugs_from_responses(responses):
    # List to store all extracted drug names
    all_drugs = []

    # Iterate over each response
    for response in responses:
        # Use regex to extract list-like structures from the response
        extracted_lists = re.findall(r"\[.*?\]", response)

        for item_list in extracted_lists:
            # Convert the extracted string to a Python list using eval
            try:
                drugs = eval(item_list)
                if isinstance(drugs, list):
                    all_drugs.extend(drugs)
            except:
                # Handle any errors in eval (e.g., malformed list)
                print(f"Could not parse the list: {item_list}")

    return all_drugs

# Extract all drug names from responses
all_drug_names = extract_drugs_from_responses(All_responses)

filtered_drugs = []

for drug in all_drug_names:
    if drug.lower() not in filtered_drugs:
        filtered_drugs.append(drug.lower())


# Print the combined list of drug names
print(filtered_drugs)


['colon cancer', 'adenoma', 'adenocarcinoma', 'mismatch repair deficiency', 'dmmr', 'hyperplastic polyps', 'inflammatory polyps', 'polypectomy', 'colectomy', 'chemotherapy', 'cancer', 'metastatic cancer', 'systemic therapy', 'mismatch repair', 'lynch syndrome', 'fap', 'colonoscopy', 'endometrial cancer', 'ovarian cancer', 'microsatellite instability', 'msi-h', 'pmmr', 'mismatch repair deficient', 'mss', 'mismatch repair proficient', 'carcinoembryonic antigen', 'ctdna', 'liquid biopsy', 'sedative', 'medicine', 'contrast', 'ct scan', 'mri', 'colostomy', 'immunotherapy', 'stent', 'pembrolizumab', 'keytruda', 'nivolumab', 'opdivo', 'yervoy', '5-fluorouracil', '5-fu', 'leucovorin', 'capecitabine', 'folfox', 'capeox', 'ras', 'kras', 'nras', 'braf', 'her2', 'high microsatellite instability', 'dmmrmsi-h', 'pole', 'pold1', 'ret', 'ntrk', 'nccn', 'avastin', 'bevacizumab', 'folfiri', 'folfirinox', 'panitumumab', 'vectibix', 'cetuximab', 'erbitux', 'lonsurf', 'trifluridine', 'tipiracil', 'regorafe

### Check with golden results

In [20]:
# Define the text you want to put into a list
text = """
Oxaliplatin
Pertuzumab
Trastuzumab
Tucatinib
Capecitabine
5-Fluorouracil
Bevacizumab
Fruquintinib
Famtrastuzumabderuxtecannxk
Ipilimumab
Dostarlimabgxly
Nivolumab
Pembrolizumab
Cetuximab
Panitumumab
Encorafenib
Adagrasib
Sotorasib
Lapatinib
Regorafenib
Entrectinib
Larotrectinib
Selpercatinib
Irinotecanhydrochloride
Leucovorin
Trifluridine
Tipiracil
"""
 
# Split the text into a list using newline characters
FaisalBHAI_list = text.lower().strip().split('\n')


In [22]:
Golden_results = FaisalBHAI_list
LLM_results = all_drug_names

matched_drugs = []
for drug in LLM_results:
    if drug in FaisalBHAI_list:
        matched_drugs.append(drug)


for drug in matched_drugs:
    print(drug)

5-fluorouracil
leucovorin
capecitabine
bevacizumab
panitumumab
cetuximab
trifluridine
tipiracil
regorafenib
nivolumab
ipilimumab
pertuzumab
lapatinib
tucatinib
bevacizumab
cetuximab
panitumumab
oxaliplatin
leucovorin
oxaliplatin
capecitabine
oxaliplatin
leucovorin


In [23]:
len(matched_drugs)

23

## Cure Doc By Obaid Bhai


In [41]:
def chunk_text(Text_from_PDF, max_words=1000):
    # Split the text into words
    words = Text_from_PDF.split()
    
    # Create chunks of words with a maximum length of `max_words`
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
    
    return chunks

In [42]:
import requests

def Call_CureDoc(prompt):
    # Define the headers with the access token
    URL = "http://172.16.101.164:8015/upload-notes"
    headers = {
        'access-token': "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZG1pbiIsImV4cCI6MTcyODAzODgzOX0.PHBktkso2FZrONIueuCgn9JDJc2sc7bu3ATlZiDffLA"
    }
    
    # Define the payload with the required fields
    payload = {
        "userId": "123798",
        "chatId": "",
        "filename": "123.txt",
        "prompt": prompt,
        "progressNote": "",
    }
    
    try:
        # Make the POST request to the API
        response = requests.post(URL, headers=headers, json=payload)
        
        # Check if the request was successful
        if response.status_code == 200:
            # print("Notes uploaded successfully.")
            return response.json().get('response', 'No response key found')  # Return the response as JSON if needed
        else:
            print(f"Failed to upload notes. Status code: {response.status_code}")
            print(f"Response: {response.text}")
            return None
    except requests.exceptions.RequestException as e:
        # Handle any exceptions that occur during the request
        print(f"An error occurred: {e}")
        return None



In [53]:
def prepare_prompt_for_CureDoc(Text_chunk):
    my_prompt = """You are an expert Data Analyst with 20 years of experinance.
                Your task is to find and extract Drug names from the given text in form of a Python list.
                OUTPUT FORMAT: In case you do not find any drugs in the text, then just return an empty python list. Do not include anything else in your output.
                Following is the text to process: 
                """.strip().replace('\t', '') + Text_chunk
    
    return my_prompt

In [54]:
chunks = chunk_text(Text_from_PDF)

In [55]:
All_responses = []
for i , chunk in enumerate(chunks):
    prompt = prepare_prompt_for_CureDoc(chunk)
    response = Call_CureDoc(prompt)
    All_responses.append(response)
    print(f"Response for chunk {i+1}:\n\t{response}")


Response for chunk 1:
	I apologize, but there is no patient history or treatment suggested in the given text. The text appears to be a guide for patients with colon cancer, providing information on the basics of colon cancer, testing and treatment planning, and treatment options. It does not contain any specific information about a patient's medical history or treatment plan.
Response for chunk 2:
	Based on the provided clinical note, here is the information:

**Patient History:**

The patient was diagnosed with colon cancer and underwent colectomy. The patient's cancer was tested to ensure that it had not spread to other parts of the body. The patient's family history is significant, with a first-degree relative having colon cancer, which increases the risk of developing the disease.

**Treatment Suggested:**

The note does not specifically mention a treatment plan for the patient. However, it does mention that the patient may need to undergo further testing and treatment planning, in

In [35]:
import re

def extract_drugs_from_responses(responses):
    # List to store all extracted drug names
    all_drugs = []

    # Iterate over each response
    for response in responses:
        # Use regex to extract list-like structures from the response
        extracted_lists = re.findall(r"\[.*?\]", response)

        for item_list in extracted_lists:
            # Convert the extracted string to a Python list using eval
            try:
                drugs = eval(item_list)
                if isinstance(drugs, list):
                    all_drugs.extend(drugs)
            except:
                # Handle any errors in eval (e.g., malformed list)
                print(f"Could not parse the list: {item_list}")

    return all_drugs

# Extract all drug names from responses
all_drug_names = extract_drugs_from_responses(All_responses)

filtered_drugs = []

for drug in all_drug_names:
    if drug.lower() not in filtered_drugs:
        filtered_drugs.append(drug.lower())


# Print the combined list of drug names
print(filtered_drugs)


"I apologize, but I'm a clinical decision support system, and I don't have any patient information or notes to provide. I'm here to assist you with answering questions based on a given clinical note. Please provide the clinical note, and I'll do my best to assist you."

## GPI CODES 

In [70]:
import pandas as pd

def match_drugs_with_gpi(drug_list, csv_file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)
    
    # Clean the gpiName column by stripping leading and trailing spaces
    df['gpiName'] = df['gpiName'].str.strip()
    # print(df['gpiName'])    
    # Initialize an empty list to store dictionaries with Drug and GPI key-value pairs
    matched_drugs = []
    
    # Iterate through the list of drugs
    for drug in drug_list:
        # Filter the DataFrame for rows where gpiName matches the drug name
        match = df[df['gpiName'].str.lower() == drug.lower()]
        # print(match)
        if not match.empty:
            # Extract the gpiCode and truncate it to 8 digits
            gpi_code = str(match['gpi'].values[0]).strip()[:8]
            # print(len(gpi_code))
            zeroes = 8 - len(gpi_code)
            # print(type(zeroes))
            # print(type(gpi_code))
            # print(zeroes)
            
            gpi_code = gpi_code + zeroes*"0"
            # Add a dictionary with Drug and GPI keys to the list
            drug = drug.capitalize()
            matched_drugs.append({'Drug': drug, 'GPI': gpi_code})
    
    return matched_drugs

In [74]:
cleaned_drug_list = [re.sub(r'[^a-zA-Z]', '', drug) for drug in filtered_drugs]
drug_list = cleaned_drug_list  # Example drug names
csv_file_path = 'filtered_drug_gpi.csv'  # Path to the CSV file
matched_drug_gpi = match_drugs_with_gpi(drug_list, csv_file_path)
print(matched_drug_gpi)

[{'Drug': 'Pembrolizumab', 'GPI': '21357953'}, {'Drug': 'Nivolumab', 'GPI': '21357941'}, {'Drug': 'Fluorouracil', 'GPI': '21300030'}, {'Drug': 'Capecitabine', 'GPI': '21300005'}, {'Drug': 'Bevacizumab', 'GPI': '21335020'}, {'Drug': 'Panitumumab', 'GPI': '21360070'}, {'Drug': 'Cetuximab', 'GPI': '21360015'}, {'Drug': 'Regorafenib', 'GPI': '21533050'}, {'Drug': 'Ipilimumab', 'GPI': '21355232'}, {'Drug': 'Encorafenib', 'GPI': '21532040'}, {'Drug': 'Trastuzumab', 'GPI': '21170070'}, {'Drug': 'Pertuzumab', 'GPI': '21170054'}, {'Drug': 'Tucatinib', 'GPI': '21170080'}, {'Drug': 'Sotorasib', 'GPI': '21532480'}, {'Drug': 'Entrectinib', 'GPI': '21533820'}, {'Drug': 'Selpercatinib', 'GPI': '21535779'}, {'Drug': 'Ramucirumab', 'GPI': '21335070'}, {'Drug': 'Oxaliplatin', 'GPI': '21100028'}, {'Drug': 'Fluorouracil', 'GPI': '21300030'}]


In [75]:
print(len(matched_drug_gpi))

19


## NIH approval check

### WEB SCRAPER FOR NIH WEBSITE

SCRAP THE NIH WEBSITE

In [27]:
import requests

URL = "https://www.cancer.gov/about-cancer/treatment/drugs/colorectal"


def scrape(url):
    """scarpe the web provided the url"""        
    response = requests.get(URL)
    source = response.text
    return source


def save_data(data):
    try:
        with open("scraped_data.txt","w") as file:
            file.write(data + "\n")
            print("Data saved")
    except Exception as e:
        print(e)
        print("Data not saved")

source = scrape(URL)
# extracted_data = extract(source)
# save_data(extracted_data)
save_data(source)

Data saved


In [28]:
from bs4 import BeautifulSoup

# Load the HTML content from the file
with open('scraped_data.txt', 'r') as file:
    html_content = file.read()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Function to extract drugs under a specific section
def extract_drugs(section_title):
    section = soup.find('h2', text=section_title)
    if section:
        drugs_list = section.find_next('ul')
        drugs = [li.get_text() for li in drugs_list.find_all('li')]
        return drugs
    return []

# Extract drugs approved for colon cancer
approved_drugs = extract_drugs('Drugs Approved for Colon Cancer')

# Extract drug combinations used in colon cancer
drug_combinations = extract_drugs('Drug Combinations Used in Colon Cancer')

# Print the extracted drugs
print("Drugs Approved for Colon Cancer:")
print(approved_drugs)

print("\nDrug Combinations Used in Colon Cancer:")
print(drug_combinations)


Drugs Approved for Colon Cancer:
['Adagrasib', 'Alymsys (Bevacizumab)', 'Avastin (Bevacizumab)', 'Bevacizumab', 'Camptosar (Irinotecan Hydrochloride)', 'Capecitabine', 'Cetuximab', 'Cyramza (Ramucirumab)', 'Eloxatin (Oxaliplatin)', 'Erbitux (Cetuximab)', '5-FU (Fluorouracil Injection)', 'Fluorouracil Injection', 'Fruquintinib', 'Fruzaqla (Fruquintinib)', 'Ipilimumab', 'Irinotecan Hydrochloride', 'Keytruda (Pembrolizumab)', 'Krazati (Adagrasib)', 'Leucovorin Calcium', 'Lonsurf (Trifluridine and Tipiracil Hydrochloride)', 'Mvasi (Bevacizumab)', 'Nivolumab', 'Opdivo (Nivolumab)', 'Oxaliplatin', 'Panitumumab', 'Pembrolizumab', 'Ramucirumab', 'Regorafenib', 'Stivarga (Regorafenib)', 'Trifluridine and Tipiracil Hydrochloride', 'Tucatinib', 'Tukysa (Tucatinib)', 'Vectibix (Panitumumab)', 'Xeloda (Capecitabine)', 'Yervoy (Ipilimumab)', 'Zaltrap (Ziv-Aflibercept)', 'Zirabev (Bevacizumab)', 'Ziv-Aflibercept']

Drug Combinations Used in Colon Cancer:
['CAPOX', 'FOLFIRI', 'FOLFIRI-BEVACIZUMAB', 'F

  section = soup.find('h2', text=section_title)


In [29]:
# List of drugs with various forms and brand names
drug_approval_list = approved_drugs

# List of dictionaries containing drugs and GPI codes
Appoved_drugs_list = matched_drug_gpi

# Convert the approval list to lowercase for case-insensitive matching
approval_set = set(drug.lower() for drug in drug_approval_list)

# Add the 'approval' key with True/False based on the presence in the approval list
for entry in Appoved_drugs_list:
    # Check if the drug name is in the approval list (case-insensitive)
    entry['approval'] = entry['Drug'].lower() in approval_set

# Print the updated list of dictionaries
print(Appoved_drugs_list)


[{'Drug': 'Pembrolizumab', 'GPI': '21357953', 'approval': True}, {'Drug': 'Nivolumab', 'GPI': '21357941', 'approval': True}, {'Drug': 'fluorouracil', 'GPI': '2130003 ', 'approval': False}, {'Drug': 'capecitabine', 'GPI': '21300005', 'approval': True}, {'Drug': 'bevacizumab', 'GPI': '2133502 ', 'approval': True}, {'Drug': 'panitumumab', 'GPI': '2136007 ', 'approval': True}, {'Drug': 'cetuximab', 'GPI': '21360015', 'approval': True}, {'Drug': 'regorafenib', 'GPI': '2153305 ', 'approval': True}, {'Drug': 'nivolumab', 'GPI': '21357941', 'approval': True}, {'Drug': 'ipilimumab', 'GPI': '21355232', 'approval': True}, {'Drug': 'Pembrolizumab', 'GPI': '21357953', 'approval': True}, {'Drug': 'Encorafenib', 'GPI': '2153204 ', 'approval': False}, {'Drug': 'Trastuzumab', 'GPI': '2117007 ', 'approval': False}, {'Drug': 'pertuzumab', 'GPI': '21170054', 'approval': False}, {'Drug': 'tucatinib', 'GPI': '2117008 ', 'approval': True}, {'Drug': 'Sotorasib', 'GPI': '2153248 ', 'approval': False}, {'Drug':

In [30]:
len(Appoved_drugs_list)

30

In [1]:
Appoved_drugs_list = [{'Drug': 'Pembrolizumab', 'GPI': '21357953', 'approval': True}, {'Drug': 'Nivolumab', 'GPI': '21357941', 'approval': True}, {'Drug': 'fluorouracil', 'GPI': '2130003 ', 'approval': False}, {'Drug': 'capecitabine', 'GPI': '21300005', 'approval': True}, {'Drug': 'bevacizumab', 'GPI': '2133502 ', 'approval': True}, {'Drug': 'panitumumab', 'GPI': '2136007 ', 'approval': True}, {'Drug': 'cetuximab', 'GPI': '21360015', 'approval': True}, {'Drug': 'regorafenib', 'GPI': '2153305 ', 'approval': True}, {'Drug': 'nivolumab', 'GPI': '21357941', 'approval': True}, {'Drug': 'ipilimumab', 'GPI': '21355232', 'approval': True}, {'Drug': 'Pembrolizumab', 'GPI': '21357953', 'approval': True}, {'Drug': 'Encorafenib', 'GPI': '2153204 ', 'approval': False}, {'Drug': 'Trastuzumab', 'GPI': '2117007 ', 'approval': False}, {'Drug': 'pertuzumab', 'GPI': '21170054', 'approval': False}, {'Drug': 'tucatinib', 'GPI': '2117008 ', 'approval': True}, {'Drug': 'Sotorasib', 'GPI': '2153248 ', 'approval': False}, {'Drug': 'Entrectinib', 'GPI': '2153382 ', 'approval': False}, {'Drug': 'Selpercatinib', 'GPI': '21535779', 'approval': False}, {'Drug': 'bevacizumab', 'GPI': '2133502 ', 'approval': True}, {'Drug': 'ramucirumab', 'GPI': '2133507 ', 'approval': True}, {'Drug': 'cetuximab', 'GPI': '21360015', 'approval': True}, {'Drug': 'panitumumab', 'GPI': '2136007 ', 'approval': True}, {'Drug': 'oxaliplatin', 'GPI': '21100028', 'approval': True}, {'Drug': 'Capecitabine', 'GPI': '21300005', 'approval': True}, {'Drug': 'oxaliplatin', 'GPI': '21100028', 'approval': True}, {'Drug': 'capecitabine', 'GPI': '21300005', 'approval': True}, {'Drug': 'oxaliplatin', 'GPI': '21100028', 'approval': True}, {'Drug': 'fluorouracil', 'GPI': '2130003 ', 'approval': False}, {'Drug': 'Fluorouracil', 'GPI': '2130003 ', 'approval': False}, {'Drug': 'Oxaliplatin', 'GPI': '21100028', 'approval': True}]


In [26]:
def create_report(approved_drugs_list, cancer_type, model_name):
    # Create a canvas for the PDF
    directory = 'Reports'
    if not os.path.exists(directory):
        print("Directory does not exist. Creating directory...")
        os.makedirs(directory)
    filename = cancer_type + ' Drugs Report by '+ model_name + '.pdf'
    HEADING = cancer_type + ' Drugs Report by '+ model_name
    pdf_file_path = os.path.join(directory, filename)
    
    c = canvas.Canvas(pdf_file_path, pagesize=letter)
    width, height = letter

    # Draw the heading at the top, centered
    c.setFont('Helvetica-Bold', 16)
    c.drawCentredString(width / 2, height - 40, HEADING)  # Position heading at the top, centered
    
    # Define the table data with headers
    table_data = [['Serial No.','Drug Name', 'GPI Code']]
    i = 1
    # Populate the table data with the list of drugs
    for drug in approved_drugs_list:
        table_data.append([i,drug['Drug'], drug['GPI']])
        i += 1
    
    # Create the table
    drug_table = Table(table_data, colWidths=[75, 250, 100])

    # Define the base style for the table
    table_style = TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),  # Header background color
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),  # Header text color
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # Center align all text
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),  # Header font
        ('FONTSIZE', (0, 0), (-1, 0), 14),  # Header font size
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),  # Header padding
        ('GRID', (0, 0), (-1, -1), 1, colors.black),  # Grid lines
        ('FONTSIZE', (0, 1), (-1, -1), 12),  # Font size for the table content
    ])
    
    # Apply the base style to the table
    drug_table.setStyle(table_style)
    
    # Apply row-specific background color for rows where approval is False
    for i, drug in enumerate(approved_drugs_list, start=1):  # start=1 to skip header row
        if not drug['approval']:
            # Apply yellow background for rows with 'approval' as False
            drug_table.setStyle(TableStyle([('BACKGROUND', (0, i), (-1, i), colors.yellow)]))
    
    # Calculate the table width and position
    table_width, table_height = drug_table.wrap(0, 0)
    x = (width - table_width) / 2  # Center the table horizontally
    y = height - table_height - 50  # Position the table from the top of the page
    
    # Draw the table on the canvas
    drug_table.drawOn(c, x, y)
    
    # Save the PDF
    c.save()
    print(f"PDF created at {pdf_file_path}")
    return pdf_file_path

In [27]:
Appoved_drugs_list = [{'Drug': 'Pembrolizumab', 'GPI': '21357953', 'approval': True}, {'Drug': 'Nivolumab', 'GPI': '21357941', 'approval': True}, {'Drug': 'fluorouracil', 'GPI': '2130003 ', 'approval': False}, {'Drug': 'capecitabine', 'GPI': '21300005', 'approval': True}, {'Drug': 'bevacizumab', 'GPI': '2133502 ', 'approval': True}, {'Drug': 'panitumumab', 'GPI': '2136007 ', 'approval': True}, {'Drug': 'cetuximab', 'GPI': '21360015', 'approval': True}, {'Drug': 'regorafenib', 'GPI': '2153305 ', 'approval': True}, {'Drug': 'nivolumab', 'GPI': '21357941', 'approval': True}, {'Drug': 'ipilimumab', 'GPI': '21355232', 'approval': True}, {'Drug': 'Pembrolizumab', 'GPI': '21357953', 'approval': True}, {'Drug': 'Encorafenib', 'GPI': '2153204 ', 'approval': False}, {'Drug': 'Trastuzumab', 'GPI': '2117007 ', 'approval': False}, {'Drug': 'pertuzumab', 'GPI': '21170054', 'approval': False}, {'Drug': 'tucatinib', 'GPI': '2117008 ', 'approval': True}, {'Drug': 'Sotorasib', 'GPI': '2153248 ', 'approval': False}, {'Drug': 'Entrectinib', 'GPI': '2153382 ', 'approval': False}, {'Drug': 'Selpercatinib', 'GPI': '21535779', 'approval': False}, {'Drug': 'bevacizumab', 'GPI': '2133502 ', 'approval': True}, {'Drug': 'ramucirumab', 'GPI': '2133507 ', 'approval': True}, {'Drug': 'cetuximab', 'GPI': '21360015', 'approval': True}, {'Drug': 'panitumumab', 'GPI': '2136007 ', 'approval': True}, {'Drug': 'oxaliplatin', 'GPI': '21100028', 'approval': True}, {'Drug': 'Capecitabine', 'GPI': '21300005', 'approval': True}, {'Drug': 'oxaliplatin', 'GPI': '21100028', 'approval': True}, {'Drug': 'capecitabine', 'GPI': '21300005', 'approval': True}, {'Drug': 'oxaliplatin', 'GPI': '21100028', 'approval': True}, {'Drug': 'fluorouracil', 'GPI': '2130003 ', 'approval': False}, {'Drug': 'Fluorouracil', 'GPI': '2130003 ', 'approval': False}, {'Drug': 'Oxaliplatin', 'GPI': '21100028', 'approval': True}]


In [28]:
create_report(Appoved_drugs_list, "Cancerous Cancer", "MANUAL")

PDF created at Reports\Cancerous Cancer Drugs Report by MANUAL.pdf


'Reports\\Cancerous Cancer Drugs Report by MANUAL.pdf'