## DATA UNDERSTANDING

In [117]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
import xml.etree.ElementTree as ET
warnings.filterwarnings('ignore')
import os


In [118]:
# NOTE: this whole cell is a single string literal, so the nltk.download calls
# inside it are NOT executed; running the cell only echoes the string back as
# its output. The downloads are issued for real in later cells.
"""
#Cell already run
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
"""

"\n#Cell already run\nnltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [119]:
class CancerQALoader:
    """Load question/answer pairs from one XML file into a DataFrame.

    The file is expected to contain ``QAPair`` elements, each with a
    ``Question`` and an ``Answer`` child. Typical use is
    ``parse_xml()`` -> ``extract_qa_pairs()`` -> ``get_dataframe()``, or the
    folder-level helper ``load_all_qa_from_folder``.
    """

    def __init__(self, filepath):
        """Remember the path and start with empty question/answer lists."""
        self.filepath = filepath
        self.questions = []
        self.answers = []
        self.root = None
        # File name without directory or extension; used to tag every row.
        self.source = os.path.splitext(os.path.basename(filepath))[0]

    def parse_xml(self):
        """Parse ``self.filepath`` and store the root element in ``self.root``.

        A malformed or missing file is reported with ``print`` and leaves
        ``self.root`` unchanged (``None`` on a fresh loader), so one bad file
        does not abort a folder-wide load.
        """
        try:
            self.root = ET.parse(self.filepath).getroot()
        except ET.ParseError as e:
            print(f"Error parsing XML in {self.filepath}: {e}")
        except FileNotFoundError:
            print(f"File not found: {self.filepath}")

    def extract_qa_pairs(self):
        """Append every complete Q/A pair under the parsed root to the lists.

        Does nothing when no document has been parsed. Pairs with a missing
        or empty ``Question``/``Answer`` are skipped.
        """
        if self.root is None:
            return

        for qa_pair in self.root.findall('.//QAPair'):
            # findtext yields None for a missing child (and '' for an empty
            # one) instead of raising AttributeError like ``.find(...).text``.
            question = qa_pair.findtext('Question')
            answer = qa_pair.findtext('Answer')
            if question and answer:
                self.questions.append(question)
                self.answers.append(answer)

    def get_dataframe(self):
        """Return the collected pairs as a DataFrame with columns
        ``question``, ``answer`` and ``source`` (one row per pair)."""
        return pd.DataFrame({
            'question': self.questions,
            'answer': self.answers,
            'source': [self.source] * len(self.questions)  # Add source to each row
        })

    @staticmethod
    def load_all_qa_from_folder(folder_path):
        """Load every ``*.xml`` file in ``folder_path`` into one DataFrame.

        Files are processed in sorted name order so the row order (and any
        later ``drop_duplicates(keep='first')``) is reproducible across
        platforms. Returns an empty DataFrame with the usual three columns
        when the folder contains no XML files.
        """
        all_dfs = []

        # os.listdir order is arbitrary; sort for deterministic output.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith(".xml"):
                full_path = os.path.join(folder_path, filename)
                loader = CancerQALoader(full_path)
                loader.parse_xml()
                loader.extract_qa_pairs()
                all_dfs.append(loader.get_dataframe())

        if not all_dfs:
            # pd.concat raises ValueError on an empty list.
            return pd.DataFrame(columns=['question', 'answer', 'source'])

        return pd.concat(all_dfs, ignore_index=True)

In [120]:
# Folder holding the XML files; a relative path, so it is resolved against the
# notebook's current working directory.
folder = "1_CancerGov_QA"
# Parse every *.xml file in that folder and stack the Q/A rows into one DataFrame.
cancer_df = CancerQALoader.load_all_qa_from_folder(folder)

In [121]:
print(f"Reading {cancer_df['source'].nunique()} files")

Reading 116 files


In [122]:
cancer_df.head(20)

Unnamed: 0,question,answer,source
0,What is (are) Adult Acute Lymphoblastic Leukem...,Key Points\n - Adult acute ...,0000001_1
1,What are the symptoms of Adult Acute Lymphobla...,"Signs and symptoms of adult ALL include fever,...",0000001_1
2,How to diagnose Adult Acute Lymphoblastic Leuk...,Tests that examine the blood and bone marrow a...,0000001_1
3,What is the outlook for Adult Acute Lymphoblas...,Certain factors affect prognosis (chance of re...,0000001_1
4,Who is at risk for Adult Acute Lymphoblastic L...,Previous chemotherapy and exposure to radiatio...,0000001_1
5,What are the stages of Adult Acute Lymphoblast...,Key Points\n - Once adult A...,0000001_1
6,What are the treatments for Adult Acute Lympho...,Key Points\n - There are di...,0000001_1
7,What is (are) Adult Acute Myeloid Leukemia ?,Key Points\n - Adult acute ...,0000001_2
8,Who is at risk for Adult Acute Myeloid Leukemi...,"Smoking, previous chemotherapy treatment, and ...",0000001_2
9,What are the symptoms of Adult Acute Myeloid L...,"Signs and symptoms of adult AML include fever,...",0000001_2


In [123]:
cancer_df.shape

(729, 3)

In [124]:
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  729 non-null    object
 1   answer    729 non-null    object
 2   source    729 non-null    object
dtypes: object(3)
memory usage: 17.2+ KB


In [125]:
cancer_df.isnull().sum()

question    0
answer      0
source      0
dtype: int64

In [126]:
len(cancer_df)

729

In [127]:
type(cancer_df)

pandas.core.frame.DataFrame

In [128]:
# Collect every row whose question text occurs more than once.
# keep=False marks all copies, so the count below is rows involved, not extras.
is_repeated = cancer_df.duplicated(subset='question', keep=False)
duplicate_questions = cancer_df.loc[is_repeated]
print(f"Total duplicate questions: {len(duplicate_questions)}")
duplicate_questions


Total duplicate questions: 78


Unnamed: 0,question,answer,source
22,What is (are) Chronic Myelogenous Leukemia ?,Key Points\n - Chronic myel...,0000001_4
29,What are the treatments for Chronic Myelogenou...,Key Points\n - There are di...,0000001_4
246,What is (are) Liver (Hepatocellular) Cancer ?,Key Points\n - Liver cancer...,0000007_4
247,Who is at risk for Liver (Hepatocellular) Canc...,Being infected with certain types of the hepat...,0000007_4
250,What is (are) Liver (Hepatocellular) Cancer ?,Key Points\n - Liver cancer...,0000007_5
...,...,...,...
678,Who is at risk for Prostate Cancer? ?,Different factors increase or decrease the ris...,0000036_3
695,What is (are) Colorectal Cancer ?,Key Points\n - Colorectal c...,0000037_3
697,Who is at risk for Colorectal Cancer? ?,Key Points\n - Avoiding ris...,0000037_3
699,What is (are) Colorectal Cancer ?,Key Points\n - Colorectal c...,0000037_4


In [129]:
# Keep the first row for each distinct question text, then renumber rows 0..n-1.
cancer_df = (
    cancer_df
    .drop_duplicates(subset='question', keep='first')
    .reset_index(drop=True)
)

In [130]:
cancer_df.shape

(683, 3)

## EXPLORATORY DATA ANALYSIS

In [131]:

# Fetch the NLTK resources used by the text preprocessing below. The captured
# output shows each package reported as already up-to-date on this machine.
nltk.download('wordnet')      # data for WordNetLemmatizer
nltk.download('omw-1.4')      # presumably the multilingual WordNet companion data; verify it is needed
nltk.download('punkt')        # tokenizer data for word_tokenize
nltk.download('stopwords')    # corpus read by stopwords.words('english')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [132]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise a string for matching.

    Steps: replace every non-word character with a space, lowercase,
    tokenize, drop English stop words, lemmatize what remains, and join the
    tokens back together with single spaces.
    """
    normalized = re.sub(r'\W', ' ', text).lower()
    kept = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)


In [133]:
# Trim surrounding whitespace and turn embedded newlines into spaces.
# Series.replace('\n', ' ') only swaps cells whose ENTIRE value is '\n'; the
# .str accessor is required to replace newlines inside longer strings.
cancer_df['question'] = cancer_df['question'].str.strip().str.replace('\n', ' ', regex=False)
cancer_df['answer'] = cancer_df['answer'].str.strip().str.replace('\n', ' ', regex=False)


# Preprocess the questions and answers (lowercase, stop-word removal, lemmatization).
# NOTE(review): this overwrites the original text in place, so every answer
# returned by the retrieval functions further down is the preprocessed form,
# not the human-readable one -- confirm that is intended.
cancer_df['question'] = cancer_df['question'].apply(preprocess_text)
cancer_df['answer'] = cancer_df['answer'].apply(preprocess_text)


In [134]:
# Vectorization
# preprocess_text is passed as the vectorizer's preprocessor, so it is applied
# to every document given to fit_transform/transform -- including the raw user
# queries sent through vectorizer.transform later on. The stored questions were
# already run through preprocess_text in the previous cell, so they pass
# through it a second time here.
vectorizer = TfidfVectorizer(preprocessor=preprocess_text, stop_words='english')
# X: TF-IDF matrix with one row per stored question.
X = vectorizer.fit_transform(cancer_df['question'])
# y: the answers, row-aligned with X (not referenced by the retrieval functions in this section).
y = cancer_df['answer']

In [135]:
print(f"Number of unique answers: {cancer_df['answer'].nunique()}")
print(cancer_df['answer'].value_counts().head(5))


Number of unique answers: 655
answer
new type treatment tested clinical trial information clinical trial available nci website patient may want think taking part clinical trial patient taking part clinical trial may best treatment choice clinical trial part cancer research process clinical trial done find new cancer treatment safe effective better standard treatment many today standard treatment cancer based earlier clinical trial patient take part clinical trial may receive standard treatment among first receive new treatment patient take part clinical trial also help improve way cancer treated future even clinical trial lead effective new treatment often answer important question help move research forward patient enter clinical trial starting cancer treatment clinical trial include patient yet received treatment trial test treatment patient whose cancer gotten better also clinical trial test new way stop cancer recurring coming back reduce side effect cancer treatment clinical trial

In [136]:
from sklearn.metrics.pairwise import cosine_similarity
def get_answer(user_question):
    """Return the stored answer whose question is closest to ``user_question``.

    Closeness is cosine similarity between TF-IDF vectors. When the query
    shares no vocabulary with any stored question every similarity is 0; in
    that case a fallback message is returned instead of an arbitrary row.

    Parameters
    ----------
    user_question : str
        Raw question text; the vectorizer applies its own preprocessing.

    Returns
    -------
    str
        The best-matching answer, or the fallback message.
    """
    user_vec = vectorizer.transform([user_question])
    similarities = cosine_similarity(user_vec, X).flatten()
    most_similar_idx = int(np.argmax(similarities))

    # argmax of an all-zero array is 0, which would silently return row 0.
    if similarities[most_similar_idx] <= 0:
        return "I'm sorry, I don't have information on that. Please consult a doctor."

    return cancer_df.iloc[most_similar_idx]['answer']

In [137]:
def get_top_n_answers(user_question, n=3):
    """Return the ``n`` stored Q/A rows most similar to ``user_question``.

    Similarity is cosine similarity between TF-IDF vectors.

    Parameters
    ----------
    user_question : str
        Raw question text; the vectorizer applies its own preprocessing.
    n : int, default 3
        Maximum number of results. Values <= 0 give an empty list.

    Returns
    -------
    list of dict
        Dicts with keys ``question``, ``answer`` and ``similarity`` (float),
        ordered from most to least similar.
    """
    if n <= 0:
        # A slice of [-0:] would select the whole array, so bail out early.
        return []

    user_vec = vectorizer.transform([user_question])
    similarities = cosine_similarity(user_vec, X).flatten()
    top_indices = similarities.argsort()[-n:][::-1]

    return [
        {
            "question": cancer_df.iloc[idx]['question'],
            "answer": cancer_df.iloc[idx]['answer'],
            # Plain float rather than a numpy scalar, for easy printing/serialising.
            "similarity": float(similarities[idx]),
        }
        for idx in top_indices
    ]


In [138]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required resources (run only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Strip markup and noise from ``text`` and return space-joined lemmas.

    Removes anything that looks like an HTML tag, lowercases, deletes every
    character that is not an ASCII letter or whitespace, tokenizes, drops
    English stop words and lemmatizes the remaining tokens.
    """
    without_tags = re.sub(r"<.*?>", "", text)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", without_tags.lower())

    lemmas = []
    for token in nltk.word_tokenize(letters_only):
        if token not in stop_words:
            lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [139]:
def get_top_n_answers(user_question, n=3):
    """Return the ``n`` stored Q/A rows most similar to ``user_question``,
    ranked by sentence-embedding cosine similarity.

    This redefines the TF-IDF ``get_top_n_answers`` declared earlier in the
    notebook. It relies on the module-level ``model`` and
    ``question_embeddings``, which are created in later cells, so those cells
    must have been run before this function is called.

    Parameters
    ----------
    user_question : str
        Raw question text; it is passed through ``clean_text`` before encoding.
    n : int, default 3
        Maximum number of results. Values <= 0 give an empty list.

    Returns
    -------
    list of dict
        Dicts with keys ``question``, ``answer`` and ``similarity`` (float),
        ordered from most to least similar.
    """
    if n <= 0:
        # A slice of [-0:] would select the whole array, so bail out early.
        return []

    # Clean and embed the user question
    cleaned_question = clean_text(user_question)
    user_vec = model.encode([cleaned_question])

    # Compute cosine similarity with all questions in dataset
    similarities = cosine_similarity(user_vec, question_embeddings).flatten()
    top_indices = similarities.argsort()[-n:][::-1]

    # The dataset lives in ``cancer_df``; the name ``df`` is never defined in
    # this notebook and raised NameError.
    return [
        {
            "question": cancer_df.iloc[idx]['question'],
            "answer": cancer_df.iloc[idx]['answer'],
            "similarity": float(similarities[idx]),
        }
        for idx in top_indices
    ]



In [140]:
# Example usage
print("Example question: What are symptoms of breast cancer?")
print("Answer:", get_answer("What are symptoms of breast cancer?")) 



Example question: What are symptoms of breast cancer?
Answer: sign breast cancer include lump change breast sign may caused breast cancer condition check doctor following lump thickening near breast underarm area change size shape breast dimple puckering skin breast nipple turned inward breast fluid breast milk nipple especially bloody scaly red swollen skin breast nipple areola dark area skin around nipple dimple breast look like skin orange called peau dorange


In [141]:
%pip install gradio


Note: you may need to restart the kernel to use updated packages.


In [142]:
from difflib import SequenceMatcher

def get_response(user_input, threshold=0.7):
    """Answer ``user_input`` by fuzzy string matching against stored questions.

    Each stored question is compared with the preprocessed input using
    ``difflib.SequenceMatcher``. The answer of the highest-scoring question
    whose ratio is at least ``threshold`` is returned; the earliest row wins
    on ties.

    Parameters
    ----------
    user_input : str
        Raw question text.
    threshold : float, default 0.7
        Minimum SequenceMatcher ratio for a match to be accepted.

    Returns
    -------
    str
        The matched answer, or a fallback message when nothing reaches the
        threshold.
    """
    processed_input = preprocess_text(user_input)

    best_ratio = None
    best_answer = None
    # The dataset lives in ``cancer_df``; ``df`` is never defined in this
    # notebook and raised NameError.
    for stored_question, stored_answer in zip(cancer_df['question'], cancer_df['answer']):
        ratio = SequenceMatcher(None, processed_input, preprocess_text(stored_question)).ratio()
        # Track the best qualifying match instead of stopping at the first one.
        if ratio >= threshold and (best_ratio is None or ratio > best_ratio):
            best_ratio = ratio
            best_answer = stored_answer

    if best_ratio is None:
        return "I'm sorry, I don't have information on that. Please consult a doctor."
    return best_answer


In [143]:
%pip install sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [144]:
from sentence_transformers import SentenceTransformer

# Load a lightweight sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")


In [145]:
def get_semantic_answer(user_question, n=3, threshold=0.6):
    """Return up to ``n`` stored Q/A rows ranked by embedding similarity.

    The query is encoded with the module-level ``model`` and compared with
    ``question_embeddings`` by cosine similarity. ``question_embeddings`` is
    assigned in a later cell and must exist before this function is called.

    Parameters
    ----------
    user_question : str
        Question text; encoded as given, without preprocessing.
    n : int, default 3
        Maximum number of results. Values <= 0 give an empty list when the
        best match passes the threshold.
    threshold : float, default 0.6
        Minimum similarity the best match must reach.

    Returns
    -------
    list of dict
        Dicts with keys ``question``, ``answer`` and ``similarity``. When the
        best similarity is below ``threshold`` the list holds a single
        fallback entry whose ``question`` is ``None``.
    """
    user_embedding = model.encode([user_question])
    similarities = cosine_similarity(user_embedding, question_embeddings).flatten()

    # Take the best score from the full array so it does not depend on n.
    top_score = float(similarities.max())

    # Fallback response if match is too weak
    if top_score < threshold:
        return [{
            "question": None,
            "answer": "I'm sorry, I don't have information on that. Please consult a doctor.",
            "similarity": top_score
        }]

    if n <= 0:
        # A slice of [-0:] would select every row rather than none.
        return []

    # Return top answers, most similar first
    top_indices = similarities.argsort()[-n:][::-1]
    return [
        {
            "question": cancer_df.iloc[idx]['question'],
            "answer": cancer_df.iloc[idx]['answer'],
            "similarity": float(similarities[idx]),
        }
        for idx in top_indices
    ]

In [146]:
from sklearn.metrics.pairwise import cosine_similarity


In [147]:
# Encode every stored question with the sentence-embedding model. This runs
# unconditionally each time the cell executes (there is no cache or
# "already done" check).
# NOTE(review): by this point cancer_df['question'] holds preprocess_text
# output, while get_semantic_answer encodes the user's query as typed --
# confirm this mismatch is intended.
question_embeddings = model.encode(cancer_df['question'].tolist())

# Query about a topic other than cancer; the captured output shows the fallback message.
for item in get_semantic_answer("Signs of pregnancy?"):
    print(item["answer"])

I'm sorry, I don't have information on that. Please consult a doctor.


In [148]:
#Example usage

for item in get_semantic_answer("Signs of throat cancer?"):
    print(item["answer"])

sign symptom laryngeal cancer include sore throat ear pain sign symptom may caused laryngeal cancer condition check doctor following sore throat cough go away trouble pain swallowing ear pain lump neck throat change hoarseness voice
sign symptom oropharyngeal cancer include lump neck sore throat sign symptom may caused oropharyngeal cancer condition check doctor following sore throat go away trouble swallowing trouble opening mouth fully trouble moving tongue weight loss known reason ear pain lump back mouth throat neck white patch tongue lining mouth go away coughing blood sometimes oropharyngeal cancer cause early sign symptom
sign salivary gland cancer include lump trouble swallowing salivary gland cancer may cause symptom may found regular dental check physical exam sign symptom may caused salivary gland cancer condition check doctor following lump usually painless area ear cheek jaw lip inside mouth fluid draining ear trouble swallowing opening mouth widely numbness weakness face 

In [149]:
#  Model Validation with Top-k Accuracy

In [150]:
# Hand-written validation set: each entry pairs a natural-language query with a
# text fragment that a correct answer is expected to contain.
# NOTE(review): the evaluation cell checks `expected_answer` as an exact
# (lowercased) substring of the stored answers, and those answers were rewritten
# by preprocess_text (stop words removed, tokens lemmatized). Fragments that
# keep words such as "by"/"in" or plural forms such as "cells" presumably
# cannot match -- verify each fragment against the actual stored text.
validation_data = [
    {
        "query": "What are symptoms of prostate cancer?",
        "expected_answer": "sign prostate cancer include weak flow urine frequent urination"
    },
    {
        "query": "How is breast cancer treated?",
        "expected_answer": "treatment breast cancer include surgery radiation therapy chemotherapy"
    },
    {
        "query": "What causes leukemia?",
        "expected_answer": "leukemia caused by abnormal blood cells developing in bone marrow"
    },
]



In [151]:
# Evaluation of Top-k Accuracy
# A query counts as correct when its expected fragment appears (case-insensitive
# substring match) in at least one of the top-k retrieved answers.
correct = 0
total = len(validation_data)
k = 3  # Top-k

for item in validation_data:
    query = item["query"]
    expected = item["expected_answer"]

    # threshold=0.0 effectively disables the weak-match fallback so the
    # ranked answers themselves are inspected.
    top_results = get_semantic_answer(query, n=k, threshold=0.0)

    found = any(expected.lower() in result["answer"].lower() for result in top_results)

    if found:
        correct += 1

# Guard against an empty validation set (would otherwise raise ZeroDivisionError).
accuracy = correct / total if total else 0.0
print(f"Top-{k} Accuracy: {accuracy:.2%}")


Top-3 Accuracy: 33.33%


In [152]:
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
# train_test_split was used below without ever being imported (NameError).
from sklearn.model_selection import train_test_split

# Prepare train_examples from cancer_df: one (question, answer) text pair per row
train_examples = [
    InputExample(texts=[question_text, answer_text])
    for question_text, answer_text in zip(cancer_df['question'], cancer_df['answer'])
]

# Split into train and test sets (80/20, fixed seed so the split is reproducible)
train_data, test_data = train_test_split(train_examples, test_size=0.2, random_state=42)
# Only the training loader shuffles; the test loader keeps a stable order.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
test_dataloader = DataLoader(test_data, batch_size=16)

In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl (30 kB)
Installing collected packages: xxhash, multiprocess, datasets
Successfully installed datasets-4.0.0 multiprocess-0.70.16 xxhash-3.5.0


In [None]:
from datasets import Dataset
