## DATA UNDERSTANDING

In [1]:
#Importing necessary libraries

import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
import xml.etree.ElementTree as ET
warnings.filterwarnings('ignore')
import os



In [2]:
#Creating a class to load and process cancer-related Q&A data from XML files

class CancerQALoader:
    def __init__(self, filepath):
        self.filepath = filepath
        self.questions = []
        self.answers = []
        self.root = None
        self.source = os.path.splitext(os.path.basename(filepath))[0] 

    def parse_xml(self):
        try:
            tree = ET.parse(self.filepath)
            self.root = tree.getroot()
        except ET.ParseError as e:
            print(f"Error parsing XML in {self.filepath}: {e}")
        except FileNotFoundError:
            print(f"File not found: {self.filepath}")

    def extract_qa_pairs(self):
        if self.root is None:
            return

        for qa_pair in self.root.findall('.//QAPair'):
            question = qa_pair.find('Question').text
            answer = qa_pair.find('Answer').text
            if question and answer:
                self.questions.append(question)
                self.answers.append(answer)

    def get_dataframe(self):
        return pd.DataFrame({
            'question': self.questions,
            'answer': self.answers,
            'source': [self.source] * len(self.questions)  # Add source to each row
        })

    def load_all_qa_from_folder(folder_path):
        all_dfs = []

        for filename in os.listdir(folder_path):
            if filename.endswith(".xml"):
                full_path = os.path.join(folder_path, filename)
                loader = CancerQALoader(full_path)
                loader.parse_xml()
                loader.extract_qa_pairs()
                df = loader.get_dataframe()
                all_dfs.append(df)

        return pd.concat(all_dfs, ignore_index=True)

In [4]:
#We used the class to load all Q&A pairs from XML files in a specified folder
folder = ('Cancer_NLP_project/1_CancerGov_QA' ) 
cancer_df = CancerQALoader.load_all_qa_from_folder(folder)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'Cancer_NLP_project/1_CancerGov_QA'

In [None]:
#Displaying the number of files loaded into the cancer_df DataFrame
print(f"Reading {cancer_df['source'].nunique()} files")

Reading 116 files


In [None]:
#Displaying the first 20 rows of the DataFrame
cancer_df.head(20)

Unnamed: 0,question,answer,source
0,What is (are) Childhood Liver Cancer ?,Key Points\n - Childhood li...,0000007_3
1,Who is at risk for Childhood Liver Cancer? ?,Certain diseases and disorders can increase th...,0000007_3
2,What are the symptoms of Childhood Liver Cancer ?,Signs and symptoms of childhood liver cancer i...,0000007_3
3,How to diagnose Childhood Liver Cancer ?,Tests that examine the liver and the blood are...,0000007_3
4,What is the outlook for Childhood Liver Cancer ?,Certain factors affect prognosis (chance of re...,0000007_3
5,What are the stages of Childhood Liver Cancer ?,Key Points\n - After childh...,0000007_3
6,What are the treatments for Childhood Liver Ca...,Key Points\n - There are di...,0000007_3
7,what research (or clinical trials) is being do...,New types of treatment are being tested in cli...,0000007_3
8,What is (are) Chronic Myeloproliferative Neopl...,Key Points\n - Myeloprolife...,0000013_2
9,How to diagnose Chronic Myeloproliferative Neo...,Tests that examine the blood and bone marrow a...,0000013_2


In [None]:
#Checking the shape of the DataFrame to see how many rows and columns it contains
cancer_df.shape

(729, 3)

In [None]:
#Checking the information about the DataFrame, including data types and non-null counts
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  729 non-null    object
 1   answer    729 non-null    object
 2   source    729 non-null    object
dtypes: object(3)
memory usage: 17.2+ KB


In [None]:
#Checking the number of missing values in each column of the DataFrame
cancer_df.isnull().sum()

question    0
answer      0
source      0
dtype: int64

In [None]:
#Checking the number of rows in the DataFrame to see how many Q&A pairs were loaded
len(cancer_df)

729

In [None]:
#Checking the data type of the DataFrame to confirm it is a pandas DataFrame
type(cancer_df)

pandas.core.frame.DataFrame

In [None]:
# Check how many duplicate questions exist
duplicate_questions = cancer_df[cancer_df.duplicated(subset='question', keep=False)]
print(f"Total duplicate questions: {duplicate_questions.shape[0]}")
duplicate_questions


Total duplicate questions: 78


Unnamed: 0,question,answer,source
20,What is (are) Endometrial Cancer ?,Key Points\n - Endometrial ...,0000014_4
21,Who is at risk for Endometrial Cancer? ?,Health history and certain medicines can affec...,0000014_4
26,What is (are) Breast Cancer ?,Key Points\n - Breast cance...,0000027_4
27,How to prevent Breast Cancer ?,Key Points\n - Avoiding ris...,0000027_4
28,Who is at risk for Breast Cancer? ?,Key Points\n - Avoiding ris...,0000027_4
...,...,...,...
609,Who is at risk for Neuroblastoma? ?,Key Points\n - Screening te...,0000031_2
625,What is (are) Liver (Hepatocellular) Cancer ?,Key Points\n - Liver cancer...,0000007_4
626,Who is at risk for Liver (Hepatocellular) Canc...,Being infected with certain types of the hepat...,0000007_4
629,What is (are) Colorectal Cancer ?,Key Points\n - Colorectal c...,0000037_3


In [None]:
#Removing duplicate questions from the DataFrame while keeping the first occurrence
cancer_df=cancer_df.drop_duplicates(subset='question', keep='first').reset_index(drop=True)

In [None]:
#Checking the shape of the DataFrame after removing duplicates to see how many rows remain
cancer_df.shape

(683, 3)

## EXPLORATORY DATA ANALYSIS

In [None]:
#Downloading necessary NLTK resources for text processing
nltk.download('wordnet')      
nltk.download('omw-1.4')      
nltk.download('punkt')        
nltk.download('stopwords')    


[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#Defining a function to clean and preprocess the text data in the DataFrame

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Remove HTML tags if any
    text = re.sub(r"<.*?>", "", text)
    # Lowercase and remove special characters
    text = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    # Tokenize words
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)


In [None]:
#remove spaces and new lines
cancer_df['question'] = cancer_df['question'].str.strip().replace('\n', ' ')
cancer_df['answer'] = cancer_df['answer'].str.strip().replace('\n', ' ')


# Preprocess the questions and answers
cancer_df['question'] = cancer_df['question'].apply(clean_text)
cancer_df['answer'] = cancer_df['answer'].apply(clean_text)


In [None]:
#Checking the number of unique answers in the DataFrame
print(f"Number of unique answers: {cancer_df['answer'].nunique()}")
print(cancer_df['answer'].value_counts().head(5))


Number of unique answers: 655
answer
new type treatment tested clinical trial information clinical trial available nci website patient may want think taking part clinical trial patient taking part clinical trial may best treatment choice clinical trial part cancer research process clinical trial done find new cancer treatment safe effective better standard treatment many today standard treatment cancer based earlier clinical trial patient take part clinical trial may receive standard treatment among first receive new treatment patient take part clinical trial also help improve way cancer treated future even clinical trial lead effective new treatment often answer important question help move research forward patient enter clinical trial starting cancer treatment clinical trial include patient yet received treatment trial test treatment patient whose cancer gotten better also clinical trial test new way stop cancer recurring coming back reduce side effect cancer treatment clinical trial

In [None]:
# Vectorization for text data using TF-IDF 
vectorizer = TfidfVectorizer(preprocessor=clean_text, stop_words='english')
X = vectorizer.fit_transform(cancer_df['question'])
y = cancer_df['answer']

In [None]:
# Function to get the top N answers based on cosine similarity of the user's question with the dataset questions

def get_top_n_answers(user_question, n=3, threshold=0.6):
    user_vec = vectorizer.transform([user_question])
    similarities = cosine_similarity(user_vec, X).flatten()
    top_indices = similarities.argsort()[-n:][::-1]
    top_score = similarities[top_indices[0]]

    if top_score < threshold:
        return [{
            "question": None,
            "answer": "I'm sorry, I don't have information on that. Please consult a doctor.",
            "similarity": float(top_score)
        }]
    
    results = []
    for idx in top_indices:
        results.append({
            "question": cancer_df.iloc[idx]['question'],
            "answer": cancer_df.iloc[idx]['answer'],
            "similarity": float(similarities[idx])
        })
    return results


In [None]:
for item in get_top_n_answers("What are the symptoms of throat cancer?"):
    print(f"Q: {item['question']}\nA: {item['answer']}")


Q: symptom breast cancer
A: sign breast cancer include lump change breast sign may caused breast cancer condition check doctor following lump thickening near breast underarm area change size shape breast dimple puckering skin breast nipple turned inward breast fluid breast milk nipple especially bloody scaly red swollen skin breast nipple areola dark area skin around nipple dimple breast look like skin orange called peau dorange may difficult detect find breast cancer early pregnant nursing woman breast usually get larger tender lumpy woman pregnant nursing given birth occurs normal hormone change take place pregnancy change make small lump difficult detect breast may also become denser difficult detect breast cancer woman dense breast using mammography breast change delay diagnosis breast cancer often found later stage woman
Q: symptom pancreatic cancer
A: sign symptom pancreatic cancer include jaundice pain weight loss pancreatic cancer may cause early sign symptom sign symptom may c

# TUNING THE MODEL



In [None]:
# Using the sequence matcher to find the closest match to a user's question.

In [None]:
# Function to get a response based on user input with a similarity threshold
from difflib import SequenceMatcher

def get_response(user_input, threshold=0.7):
    processed_input = clean_text(user_input)
    for index, row in cancer_df.iterrows():
        question = clean_text(row['question'])
        similarity = SequenceMatcher(None, processed_input, question).ratio()
        if similarity >= threshold:
            return row['answer']
    return "I'm sorry, I don't have information on that. Please consult a doctor."


In [None]:
# %pip install sentence-transformers


In [None]:
# Importing the SentenceTransformer model for generating sentence embeddings 

from sentence_transformers import SentenceTransformer

# Load a lightweight sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Generating embeddings for the questions in the DataFrame
# Embeddings are used to capture the semantic meaning of the questions
def get_semantic_answer(user_question, n=3, threshold=0.6):
    user_embedding = model.encode([user_question])
    similarities = cosine_similarity(user_embedding, question_embeddings).flatten()
    
    top_indices = similarities.argsort()[-n:][::-1]
    top_score = similarities[top_indices[0]]

    # Fallback response if match is too weak
    if top_score < threshold:
        return [{
            "question": None,
            "answer": "I'm sorry, I don't have information on that. Please consult a doctor.",
            "similarity": float(top_score)
        }]

    # Return top answers
    results = []
    for idx in top_indices:
        results.append({
            "question": cancer_df.iloc[idx]['question'],
            "answer": cancer_df.iloc[idx]['answer'],
            "similarity": float(similarities[idx])
        })
    return results


In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Compute embeddings for all questions 
question_embeddings = model.encode(cancer_df['question'].tolist())

for item in get_semantic_answer("Signs of pregnancy?"):
    print(item["answer"])


I'm sorry, I don't have information on that. Please consult a doctor.


In [None]:
for item in get_semantic_answer("Signs of throat cancer?"):
    print(item["answer"])

sign symptom laryngeal cancer include sore throat ear pain sign symptom may caused laryngeal cancer condition check doctor following sore throat cough go away trouble pain swallowing ear pain lump neck throat change hoarseness voice
sign symptom oropharyngeal cancer include lump neck sore throat sign symptom may caused oropharyngeal cancer condition check doctor following sore throat go away trouble swallowing trouble opening mouth fully trouble moving tongue weight loss known reason ear pain lump back mouth throat neck white patch tongue lining mouth go away coughing blood sometimes oropharyngeal cancer cause early sign symptom
sign salivary gland cancer include lump trouble swallowing salivary gland cancer may cause symptom may found regular dental checkup physical exam sign symptom may caused salivary gland cancer condition check doctor following lump usually painless area ear cheek jaw lip inside mouth fluid draining ear trouble swallowing opening mouth widely numbness weakness fac

### Evaluating the model

In [None]:
validation_data = [
    {
        "query": "What are symptoms of prostate cancer?",
        "expected_answer": "sign prostate cancer include weak flow urine frequent urination"
    },
    {
        "query": "How is breast cancer treated?",
        "expected_answer": "treatment breast cancer include surgery radiation therapy chemotherapy"
    },
    {
        "query": "What causes leukemia?",
        "expected_answer": "leukemia caused by abnormal blood cells developing in bone marrow"
    },
]



In [None]:
# Evaluating the model's accuracy on a validation set

correct = 0
total = len(validation_data)
k = 3  # Top-k

for item in validation_data:
    query = item["query"]
    expected = item["expected_answer"]

    top_results = get_semantic_answer(query, n=k, threshold=0.0)

    found = any(expected.lower() in result["answer"].lower() for result in top_results)

    if found:
        correct += 1

accuracy = correct / total
print(f"Top-{k} Accuracy: {accuracy:.2%}")


Top-3 Accuracy: 33.33%


The accuracy score of 33.33% is very low. We need to finetune the model to get better accuracy score.