## DATA UNDERSTANDING

In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
import xml.etree.ElementTree as ET
warnings.filterwarnings('ignore')
import os


In [17]:
"""
#Cell already run
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
"""

"\n#Cell already run\nnltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [18]:
class CancerQALoader:
    """Load question/answer pairs from QA XML files into pandas DataFrames.

    One instance handles one XML file. ``load_all_qa_from_folder`` runs the
    parse/extract pipeline over every ``.xml`` file in a folder and returns
    the combined result.
    """

    def __init__(self, filepath):
        """Store ``filepath`` and initialise empty result containers.

        Args:
            filepath: Path of the XML file this loader will read.
        """
        self.filepath = filepath
        self.questions = []
        self.answers = []
        self.root = None  # populated by parse_xml() on success
        # File name without directory or extension; tags every extracted row.
        self.source = os.path.splitext(os.path.basename(filepath))[0]

    def parse_xml(self):
        """Parse ``self.filepath`` and keep the root element in ``self.root``.

        Malformed XML and missing files are reported with ``print`` and leave
        ``self.root`` as ``None``, so one bad file does not abort a batch load.
        """
        try:
            tree = ET.parse(self.filepath)
            self.root = tree.getroot()
        except ET.ParseError as e:
            print(f"Error parsing XML in {self.filepath}: {e}")
        except FileNotFoundError:
            print(f"File not found: {self.filepath}")

    def extract_qa_pairs(self):
        """Collect every complete ``QAPair`` under the parsed root.

        Does nothing when the file has not been parsed successfully. Pairs
        with a missing or empty ``Question`` or ``Answer`` are skipped.
        """
        if self.root is None:
            return

        for qa_pair in self.root.findall('.//QAPair'):
            # findtext() yields None when the child element is absent, whereas
            # find(...).text would raise AttributeError on such a pair.
            question = qa_pair.findtext('Question')
            answer = qa_pair.findtext('Answer')
            if question and answer:
                self.questions.append(question)
                self.answers.append(answer)

    def get_dataframe(self):
        """Return the extracted pairs as a DataFrame.

        Returns:
            DataFrame with columns ``question``, ``answer`` and ``source``
            (``source`` repeats this loader's file stem on every row).
        """
        return pd.DataFrame({
            'question': self.questions,
            'answer': self.answers,
            'source': [self.source] * len(self.questions)  # Add source to each row
        })

    @staticmethod
    def load_all_qa_from_folder(folder_path):
        """Load the QA pairs of every ``.xml`` file in ``folder_path``.

        Args:
            folder_path: Directory to scan (not recursive).

        Returns:
            One DataFrame with columns ``question``, ``answer`` and ``source``
            and a fresh 0..n-1 index. It is empty, but has the same columns,
            when no file yields any pair.
        """
        all_dfs = []

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # row order stable between runs and machines.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith(".xml"):
                full_path = os.path.join(folder_path, filename)
                loader = CancerQALoader(full_path)
                loader.parse_xml()
                loader.extract_qa_pairs()
                df = loader.get_dataframe()
                # Files that failed to parse or hold no pairs add no rows.
                if not df.empty:
                    all_dfs.append(df)

        # pd.concat() raises ValueError on an empty list.
        if not all_dfs:
            return pd.DataFrame(columns=['question', 'answer', 'source'])

        return pd.concat(all_dfs, ignore_index=True)

In [19]:
# Folder (relative to the notebook's working directory) holding the QA XML files.
folder = "1_CancerGov_QA"
# Parse every .xml file in the folder into one question/answer/source DataFrame.
cancer_df = CancerQALoader.load_all_qa_from_folder(folder)

In [20]:
# Each row's 'source' is the stem of the XML file it came from, so the number
# of distinct sources is the number of files that contributed at least one row.
print(f"Reading {cancer_df['source'].nunique()} files")

Reading 116 files


In [21]:
# Preview the first 20 rows of the combined QA table.
cancer_df.head(20)

Unnamed: 0,question,answer,source
0,What is (are) Childhood Liver Cancer ?,Key Points\n - Childhood li...,0000007_3
1,Who is at risk for Childhood Liver Cancer? ?,Certain diseases and disorders can increase th...,0000007_3
2,What are the symptoms of Childhood Liver Cancer ?,Signs and symptoms of childhood liver cancer i...,0000007_3
3,How to diagnose Childhood Liver Cancer ?,Tests that examine the liver and the blood are...,0000007_3
4,What is the outlook for Childhood Liver Cancer ?,Certain factors affect prognosis (chance of re...,0000007_3
5,What are the stages of Childhood Liver Cancer ?,Key Points\n - After childh...,0000007_3
6,What are the treatments for Childhood Liver Ca...,Key Points\n - There are di...,0000007_3
7,what research (or clinical trials) is being do...,New types of treatment are being tested in cli...,0000007_3
8,What is (are) Chronic Myeloproliferative Neopl...,Key Points\n - Myeloprolife...,0000013_2
9,How to diagnose Chronic Myeloproliferative Neo...,Tests that examine the blood and bone marrow a...,0000013_2


In [22]:
# Column dtypes, non-null counts and memory footprint.
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  729 non-null    object
 1   answer    729 non-null    object
 2   source    729 non-null    object
dtypes: object(3)
memory usage: 17.2+ KB


In [23]:
# Count missing values per column.
cancer_df.isnull().sum()

question    0
answer      0
source      0
dtype: int64

In [9]:
# Total number of question/answer rows loaded.
len(cancer_df)

729

In [24]:
# Confirm the loader returned a pandas DataFrame.
type(cancer_df)

pandas.core.frame.DataFrame

## EXPLORATORY DATA ANALYSIS

In [25]:

# Fetch the NLTK resources used by the preprocessing cell below.
nltk.download('wordnet')      # WordNet data for WordNetLemmatizer
nltk.download('omw-1.4')      # presumably a WordNet companion corpus needed on some NLTK versions; confirm
nltk.download('punkt')        # tokenizer models for word_tokenize
nltk.download('stopwords')    # word lists read by stopwords.words('english')


[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of lemmatised tokens.

    Steps: replace every non-word character with a space, lowercase, tokenise,
    drop English stopwords, lemmatise what remains.

    Args:
        text: Input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Substitute non-word characters first, then lowercase the result.
    normalised = re.sub(r'\W', ' ', text).lower()

    lemmas = []
    for token in word_tokenize(normalised):
        if token in stop_words:
            continue  # stopwords are dropped before lemmatisation
        lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmas)


In [30]:
# Trim surrounding whitespace and turn embedded newlines into spaces.
# `.str.replace` substitutes inside each string. Plain `Series.replace('\n', ' ')`
# only swaps cells whose entire value equals '\n', so it never touched the text.
cancer_df['question'] = cancer_df['question'].str.strip().str.replace('\n', ' ', regex=False)
cancer_df['answer'] = cancer_df['answer'].str.strip().str.replace('\n', ' ', regex=False)


# Preprocess the questions and answers.
# NOTE: both columns are overwritten in place, so re-running this cell applies
# preprocess_text again to already-preprocessed text.
cancer_df['question'] = cancer_df['question'].apply(preprocess_text)
cancer_df['answer'] = cancer_df['answer'].apply(preprocess_text)


In [31]:
# Vectorization
# Targets: the answer text stored for each question row.
y = cancer_df['answer']

# TF-IDF over the question text. The vectorizer runs preprocess_text on every
# document it sees, both here and on queries passed to transform().
vectorizer = TfidfVectorizer(
    preprocessor=preprocess_text,
    stop_words='english',
)
X = vectorizer.fit_transform(cancer_df['question'])

In [32]:
# train-test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [33]:
# How many distinct answer texts exist, and which answers repeat most often
# (value_counts sorts by frequency, so head(5) shows the five most common).
print(f"Number of unique answers: {cancer_df['answer'].nunique()}")
print(cancer_df['answer'].value_counts().head(5))


Number of unique answers: 701
answer
new type treatment tested clinical trial information clinical trial available nci website patient may want think taking part clinical trial patient taking part clinical trial may best treatment choice clinical trial part cancer research process clinical trial done find new cancer treatment safe effective better standard treatment many today standard treatment cancer based earlier clinical trial patient take part clinical trial may receive standard treatment among first receive new treatment patient take part clinical trial also help improve way cancer treated future even clinical trial lead effective new treatment often answer important question help move research forward patient enter clinical trial starting cancer treatment clinical trial include patient yet received treatment trial test treatment patient whose cancer gotten better also clinical trial test new way stop cancer recurring coming back reduce side effect cancer treatment clinical trial

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
def get_answer(user_question):
    """Return the stored answer whose question best matches ``user_question``.

    The query is vectorised with the fitted module-level ``vectorizer`` and
    compared against the question matrix ``X`` by cosine similarity.

    Args:
        user_question: Free-text question.

    Returns:
        The ``answer`` value of the ``cancer_df`` row with the highest score.
    """
    query_vector = vectorizer.transform([user_question])
    scores = cosine_similarity(query_vector, X)
    # argmax over the (1, n_rows) score matrix gives the flat row position.
    best_row = cancer_df.iloc[scores.argmax()]
    return best_row['answer']

In [43]:
def get_top_n_answers(user_question, n=3):
    """Return the ``n`` stored QA pairs most similar to ``user_question``.

    The query is vectorised with the fitted module-level ``vectorizer`` and
    scored against the question matrix ``X`` by cosine similarity.

    Args:
        user_question: Free-text question.
        n: Maximum number of results to return. Values <= 0 give an empty list.

    Returns:
        A list of dicts with keys ``"question"``, ``"answer"`` and
        ``"similarity"``, ordered from most to least similar. Question and
        answer are the values stored in ``cancer_df``. The list is shorter
        than ``n`` when fewer rows exist.
    """
    # Guard first: for n == 0 the slice [-0:] equals [0:] and would return
    # every row, and a negative n would produce an equally meaningless slice.
    if n <= 0:
        return []

    user_vec = vectorizer.transform([user_question])
    similarities = cosine_similarity(user_vec, X).flatten()
    # argsort is ascending: keep the last n positions, then reverse them so
    # the best match comes first.
    top_indices = similarities.argsort()[-n:][::-1]

    return [
        {
            "question": cancer_df.iloc[idx]['question'],
            "answer": cancer_df.iloc[idx]['answer'],
            "similarity": similarities[idx],
        }
        for idx in top_indices
    ]


In [None]:
from sklearn.preprocessing import StandardScaler
