## DATA UNDERSTANDING

In [25]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
import xml.etree.ElementTree as ET
warnings.filterwarnings('ignore')
import os


In [26]:
"""
#Cell already run
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
"""

"\n#Cell already run\nnltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [27]:
class CancerQALoader:
    def __init__(self, filepath):
        self.filepath = filepath
        self.questions = []
        self.answers = []
        self.root = None
        self.source = os.path.splitext(os.path.basename(filepath))[0] 

    def parse_xml(self):
        try:
            tree = ET.parse(self.filepath)
            self.root = tree.getroot()
        except ET.ParseError as e:
            print(f"Error parsing XML in {self.filepath}: {e}")
        except FileNotFoundError:
            print(f"File not found: {self.filepath}")

    def extract_qa_pairs(self):
        if self.root is None:
            return

        for qa_pair in self.root.findall('.//QAPair'):
            question = qa_pair.find('Question').text
            answer = qa_pair.find('Answer').text
            if question and answer:
                self.questions.append(question)
                self.answers.append(answer)

    def get_dataframe(self):
        return pd.DataFrame({
            'question': self.questions,
            'answer': self.answers,
            'source': [self.source] * len(self.questions)  # Add source to each row
        })

    def load_all_qa_from_folder(folder_path):
        all_dfs = []

        for filename in os.listdir(folder_path):
            if filename.endswith(".xml"):
                full_path = os.path.join(folder_path, filename)
                loader = CancerQALoader(full_path)
                loader.parse_xml()
                loader.extract_qa_pairs()
                df = loader.get_dataframe()
                all_dfs.append(df)

        return pd.concat(all_dfs, ignore_index=True)

In [28]:
folder = "1_CancerGov_QA"
cancer_df = CancerQALoader.load_all_qa_from_folder(folder)

In [29]:
print(f"Reading {cancer_df['source'].nunique()} files")

Reading 116 files


In [30]:
cancer_df.head(20)

Unnamed: 0,question,answer,source
0,What is (are) Adult Acute Lymphoblastic Leukem...,Key Points\n - Adult acute ...,0000001_1
1,What are the symptoms of Adult Acute Lymphobla...,"Signs and symptoms of adult ALL include fever,...",0000001_1
2,How to diagnose Adult Acute Lymphoblastic Leuk...,Tests that examine the blood and bone marrow a...,0000001_1
3,What is the outlook for Adult Acute Lymphoblas...,Certain factors affect prognosis (chance of re...,0000001_1
4,Who is at risk for Adult Acute Lymphoblastic L...,Previous chemotherapy and exposure to radiatio...,0000001_1
5,What are the stages of Adult Acute Lymphoblast...,Key Points\n - Once adult A...,0000001_1
6,What are the treatments for Adult Acute Lympho...,Key Points\n - There are di...,0000001_1
7,What is (are) Adult Acute Myeloid Leukemia ?,Key Points\n - Adult acute ...,0000001_2
8,Who is at risk for Adult Acute Myeloid Leukemi...,"Smoking, previous chemotherapy treatment, and ...",0000001_2
9,What are the symptoms of Adult Acute Myeloid L...,"Signs and symptoms of adult AML include fever,...",0000001_2


In [31]:
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  729 non-null    object
 1   answer    729 non-null    object
 2   source    729 non-null    object
dtypes: object(3)
memory usage: 17.2+ KB


In [32]:
cancer_df.isnull().sum()

question    0
answer      0
source      0
dtype: int64

In [33]:
len(cancer_df)

729

In [34]:
type(cancer_df)

pandas.core.frame.DataFrame

## EXPLORITARY DATA ANALYSIS

In [35]:

nltk.download('wordnet')      
nltk.download('omw-1.4')      
nltk.download('punkt')        
nltk.download('stopwords')    


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PapaDay\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\PapaDay\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PapaDay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PapaDay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatize and remove stopwords
    return ' '.join(tokens)


In [None]:
# Vectorization
vectorizer = TfidfVectorizer(preprocessor=preprocess_text, stop_words='english')
X = vectorizer.fit_transform(cancer_df['question'])
y = cancer_df['answer']

In [40]:
# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
print(f"Number of unique answers: {cancer_df['answer'].nunique()}")
print(cancer_df['answer'].value_counts().head(5))


Number of unique answers: 705
answer
New types of treatment are being tested in clinical trials.\n                    Information about clinical trials is available from the NCI website.\n                \n                \n                    Patients may want to think about taking part in a clinical trial.\n                    For some patients, taking part in a clinical trial may be the best treatment choice. Clinical trials are part of the cancer research process. Clinical trials are done to find out if new cancer treatments are safe and effective or better than the standard treatment.   Many of today's standard treatments for cancer are based on earlier clinical trials. Patients who take part in a clinical trial may receive the standard treatment or be among the first to receive a new treatment.   Patients who take part in clinical trials also help improve the way cancer will be treated in the future. Even when clinical trials do not lead to effective new treatments, they often an

In [None]:
# Training a model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predicting answers for the test set
y_pred = model.predict(X_test)

