## DATA UNDERSTANDING

In [1]:
import os
import re
import warnings
import xml.etree.ElementTree as ET

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')


In [2]:
"""
#Cell already run
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
"""

"\n#Cell already run\nnltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [2]:
class CancerQALoader:
    """Load question/answer pairs from QA XML files into pandas DataFrames.

    One instance handles one XML file: ``parse_xml`` reads it,
    ``extract_qa_pairs`` collects the text of every ``QAPair`` element and
    ``get_dataframe`` exposes the collected pairs. ``load_all_qa_from_folder``
    runs that pipeline over every ``.xml`` file in a directory.
    """

    def __init__(self, filepath):
        """Remember which file to load and start with empty results.

        Args:
            filepath: Path to a single QA XML file. Nothing is read until
                ``parse_xml`` is called.
        """
        self.filepath = filepath
        self.questions = []  # question texts, index-aligned with ``answers``
        self.answers = []
        self.root = None  # parsed XML root element; stays None if parsing fails
        # File name without directory or extension, used to tag every row.
        self.source = os.path.splitext(os.path.basename(filepath))[0]

    def parse_xml(self):
        """Parse ``self.filepath`` and store the root element in ``self.root``.

        Malformed XML and a missing file are reported with ``print`` rather
        than raised, so one bad file does not abort a folder-wide load. In
        both cases ``self.root`` is left unchanged.
        """
        try:
            tree = ET.parse(self.filepath)
            self.root = tree.getroot()
        except ET.ParseError as e:
            print(f"Error parsing XML in {self.filepath}: {e}")
        except FileNotFoundError:
            print(f"File not found: {self.filepath}")

    def extract_qa_pairs(self):
        """Append the text of every complete ``QAPair`` to the instance lists.

        Does nothing when no document has been parsed. A pair is skipped when
        its ``Question`` or ``Answer`` child is missing or has no text.
        """
        if self.root is None:
            return

        for qa_pair in self.root.findall('.//QAPair'):
            # findtext yields None for a missing child instead of raising
            # AttributeError the way ``find(...).text`` would.
            question = qa_pair.findtext('Question')
            answer = qa_pair.findtext('Answer')
            if question and answer:
                self.questions.append(question)
                self.answers.append(answer)

    def get_dataframe(self):
        """Return the collected pairs as a DataFrame.

        Returns:
            A DataFrame with columns ``question``, ``answer`` and ``source``,
            one row per extracted pair; ``source`` repeats this file's stem.
        """
        return pd.DataFrame({
            'question': self.questions,
            'answer': self.answers,
            'source': [self.source] * len(self.questions)  # Add source to each row
        })

    @staticmethod
    def load_all_qa_from_folder(folder_path):
        """Load every ``.xml`` file in ``folder_path`` into one DataFrame.

        Args:
            folder_path: Directory containing the QA XML files. Entries that
                do not end in ``.xml`` are ignored.

        Returns:
            A DataFrame with columns ``question``, ``answer`` and ``source``
            and a fresh 0..n-1 index. The frame is empty (but keeps those
            columns) when no file yields any pair.
        """
        all_dfs = []

        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order, and anything seeded on it downstream, reproducible.
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".xml"):
                continue
            loader = CancerQALoader(os.path.join(folder_path, filename))
            loader.parse_xml()
            loader.extract_qa_pairs()
            df = loader.get_dataframe()
            # Files that failed to parse or held no pairs contribute nothing.
            if not df.empty:
                all_dfs.append(df)

        if not all_dfs:
            # pd.concat raises ValueError on an empty list, so hand back an
            # empty frame with the expected columns instead.
            return pd.DataFrame(columns=['question', 'answer', 'source'])

        return pd.concat(all_dfs, ignore_index=True)

In [3]:
# Directory holding the QA XML files. Set the CANCER_QA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original author's local path is used.
folder = os.environ.get(
    "CANCER_QA_DIR",
    "/home/user/Documents/Flatiron/Phase_4_Project/1_CancerGov_QA",
)
cancer_df = CancerQALoader.load_all_qa_from_folder(folder)

In [4]:
# `source` is the XML file stem, so this counts the files that contributed at
# least one QA pair; files that failed to parse or held no pairs add no rows.
print(f"Reading {cancer_df['source'].nunique()} files")

Reading 116 files


In [5]:
# Preview the first rows: question text, answer text and originating file stem.
cancer_df.head(20)

Unnamed: 0,question,answer,source
0,What is (are) Childhood Liver Cancer ?,Key Points\n - Childhood li...,0000007_3
1,Who is at risk for Childhood Liver Cancer? ?,Certain diseases and disorders can increase th...,0000007_3
2,What are the symptoms of Childhood Liver Cancer ?,Signs and symptoms of childhood liver cancer i...,0000007_3
3,How to diagnose Childhood Liver Cancer ?,Tests that examine the liver and the blood are...,0000007_3
4,What is the outlook for Childhood Liver Cancer ?,Certain factors affect prognosis (chance of re...,0000007_3
5,What are the stages of Childhood Liver Cancer ?,Key Points\n - After childh...,0000007_3
6,What are the treatments for Childhood Liver Ca...,Key Points\n - There are di...,0000007_3
7,what research (or clinical trials) is being do...,New types of treatment are being tested in cli...,0000007_3
8,What is (are) Chronic Myeloproliferative Neopl...,Key Points\n - Myeloprolife...,0000013_2
9,How to diagnose Chronic Myeloproliferative Neo...,Tests that examine the blood and bone marrow a...,0000013_2


In [6]:
# Column dtypes and non-null counts for the combined frame.
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  729 non-null    object
 1   answer    729 non-null    object
 2   source    729 non-null    object
dtypes: object(3)
memory usage: 17.2+ KB


In [7]:
# Per-column null count. The loader only keeps pairs whose question and answer
# text are both non-empty, so no nulls are expected here.
cancer_df.isnull().sum()

question    0
answer      0
source      0
dtype: int64

In [8]:
# Total number of question/answer pairs loaded.
len(cancer_df)

729

In [9]:
# Confirm the loader returned a pandas DataFrame.
type(cancer_df)

pandas.core.frame.DataFrame

## EXPLORATORY DATA ANALYSIS

In [10]:
# Questions are the inputs (x) and their answers are the targets (y).
x = cancer_df['question']
y = cancer_df['answer']

# Hold out 20% of the pairs for testing; the fixed random_state makes the
# split repeatable for a given row order.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)