***
# Model Deployment : Estimating Lung Cancer Probabilities From Demographic Factors And Behavioral Indicators

***
### John Pauline Pineda <br> <br> *August 17, 2024*
***

* [**1. Table of Contents**](#TOC)
    * [1.1 Data Background](#1.1)
    * [1.2 Data Description](#1.2)
    * [1.3 Data Quality Assessment](#1.3)
    * [1.4 Data Preprocessing](#1.4)
        * [1.4.1 Data Cleaning](#1.4.1)
        * [1.4.2 Missing Data Imputation](#1.4.2)
        * [1.4.3 Outlier Treatment](#1.4.3)
        * [1.4.4 Collinearity](#1.4.4)
        * [1.4.5 Shape Transformation](#1.4.5)
        * [1.4.6 Centering and Scaling](#1.4.6)
        * [1.4.7 Data Encoding](#1.4.7)
        * [1.4.8 Preprocessed Data Description](#1.4.8)
    * [1.5 Data Exploration](#1.5)
        * [1.5.1 Exploratory Data Analysis](#1.5.1)
        * [1.5.2 Hypothesis Testing](#1.5.2)
    * [1.6 Predictive Model Development](#1.6)
        * [1.6.1 Data Preprocessing Pipeline](#1.6.1)
        * [1.6.2 Model Testing](#1.6.2)
        * [1.6.3 Model Validation](#1.6.3)
    * [1.7 Consolidated Findings](#1.7)
* [**2. Summary**](#Summary)   
* [**3. References**](#References)

***

# 1. Table of Contents <a class="anchor" id="TOC"></a>

## 1.1. Data Background <a class="anchor" id="1.1"></a>

## 1.2. Data Description <a class="anchor" id="1.2"></a>

In [1]:
##################################
# Loading Python Libraries
##################################
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import itertools
%matplotlib inline
import shap

from operator import add,mul,truediv
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from scipy import stats

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import StackingClassifier

In [2]:
##################################
# Defining file paths
##################################
# The relative dataset folders are assembled with os.path.join
# instead of hard-coded backslashes, so that they resolve
# on POSIX systems as well as on Windows
DATASETS_ORIGINAL_PATH = os.path.join("datasets", "original")
DATASET_PREPROCESSED_PATH = os.path.join("datasets", "preprocessed")
DATASET_FINAL_PATH = os.path.join("datasets", "final")

In [3]:
##################################
# Reading the original lung cancer data
# from its CSV file into a dataframe
##################################
lung_cancer = pd.read_csv(
    filepath_or_buffer=os.path.join("..", DATASETS_ORIGINAL_PATH, "LungCancer.csv")
)

In [4]:
##################################
# Reporting the number of rows and columns
# contained in the dataset
##################################
print('Dataset Dimensions: ')
dataset_dimensions = lung_cancer.shape
display(dataset_dimensions)

Dataset Dimensions: 


(309, 16)

In [5]:
##################################
# Reporting each column name
# together with its data type
##################################
print('Column Names and Data Types:')
initial_column_dtypes = lung_cancer.dtypes
display(initial_column_dtypes)

Column Names and Data Types:


GENDER                   object
AGE                       int64
SMOKING                   int64
YELLOW_FINGERS            int64
ANXIETY                   int64
PEER_PRESSURE             int64
CHRONIC DISEASE           int64
FATIGUE                   int64
ALLERGY                   int64
WHEEZING                  int64
ALCOHOL CONSUMING         int64
COUGHING                  int64
SHORTNESS OF BREATH       int64
SWALLOWING DIFFICULTY     int64
CHEST PAIN                int64
LUNG_CANCER              object
dtype: object

In [6]:
##################################
# Previewing the first five rows
# of the dataset as loaded
##################################
lung_cancer.head(n=5)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [7]:
##################################
# Setting the levels of the dichotomous categorical variables
# to descriptive category labels
##################################
# GENDER and LUNG_CANCER are converted to ordered categoricals
# with an explicit level order
lung_cancer[['GENDER','LUNG_CANCER']] = lung_cancer[['GENDER','LUNG_CANCER']].astype('category')
lung_cancer['GENDER'] = lung_cancer['GENDER'].cat.set_categories(['F', 'M'], ordered=True)
lung_cancer['LUNG_CANCER'] = lung_cancer['LUNG_CANCER'].cat.set_categories(['NO', 'YES'], ordered=True)

# The columns at positions 2 to 14 hold the 1/2-coded indicators.
# They are recoded through their column labels so that each column
# is replaced outright; writing strings into the existing int64
# columns through .iloc relies on an implicit upcast that pandas
# has deprecated
dichotomous_columns = lung_cancer.columns[2:15].tolist()
lung_cancer[dichotomous_columns] = lung_cancer[dichotomous_columns].replace({1: 'Absent', 2: 'Present'})

In [8]:
##################################
# Reporting each column name
# together with its data type
# after setting the category levels
##################################
print('Column Names and Data Types:')
recoded_column_dtypes = lung_cancer.dtypes
display(recoded_column_dtypes)

Column Names and Data Types:


GENDER                   category
AGE                         int64
SMOKING                    object
YELLOW_FINGERS             object
ANXIETY                    object
PEER_PRESSURE              object
CHRONIC DISEASE            object
FATIGUE                    object
ALLERGY                    object
WHEEZING                   object
ALCOHOL CONSUMING          object
COUGHING                   object
SHORTNESS OF BREATH        object
SWALLOWING DIFFICULTY      object
CHEST PAIN                 object
LUNG_CANCER              category
dtype: object

In [9]:
##################################
# Previewing the first five rows
# of the dataset after recoding
##################################
lung_cancer.head(n=5)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,Absent,Present,Present,Absent,Absent,Present,Absent,Present,Present,Present,Present,Present,Present,YES
1,M,74,Present,Absent,Absent,Absent,Present,Present,Present,Absent,Absent,Absent,Present,Present,Present,YES
2,F,59,Absent,Absent,Absent,Present,Absent,Present,Absent,Present,Absent,Present,Present,Absent,Present,NO
3,M,63,Present,Present,Present,Absent,Absent,Absent,Absent,Absent,Present,Absent,Absent,Present,Present,NO
4,F,63,Absent,Present,Absent,Absent,Absent,Absent,Absent,Present,Absent,Present,Present,Absent,Absent,NO


In [10]:
##################################
# Summarizing the distribution
# of the numeric variables
##################################
print('Numeric Variable Summary:')
numeric_variable_summary = lung_cancer.describe(include='number')
display(numeric_variable_summary.T)

Numeric Variable Summary:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AGE,309.0,62.673139,8.210301,21.0,57.0,62.0,69.0,87.0


In [11]:
##################################
# Summarizing the levels and frequencies
# of the object variables
##################################
print('Object Variable Summary:')
object_variable_summary = lung_cancer.describe(include='object')
display(object_variable_summary.T)

Object Variable Summary:


Unnamed: 0,count,unique,top,freq
SMOKING,309,2,Present,174
YELLOW_FINGERS,309,2,Present,176
ANXIETY,309,2,Absent,155
PEER_PRESSURE,309,2,Present,155
CHRONIC DISEASE,309,2,Present,156
FATIGUE,309,2,Present,208
ALLERGY,309,2,Present,172
WHEEZING,309,2,Present,172
ALCOHOL CONSUMING,309,2,Present,172
COUGHING,309,2,Present,179


In [12]:
##################################
# Summarizing the levels and frequencies
# of the categorical variables
##################################
print('Categorical Variable Summary:')
categorical_variable_summary = lung_cancer.describe(include='category')
display(categorical_variable_summary.T)

Categorical Variable Summary:


Unnamed: 0,count,unique,top,freq
GENDER,309,2,M,162
LUNG_CANCER,309,2,YES,270


## 1.3. Data Quality Assessment <a class="anchor" id="1.3"></a>

In [13]:
##################################
# Tallying the rows that repeat
# an earlier row in the dataset
##################################
lung_cancer.duplicated(keep='first').sum()

33

In [14]:
##################################
# Collecting the data type
# of every column
##################################
data_type_list = lung_cancer.dtypes.tolist()

In [15]:
##################################
# Collecting the name
# of every column
##################################
variable_name_list = lung_cancer.columns.tolist()

In [16]:
##################################
# Repeating the total row count
# once for every column
##################################
row_count_list = [lung_cancer.shape[0]] * lung_cancer.shape[1]

In [17]:
##################################
# Collecting the count of missing values
# found in every column
##################################
null_count_list = lung_cancer.isna().sum().tolist()

In [18]:
##################################
# Collecting the count of non-missing values
# found in every column
##################################
non_null_count_list = lung_cancer.count(axis=0).tolist()

In [19]:
##################################
# Gathering the fill rate
# (proportion of non-missing data)
# for each column
##################################
# Materialized as a list rather than left as a one-shot map iterator,
# so the values are still available when a cell that uses them is re-run
fill_rate_list = list(map(truediv, non_null_count_list, row_count_list))

In [20]:
##################################
# Assembling the column-level
# data quality summary
##################################
# One record per column, in the same order as the headers below
column_quality_records = zip(variable_name_list,
                             data_type_list,
                             row_count_list,
                             non_null_count_list,
                             null_count_list,
                             fill_rate_list)
column_quality_headers = ['Column.Name',
                          'Column.Type',
                          'Row.Count',
                          'Non.Null.Count',
                          'Null.Count',
                          'Fill.Rate']
all_column_quality_summary = pd.DataFrame(column_quality_records,
                                          columns=column_quality_headers)
display(all_column_quality_summary)

Unnamed: 0,Column.Name,Column.Type,Row.Count,Non.Null.Count,Null.Count,Fill.Rate
0,GENDER,category,309,309,0,1.0
1,AGE,int64,309,309,0,1.0
2,SMOKING,object,309,309,0,1.0
3,YELLOW_FINGERS,object,309,309,0,1.0
4,ANXIETY,object,309,309,0,1.0
5,PEER_PRESSURE,object,309,309,0,1.0
6,CHRONIC DISEASE,object,309,309,0,1.0
7,FATIGUE,object,309,309,0,1.0
8,ALLERGY,object,309,309,0,1.0
9,WHEEZING,object,309,309,0,1.0


In [21]:
##################################
# Reporting how many columns
# have a Fill.Rate below 1.00
##################################
columns_with_missing_data = all_column_quality_summary.loc[all_column_quality_summary['Fill.Rate'] < 1]
print('Number of Columns with Missing Data:', str(len(columns_with_missing_data)))

Number of Columns with Missing Data: 0


In [22]:
##################################
# Extracting the summary entries
# of the columns with Fill.Rate < 1.00
##################################
low_fill_rate_mask = all_column_quality_summary['Fill.Rate'] < 1.00
column_low_fill_rate = all_column_quality_summary.loc[low_fill_rate_mask]

In [23]:
##################################
# Collecting the index label
# of every observation
##################################
row_metadata_list = lung_cancer.index.tolist()

In [24]:
##################################
# Repeating the total column count
# once for every observation
##################################
column_count_list = [lung_cancer.shape[1]] * lung_cancer.shape[0]

In [25]:
##################################
# Collecting the count of missing values
# found in every row
##################################
null_row_list = lung_cancer.isna().sum(axis=1).tolist()

In [26]:
##################################
# Gathering the missing data rate
# (proportion of missing values)
# for each row
##################################
# Materialized as a list rather than left as a one-shot map iterator,
# so the values are still available when a cell that uses them is re-run
missing_rate_list = list(map(truediv, null_row_list, column_count_list))

In [27]:
##################################
# Assembling the row-level
# data quality summary
##################################
# One record per observation, in the same order as the headers below
row_quality_records = zip(row_metadata_list,
                          column_count_list,
                          null_row_list,
                          missing_rate_list)
row_quality_headers = ['Row.Name',
                       'Column.Count',
                       'Null.Count',
                       'Missing.Rate']
all_row_quality_summary = pd.DataFrame(row_quality_records,
                                       columns=row_quality_headers)
display(all_row_quality_summary)

Unnamed: 0,Row.Name,Column.Count,Null.Count,Missing.Rate
0,0,16,0,0.0
1,1,16,0,0.0
2,2,16,0,0.0
3,3,16,0,0.0
4,4,16,0,0.0
...,...,...,...,...
304,304,16,0,0.0
305,305,16,0,0.0
306,306,16,0,0.0
307,307,16,0,0.0


In [28]:
##################################
# Reporting how many rows
# have a Missing.Rate above 0.00
##################################
rows_with_missing_data = all_row_quality_summary.loc[all_row_quality_summary['Missing.Rate'] > 0]
print('Number of Rows with Missing Data:',str(len(rows_with_missing_data)))

Number of Rows with Missing Data: 0


In [29]:
##################################
# Injecting a CSS rule that sets the font size
# and font family of elements with the
# rendered_html class
##################################
from IPython.display import display, HTML
display(HTML("<style>.rendered_html { font-size: 15px; font-family: 'Trebuchet MS'; }</style>"))