In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot 
from kaggle_handler import handler

plt.style.use('ggplot')
pd.options.display.max_columns = 25

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Download / Loading Dataset

In [3]:
Assets = handler('aizahzeeshan/lung-cancer-risk-in-25-countries')

Directory 'Assets' already exists.
Datasets already exist in Assets folder
['lung_cancer_prediction_dataset.csv', 'Titanic-Dataset.csv', 'GenZ_DatingApp_Data.csv', 'concrete_data.csv']
 Change Add_more parameter to download more datasets


In [4]:
Assets

['lung_cancer_prediction_dataset.csv',
 'Titanic-Dataset.csv',
 'GenZ_DatingApp_Data.csv',
 'concrete_data.csv']

In [5]:
df = pd.read_csv('Assets/lung_cancer_prediction_dataset.csv',
                 usecols=['Population_Size','Age','Cigarettes_per_Day','Annual_Lung_Cancer_Deaths','Cancer_Stage'],
                dtype={'Population_Size':np.int32,'Age':np.int32,'Cigarettes_per_Day':np.int32,'Annual_Lung_Cancer_Deaths':np.int32,'Cancer_Stage':object})
df.sample(5)

Unnamed: 0,Population_Size,Age,Cigarettes_per_Day,Cancer_Stage,Annual_Lung_Cancer_Deaths
159235,120,55,0,,18000
215435,113,82,0,,23000
85378,225,78,27,,30000
35137,213,76,0,,45000
166030,54,31,29,,59204


In [6]:
df['Cancer_Stage'].unique()

array([nan, 'Stage 1', 'Stage 2', 'Stage 3', 'Stage 4'], dtype=object)

In [7]:
df.shape

(220632, 5)

In [8]:
df.isna().sum()

Population_Size                   0
Age                               0
Cigarettes_per_Day                0
Cancer_Stage                 211671
Annual_Lung_Cancer_Deaths         0
dtype: int64

In [9]:
df['Cancer_Stage'].unique()

array([nan, 'Stage 1', 'Stage 2', 'Stage 3', 'Stage 4'], dtype=object)

In [10]:
df.dropna(subset=['Cancer_Stage'],axis=0,inplace=True)

# Split Dataset

In [11]:
X = df.drop(columns=['Cancer_Stage'])
y = df['Cancer_Stage']

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,shuffle=True)
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((7168, 4), (7168,), (1793, 4), (1793,))

# Diclaring KBinsDiscretizer Transformation
- ColumnTransformer, KBinsDiscretizer, LabelEncoder

In [13]:
ct = ColumnTransformer(transformers=[('trf1',KBinsDiscretizer(n_bins=3,
                                                              encode='ordinal',
                                                              strategy='quantile'),
                                      ['Population_Size','Annual_Lung_Cancer_Deaths']),
                                    ('trf2',KBinsDiscretizer(n_bins=10,
                                                             # encode='onehot',
                                                             encode='onehot-dense',
                                                             strategy='uniform'),
                                     ['Age']),
                                    ('trf3',KBinsDiscretizer(n_bins=10,
                                                             encode='ordinal',
                                                             strategy='kmeans'),
                                     ['Cigarettes_per_Day'])],
                       remainder='passthrough')

le = LabelEncoder()

# Transforming Dataset

In [14]:
X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)

X_train_ct.shape, y_train_le.shape

((7168, 13), (7168,))

# Transformed Data Info

In [15]:
le.classes_

array(['Stage 1', 'Stage 2', 'Stage 3', 'Stage 4'], dtype=object)

In [16]:
ct.named_transformers_['trf1'].bin_edges_

array([array([  54.,   85.,  145., 1400.]),
       array([ 10230.,  25000.,  40000., 690000.])], dtype=object)

In [17]:
ct.named_transformers_['trf2'].bin_edges_[0]

array([20. , 26.5, 33. , 39.5, 46. , 52.5, 59. , 65.5, 72. , 78.5, 85. ])

In [18]:
ct.named_transformers_['trf3'].bin_edges_[0]

array([ 0.        ,  2.74089636,  6.74574289,  9.50484653, 12.23039216,
       14.73842787, 17.48975273, 20.74040964, 24.02042102, 26.9985705 ,
       30.        ])

# Transformed DataFrame

In [19]:
pd.DataFrame(X_train_ct,columns=['Population_Size','Annual_Lung_Cancer_Deaths',
                                 '20.0-26.5','26.5-33.0','33.0-39.5','39.5-46.0','46.0-52.5',
                                 '52.5-59.0','59.0-65.5','65.5-72.0','72.0-78.5','78.5-85.0',
                                 'Cigarettes_per_Day']).head(3)

Unnamed: 0,Population_Size,Annual_Lung_Cancer_Deaths,20.0-26.5,26.5-33.0,33.0-39.5,39.5-46.0,46.0-52.5,52.5-59.0,59.0-65.5,65.5-72.0,72.0-78.5,78.5-85.0,Cigarettes_per_Day
0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0
2,2.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
pd.DataFrame(y_train_le,columns=['Cancer_Stage']).head(3)

Unnamed: 0,Cancer_Stage
0,2
1,0
2,3


In [21]:
X_train.head(3)

Unnamed: 0,Population_Size,Age,Cigarettes_per_Day,Annual_Lung_Cancer_Deaths
3022,125,83,6,75000
98366,70,61,24,26000
100975,1380,51,0,70000


# Traning Dataset without Transformation

In [22]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train_le)
y_pred = dtc.predict(X_test)
np.round(accuracy_score(y_true=y_test_le,y_pred=y_pred),3)

np.float64(0.245)

## Cross Velidation Score

In [23]:
dtc = DecisionTreeClassifier()

np.round(cross_val_score(dtc,X,LabelEncoder().fit_transform(y),cv=30).mean(),3)

np.float64(0.253)

# Traning Dataset With Transformation

In [24]:
dtc = DecisionTreeClassifier()
dtc.fit(np.delete(X_train_ct,2,axis=1),y_train_le)
y_pred = dtc.predict(np.delete(X_test_ct,2,axis=1))
np.round(accuracy_score(y_true=y_test_le,y_pred=y_pred),3)

np.float64(0.273)

## Cross Velidation Score

In [25]:
dtc = DecisionTreeClassifier()

np.round(cross_val_score(dtc,
                np.delete(ct.fit_transform(X),2,axis=1),
                LabelEncoder().fit_transform(y),cv=30).mean(),3)

np.float64(0.257)

# Typers of Outcome

In [26]:
pd.DataFrame(y_pred)[0].unique()

array([1, 0, 3, 2])