In [1]:
from kaggle_handler import handler
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import Binarizer, LabelEncoder

# Downloading / Loading Dataset

In [3]:
Assets = handler('aizahzeeshan/lung-cancer-risk-in-25-countries')

Directory 'Assets' already exists.
Datasets already exist in Assets folder
['lung_cancer_prediction_dataset.csv', 'Titanic-Dataset.csv', 'GenZ_DatingApp_Data.csv', 'concrete_data.csv']
 Change Add_more parameter to download more datasets


In [4]:
Assets

['lung_cancer_prediction_dataset.csv',
 'Titanic-Dataset.csv',
 'GenZ_DatingApp_Data.csv',
 'concrete_data.csv']

In [5]:
df =pd.read_csv('Assets/lung_cancer_prediction_dataset.csv',
                usecols=['Population_Size','Age','Cigarettes_per_Day','Annual_Lung_Cancer_Deaths','Cancer_Stage'],
                dtype={'Population_Size':np.int32,'Age':np.int32,'Cigarettes_per_Day':np.int32,'Annual_Lung_Cancer_Deaths':np.int32,'Cancer_Stage':object})
df.sample(3)

Unnamed: 0,Population_Size,Age,Cigarettes_per_Day,Cancer_Stage,Annual_Lung_Cancer_Deaths
99830,145,28,19,,60000
16591,70,60,0,,26000
115022,206,33,19,Stage 1,20000


In [6]:
df.isna().sum()

Population_Size                   0
Age                               0
Cigarettes_per_Day                0
Cancer_Stage                 211671
Annual_Lung_Cancer_Deaths         0
dtype: int64

In [7]:
df.dropna(subset=['Cancer_Stage'],axis=0,inplace=True)

In [8]:
df.sample(3)

Unnamed: 0,Population_Size,Age,Cigarettes_per_Day,Cancer_Stage,Annual_Lung_Cancer_Deaths
115823,166,57,0,Stage 3,25000
58918,85,66,0,Stage 2,29000
71094,84,35,11,Stage 2,27000


# Spliting Dataset

In [9]:
X = df.drop(columns=['Cancer_Stage'])
y = df['Cancer_Stage']
y = LabelEncoder().fit_transform(y)

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2)
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((7168, 4), (7168,), (1793, 4), (1793,))

# Treaning and Testing Model without Transformation

In [11]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
np.round(accuracy_score(y_true=y_test,y_pred=y_pred),3)

np.float64(0.261)

# Cross Validating Model without Transformation

In [12]:
dtc = DecisionTreeClassifier()
np.round(cross_val_score(dtc,X,y,cv=30).mean(),3)

np.float64(0.256)

# Binarizer ColumnTransformer
- Convert the column data to "0" or "1"

In [13]:
ct = ColumnTransformer(transformers=[('binarizer',
                                      Binarizer(copy=False,threshold=X['Annual_Lung_Cancer_Deaths'].median()),
                                      ['Annual_Lung_Cancer_Deaths']
                                     )],
                       remainder='passthrough')

In [14]:
X_train_binarized = ct.fit_transform(X_train)
X_test_binarized = ct.transform(X_test)

In [15]:
pd.DataFrame(X_train_binarized, columns=['Annual_Lung_Cancer_Deaths','Population_Size', 'Age', 'Cigarettes_per_Day']).head(5)

Unnamed: 0,Annual_Lung_Cancer_Deaths,Population_Size,Age,Cigarettes_per_Day
0,1,83,44,7
1,1,273,51,16
2,0,120,51,17
3,0,85,40,0
4,0,225,59,27


# Treaning and Testing Model with Transformation

In [16]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train_binarized,y_train)
y_pred = dtc.predict(X_test_binarized)
np.round(accuracy_score(y_true=y_test,y_pred=y_pred),3)

np.float64(0.247)

# Cross Validating Model with Transformation

In [17]:
dtc = DecisionTreeClassifier()
X_ = ct.transform(X)
np.round(cross_val_score(dtc,X_,y,cv=30).mean(),3)

np.float64(0.253)