In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer # only using for test purposes

import numpy as np

from test_class import PRKNeighborsClassifier
from test_class2 import PRKNeighborsClassifier2
from sklearn.utils.estimator_checks import check_estimator


In [2]:
# Load the heart-disease data set from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Heart.csv')

In [3]:
# Row-wise removal of missing values is currently disabled (left commented
# out), so `df` keeps its NaN entries from here on.
# df = df.dropna()

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [5]:
# Encode the target column as an integer flag.
# Polarity: 'No' becomes 1 and every other value (e.g. 'Yes') becomes 0.
#
# The guard makes this cell safe to re-run: once the column is numeric,
# comparing it with the string 'No' would be False everywhere and would
# silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['AHD']):
    df['AHD'] = (df['AHD'] == 'No').astype(int)

In [6]:

# Show the first five rows again to check the re-coded AHD column.
df.head(5)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,1
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,0
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,0
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,1
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,1


Create lists of columns for the different transformations

In [7]:
# Distinct values present in the Fbs column, in order of first appearance.
pd.unique(df['Fbs'])

array([1, 0])

In [8]:
# Print each column's dtype and non-null count; columns whose count is below
# the number of rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  303 non-null    int64  
 1   Age         303 non-null    int64  
 2   Sex         303 non-null    int64  
 3   ChestPain   303 non-null    object 
 4   RestBP      303 non-null    int64  
 5   Chol        303 non-null    int64  
 6   Fbs         303 non-null    int64  
 7   RestECG     303 non-null    int64  
 8   MaxHR       303 non-null    int64  
 9   ExAng       303 non-null    int64  
 10  Oldpeak     303 non-null    float64
 11  Slope       303 non-null    int64  
 12  Ca          299 non-null    float64
 13  Thal        301 non-null    object 
 14  AHD         303 non-null    int64  
dtypes: float64(2), int64(11), object(2)
memory usage: 35.6+ KB


In [21]:
# Count duplicated records.
# The frame still carries the 'Unnamed: 0' column read from the CSV, which
# looks like a per-row identifier (1, 2, 3, ...) -- TODO confirm. While it is
# included, no two rows can compare equal and the count is trivially 0, so it
# is excluded before checking. errors='ignore' keeps the cell working if the
# column is absent.
df.drop(columns=['Unnamed: 0'], errors='ignore').duplicated().sum()

0

In [9]:
# Column groups used to route features to different preprocessing steps.

# Integer-coded features whose values have a natural order.
ordinal_columns = [
    'RestECG',
    'Slope',
    'Ca',
]

# Numeric measurements.
# Every entry needs its own trailing comma: without one, Python concatenates
# adjacent string literals, which previously merged 'RestBP' and 'Chol' into
# the non-existent column name 'RestBPChol'.
contineous_columns = [
    'Age',
    'RestBP',
    'Chol',
    'MaxHR',
    'Oldpeak',
]

# Features treated as unordered categories.
categoric_columns = [
    'Sex',
    'ChestPain',
    'Fbs',
    'ExAng',
    'Thal',
]

Create test and train data sets

In [10]:
# Separate the target (AHD) from the remaining feature columns.
y = df['AHD']
X = df.drop(columns=['AHD'])

In [11]:
# Number of missing values in each feature column.
X.isna().sum()

Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
dtype: int64

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)

Create pipeline

In [13]:
def _make_encoder():
    """Return a new, unfitted ColumnTransformer for the categorical/ordinal columns.

    One-hot encodes `categoric_columns` (categories unseen during fit are
    ignored at transform time) and ordinal-encodes `ordinal_columns`
    (categories unseen during fit are encoded as -1).

    NOTE(review): `contineous_columns` is not routed through this transformer
    and `remainder` is left at its default ('drop'), so those columns never
    reach the scaler or the classifier -- confirm whether that is intended.
    """
    return ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'), categoric_columns),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_columns),
        ]
    )


# Encoder instance used by the classifier pipeline.
encoder = _make_encoder()

# Preprocessing-only pipeline (no classifier step), used to inspect the
# transformed feature matrix.
# It gets its OWN encoder instance: Pipeline fits its steps in place, so
# sharing a single ColumnTransformer object between the two pipelines would
# let a fit of this pipeline overwrite the encoder state inside the already
# fitted classifier pipeline.
knn_pipe_list = [
    ('encoder', _make_encoder()),
    ('scaler', MinMaxScaler()),
    ('imputer', SimpleImputer()),
]

# Full pipeline: same preprocessing steps followed by a 5-nearest-neighbours
# classifier.
knn_pipe_list_classifier = [
    ('encoder', encoder),
    ('scaler', MinMaxScaler()),
    ('imputer', SimpleImputer()),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
]

knn_pipe = Pipeline(knn_pipe_list)
knn_pipe_classifier = Pipeline(knn_pipe_list_classifier)  # with classifier

In [14]:
# Number of missing values in each column of the full frame (target included).
df.isna().sum()

Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64

In [15]:
# Fit the preprocessing steps and the KNN classifier on the training split.
knn_pipe_classifier.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('encoder', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [16]:
# Class labels learned by the final (classifier) step of the fitted pipeline.
knn_pipe_classifier.named_steps['classifier'].classes_

array([0, 1])

In [17]:
# Raw (untransformed) training rows whose target label is 0, selected with a
# boolean mask over the underlying NumPy arrays.
X_train.to_numpy()[y_train.to_numpy() == 0]

array([[224, 53, 1, ..., 2, 2.0, 'reversable'],
       [244, 61, 1, ..., 2, 2.0, 'normal'],
       [301, 57, 1, ..., 2, 1.0, 'reversable'],
       ...,
       [58, 41, 1, ..., 1, 0.0, 'reversable'],
       [147, 57, 1, ..., 2, 3.0, 'reversable'],
       [67, 60, 1, ..., 2, 0.0, 'normal']], shape=(104, 14), dtype=object)

In [18]:
# Learn the preprocessing (encoder categories, scaler ranges, imputer means)
# from the training split only, then apply it to the held-out rows.
# Calling fit_transform on X_test would derive those statistics from the test
# data itself, so the encoded test features would not live in the same
# feature space as the data the model was trained on.
knn_pipe.fit(X_train)
encoded_x_test = knn_pipe.transform(X_test)