In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer # only using for test purposes

import numpy as np


In [2]:
df = pd.read_csv('Heart.csv')

In [3]:
# df = df.dropna()

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [5]:
df['AHD'] = 1*(df['AHD'] == 'No')

In [6]:

df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,1
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,0
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,0
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,1
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,1


Create lists of columns for different trainformations

In [7]:
df['Fbs'].unique()

array([1, 0])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  303 non-null    int64  
 1   Age         303 non-null    int64  
 2   Sex         303 non-null    int64  
 3   ChestPain   303 non-null    object 
 4   RestBP      303 non-null    int64  
 5   Chol        303 non-null    int64  
 6   Fbs         303 non-null    int64  
 7   RestECG     303 non-null    int64  
 8   MaxHR       303 non-null    int64  
 9   ExAng       303 non-null    int64  
 10  Oldpeak     303 non-null    float64
 11  Slope       303 non-null    int64  
 12  Ca          299 non-null    float64
 13  Thal        301 non-null    object 
 14  AHD         303 non-null    int64  
dtypes: float64(2), int64(11), object(2)
memory usage: 35.6+ KB


In [None]:
ordinal_columns = [
    'RestECG',
    'Slope',
    'Ca'
]

contineous_columns = [
    'Age',
    'RestBP'
    'Chol',
    'MaxHR',
    'Oldpeak'
]

categoric_columns = [
    'Sex',
    'ChestPain',
    'Fbs',
    'ExAng',
    'Thal'
]

Create test and train data sets

In [10]:
X = df.drop('AHD', axis=1)
y = df['AHD']

In [11]:
X.isnull().sum()

Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
dtype: int64

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

Create pipline

In [13]:
encoder = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categoric_columns),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_columns),
        
    ]
)

# for testing on test data
knn_pipe_list = [
    ('encoder', encoder),
    ('scaler', MinMaxScaler()),
    ('imputer', SimpleImputer()),
    # ('classifier', KNeighborsClassifier(n_neighbors=5))
]

knn_pipe_list_classifier = [
    ('encoder', encoder),
    ('scaler', MinMaxScaler()),
    ('imputer', SimpleImputer()),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
]

knn_pipe = Pipeline(knn_pipe_list)
knn_pipe_classifier = Pipeline(knn_pipe_list_classifier) # with classifier

In [14]:
df.isnull().sum()

Unnamed: 0    0
Age           0
Sex           0
ChestPain     0
RestBP        0
Chol          0
Fbs           0
RestECG       0
MaxHR         0
ExAng         0
Oldpeak       0
Slope         0
Ca            4
Thal          2
AHD           0
dtype: int64

In [15]:
knn_pipe_classifier.fit(X_train, y_train)

0,1,2
,steps,"[('encoder', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [16]:
encoded_x_test = knn_pipe.fit_transform(X_test)

In [17]:
knn_classifier_test = KNeighborsClassifier().fit(encoded_x_test, y_test)

In [18]:
test_neighbors, test_distances = knn_classifier_test.kneighbors(encoded_x_test, n_neighbors=5, return_distance=True)

In [19]:
test_neighbors.shape

(61, 5)

In [20]:
test_distances

array([[ 0, 51, 47, 40, 16],
       [ 1,  7,  2, 13,  4],
       [ 2, 33,  4, 26,  1],
       [ 3, 56, 11, 22, 12],
       [33,  4,  2, 16, 40],
       [ 5, 30, 17,  8, 51],
       [34,  6, 11, 20, 47],
       [ 7,  1, 13,  2,  9],
       [ 8,  5, 30, 17, 51],
       [ 9, 14, 24, 10, 49],
       [10, 49,  9, 14, 24],
       [11, 56, 22,  3,  6],
       [12, 52, 57,  5,  3],
       [13, 37, 58,  1,  7],
       [14,  9, 10, 24, 49],
       [15, 59, 39, 22, 11],
       [16, 40, 36, 60, 32],
       [17,  5, 30,  8, 51],
       [18, 46, 44, 27, 25],
       [19, 20, 55,  8, 17],
       [20, 55, 19, 11,  6],
       [21, 38, 28, 50, 26],
       [22, 11, 56,  3, 59],
       [23, 35, 58, 43, 46],
       [24,  9, 49, 10, 14],
       [25, 41, 46, 18, 27],
       [50, 26, 28, 38, 21],
       [27, 44, 18, 46, 25],
       [50, 26, 28, 38, 21],
       [29, 60, 32, 36, 40],
       [ 5, 30, 17,  8, 51],
       [31, 19,  8, 17,  5],
       [60, 32, 36, 29, 40],
       [33,  4,  2, 16, 40],
       [34,  6

Now need to find way of getting radius

In [21]:
y_train.unique()

array([0, 1])

In [22]:
df_train = X_train.copy()

In [23]:
df_train['AHD'] = y_train

In [24]:
df_train.shape

(242, 15)

In [25]:
df_train.values[0][0]

224

In [26]:
# before calculating

In [27]:
np.unique(df_train.values.T[-1])

array([0, 1], dtype=object)

In [28]:
target = 'AHD'

df_train.columns.get_loc(target)

14

In [29]:
from scipy.spatial.distance import euclidean

target = 'AHD'

# iterate through classes
target_classes = df_train[target].unique()

values = df_train.values

x_values = df_train.drop(target, axis=1)
encoded_values = knn_pipe.fit_transform(x_values)


class_mean_distances = []

for t_class in target_classes:
    
    # compute class pairwise euclidean radius (mean average)

    mean_distance = 0
    n = 1

    class_values = values[df_train[target] == t_class]
    rows = class_values.shape[0]

    for i in range(rows):
        for j in range(i + 1, rows):
            
            distance_eu = euclidean(encoded_values[i], encoded_values[j])
            mean_distance = ( (mean_distance * n) + distance_eu ) / (n + 1)
            n+=1

    class_mean_distances.append(mean_distance)




            

In [30]:
class_mean_distances

[2.2899646835542145, 2.2863928042117108]

In [34]:
values[:, -1]

array([0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1],
      dtype=object)

In [32]:
from test_functions import get_class_average_eu_distance

get_class_average_eu_distance(values, target_column_index=df_train.columns.get_loc(target))

IndexError: boolean index did not match indexed array along axis 0; size of axis is 242 but size of corresponding boolean axis is 15