This notebook explores the effectiveness of PRKNN versus standard KNN models for prediction on an imbalanced dataset.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, TargetEncoder

from sklearn.impute import KNNImputer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from test_class import PRKNeighborsClassifier

In [3]:
# Seed shared by the randomised steps of this notebook (it is passed to
# train_test_split) so that the train/test partition is reproducible.
RANDOM_STATE = 33

In [4]:
# Load the raw training data from the CSV that sits next to the notebook.
df = pd.read_csv("train_data.csv")

In [5]:
# Preview the first five rows to sanity-check that the file parsed as expected.
df.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,case_id,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Visitors with Patient,Admission_Deposit
count,318438.0,318438.0,318438.0,318438.0,318325.0,318438.0,313906.0,318438.0,318438.0
mean,159219.5,18.318841,4.771717,3.197627,2.625807,65747.579472,7.251859,3.284099,4880.749392
std,91925.276848,8.633755,3.102535,1.168171,0.873146,37979.93644,4.745266,1.764061,1086.776254
min,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1800.0
25%,79610.25,11.0,2.0,2.0,2.0,32847.0,4.0,2.0,4186.0
50%,159219.5,19.0,5.0,3.0,3.0,65724.5,8.0,3.0,4741.0
75%,238828.75,26.0,7.0,4.0,3.0,98470.0,8.0,4.0,5409.0
max,318438.0,32.0,13.0,24.0,4.0,131624.0,38.0,32.0,11008.0


In [5]:
# Count the missing values in each column.
df.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [1]:
# Columns excluded from modelling.
columns_to_drop = [
    "Hospital_region_code",
    "Ward_Facility_Code",
    "case_id",
    "patientid",
    "Hospital_code",
]

# `errors='ignore'` keeps this cell idempotent: `df` is rebound in place, so a
# second run would otherwise raise KeyError because the columns are already gone.
# This cell must run after the cell that loads `df`.
df = df.drop(columns=columns_to_drop, errors='ignore')

NameError: name 'df' is not defined

In [7]:
# List the columns currently present in the frame.
df.columns

Index(['Hospital_type_code', 'City_Code_Hospital',
       'Available Extra Rooms in Hospital', 'Department', 'Ward_Type',
       'Bed Grade', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

In [8]:
# Print every column name on its own line, single-quoted and followed by a comma.
for name in df.columns:
    print("'{}',".format(name))

'Hospital_type_code',
'City_Code_Hospital',
'Available Extra Rooms in Hospital',
'Department',
'Ward_Type',
'Bed Grade',
'City_Code_Patient',
'Type of Admission',
'Severity of Illness',
'Visitors with Patient',
'Age',
'Admission_Deposit',
'Stay',


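For categorical variables, I am going to use frequency encoding because of the large number of categorical features and their high cardinality. I am also concerned that target encoding may lead to data leakage, since some classes of the target variable occur with low frequency. While other techniques may perform better, the aim here is to test the effectiveness of PRKNN compared to KNN. (Note: the pipeline cell below currently uses scikit-learn's `TargetEncoder`, not a frequency encoder, so the code and this description still need to be reconciled.)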

In [9]:
# Nominal features: no inherent order between their values.
categoric_columns = [
    'Hospital_type_code',
    'City_Code_Hospital',
    'Department',
    'Ward_Type',
    'City_Code_Patient',
    'Type of Admission',
]

# Ordered categories.
# 'Age' belongs here rather than with the continuous features: the data stores
# it as a binned string range (e.g. '51-60'), not as a number.
# NOTE(review): assumes the lexicographic order of the age bins matches their
# numeric order — confirm against the full set of bins.
ordinal_columns = [
    'Bed Grade',
    'Severity of Illness',
    'Age',
]

# Genuinely numeric features.
cont_columns = [
    'Available Extra Rooms in Hospital',
    'Visitors with Patient',
    'Admission_Deposit',
]

In [10]:
# Split the "continuous" list by actual dtype. Anything stored as a string
# (e.g. a binned range such as '51-60') cannot be fed to MinMaxScaler, so it is
# routed to the ordinal encoder; true numbers are passed through unchanged.
numeric_cont_columns = [
    column for column in cont_columns
    if pd.api.types.is_numeric_dtype(df[column])
]
binned_cont_columns = [
    column for column in cont_columns
    if column not in numeric_cont_columns
]

# ColumnTransformer drops every column that is not listed (remainder='drop' is
# the default), so the continuous features must be named explicitly or they
# never reach the model.
# NOTE(review): the accompanying markdown describes frequency encoding, but
# TargetEncoder is what is used here — confirm which one is intended.
# NOTE(review): OrdinalEncoder orders categories by their sorted values; confirm
# that this matches the intended ranking of 'Severity of Illness', otherwise
# pass an explicit `categories=` list.
col_transformer = ColumnTransformer(
    transformers=[
        # Seeded so the encoder's internal cross-fitting is reproducible.
        ('target', TargetEncoder(random_state=RANDOM_STATE), categoric_columns),
        ('ordinal', OrdinalEncoder(), ordinal_columns + binned_cont_columns),
        ('numeric', 'passthrough', numeric_cont_columns),
    ]
)

# Scaling comes before imputation so that the distance-based KNN imputer sees
# all features on a comparable [0, 1] range.
prknn_pipeline_list = [
    ('col_transform', col_transformer),
    ('scaler', MinMaxScaler()),
    ('knn_imputer', KNNImputer(n_neighbors=10, weights='distance')),
    ('prknn_predictor', PRKNeighborsClassifier())
]

prknn_pipeline = Pipeline(prknn_pipeline_list)

In [11]:
# Separate the features from the target column ('Stay').
X = df.drop(columns='Stay')
y = df['Stay']

# Stratify on the target so that the train and test partitions keep the class
# proportions of the full data; with an imbalanced target, a plain random split
# can under-represent (or entirely miss) rare classes in the test set.
# NOTE(review): stratification needs at least two rows per class — confirm for
# the rarest 'Stay' bucket.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [12]:
# Encode the 'Stay' class labels as numeric codes.
ordinal_encoder = OrdinalEncoder()

# Learn the label -> code mapping from the training target only.
y_train = ordinal_encoder.fit_transform(pd.DataFrame(y_train)).ravel()

# Reuse the mapping learned above: `transform`, not `fit_transform`. Refitting
# on the test target would build a second, independent mapping, so the same code
# could denote different classes in train and test whenever the test split is
# missing a label. With `transform`, a label unseen during fitting raises
# instead of being silently mis-coded.
y_test = ordinal_encoder.transform(pd.DataFrame(y_test)).ravel()

In [13]:
# Inspect the distinct encoded class codes present in the training target.
np.unique(y_train)

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [None]:
# Fit the whole pipeline (preprocessing steps followed by the PRKNN classifier)
# on the training split.
prknn_pipeline.fit(X_train, y_train)

Fitting pr_knn
Getting class radii


In [None]:
# Predict the encoded 'Stay' class for every row of the held-out test features.
y_pred = prknn_pipeline.predict(X_test)