This notebook will try to explore the effectiveness of prknn vs standard knn models for prediction on an inbalanced dataset.   

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler

from sklearn.impute import KNNImputer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [39]:
df_train= pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

In [40]:
df_train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [None]:
columns_to_drop = [
    "Hospital_region_code",
    "Ward_Facility_Code",
    "case_id",
    "patientid",
    "Hospital_code",
]

df_train = df_train.drop(columns_to_drop, axis=1)

In [42]:
df_train

Unnamed: 0,Hospital_type_code,City_Code_Hospital,Available Extra Rooms in Hospital,Department,Ward_Type,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,c,3,3,radiotherapy,R,2.0,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,c,5,2,radiotherapy,S,2.0,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,e,1,2,anesthesia,S,2.0,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,b,2,2,radiotherapy,R,2.0,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,b,2,2,radiotherapy,S,2.0,7.0,Trauma,Extreme,2,51-60,5558.0,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
318433,a,6,3,radiotherapy,Q,4.0,23.0,Emergency,Moderate,3,41-50,4144.0,11-20
318434,a,1,2,anesthesia,Q,4.0,8.0,Urgent,Moderate,4,81-90,6699.0,31-40
318435,a,4,3,gynecology,R,4.0,10.0,Emergency,Minor,3,71-80,4235.0,11-20
318436,b,2,3,anesthesia,Q,3.0,8.0,Trauma,Minor,5,11-20,3761.0,11-20


In [44]:
for column in df_train.columns:
    print(f"'{column}',")

'Hospital_type_code',
'City_Code_Hospital',
'Available Extra Rooms in Hospital',
'Department',
'Ward_Type',
'Bed Grade',
'City_Code_Patient',
'Type of Admission',
'Severity of Illness',
'Visitors with Patient',
'Age',
'Admission_Deposit',
'Stay',


In [45]:
categoric_columns = [
    'Hospital_type_code',
    'City_Code_Hospital',
    'Department', 
    'Ward_Type',
    'City_Code_Patient', 
    'Type of Admission'
]

ordinal_columns = [
    'Bed Grade',
    'Severity of Illness'
]

cont_columns = [
    'Available Extra Rooms in Hospital',
    'Visitors with Patient',
    'Age',
    'Admission_Deposit'
]

In [50]:
col_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categoric_columns),
        ('ordinal', OrdinalEncoder(), ordinal_columns)
    ]
)

prknn_pipeline_list = [
    ('col_transform', col_transformer),
    ('scaler', MinMaxScaler()),
    ('knn_imputer', KNNImputer(n_neighbors=10, weights='distance')),
    ('prknn_predictor', KNeighborsClassifier(n_neighbors=10))
]

prknn_pipeline = Pipeline(prknn_pipeline_list)

In [49]:
X_train = df_train.drop('Stay', axis=1)
y_train = df_train['Stay']

In [51]:
prknn_pipeline.fit(X_train, y_train)

TypeError: MinMaxScaler does not support sparse input. Consider using MaxAbsScaler instead.