This notebook will try to explore the effectiveness of prknn vs standard knn models for prediction on an inbalanced dataset.   

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, TargetEncoder

from sklearn.impute import KNNImputer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from test_class import PRKNeighborsClassifier
from tqdm import tqdm

In [32]:
RANDOM_STATE = 33

In [33]:
df= pd.read_csv('train_data.csv')

In [34]:
columns_to_drop = [
    "Hospital_region_code",
    "Ward_Facility_Code",
    "case_id",
    "patientid",
    "Hospital_code",
]

df = df.drop(columns_to_drop, axis=1)

In [35]:
df.isna().sum()

Hospital_type_code                      0
City_Code_Hospital                      0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Bed Grade                             113
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

For categoric variables, I am going to use frequency encoding, due to the high number of categoric features and their high cardinality. I am also concerned that target encoding may lead to data leakage due to the low frequencies of the target variable. While other techniques may be better, the aim is to test the effectiveness of prknn compared to knn.

In [36]:
categoric_columns = [
    'Hospital_type_code',
    'City_Code_Hospital',
    'Department', 
    'Ward_Type',
    'City_Code_Patient', 
    'Type of Admission'
]

ordinal_columns = [
    'Bed Grade',
    'Severity of Illness'
]

cont_columns = [
    'Available Extra Rooms in Hospital',
    'Visitors with Patient',
    'Age',
    'Admission_Deposit'
]

In [50]:
col_transformer = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), categoric_columns),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_columns)
    ]
)

prknn_pipeline_list = [
    ('col_transform', col_transformer),
    ('scaler', MinMaxScaler()),
    ('knn_imputer', KNNImputer(n_neighbors=10, weights='distance')),
    ('prknn_predictor', PRKNeighborsClassifier())
]

knn_pipeline_list = [
    ('col_transform', col_transformer),
    ('scaler', MinMaxScaler()),
    ('knn_imputer', KNNImputer(n_neighbors=10, weights='distance')),
    ('knn_predictor', KNeighborsClassifier(weights='distance'))
]


prknn_pipeline = Pipeline(prknn_pipeline_list)
knn_pipeline = Pipeline(knn_pipeline_list)

In [51]:
classes = df[["Stay"]]
orde = OrdinalEncoder()

np.unique(orde.fit_transform(classes))

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [54]:
samples = 20

knn_accuracy_scores = []
prknn_accuracy_scores = []

knn_f1_scores = []
prknn_f1_scores = []

for _ in tqdm(range(samples)):

    df_sample = df.sample(50_00)

    X = df_sample.drop('Stay', axis=1)
    y = df_sample['Stay']

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)
    ordinal_encoder = OrdinalEncoder()

    y_train = ordinal_encoder.fit_transform(pd.DataFrame(y_train)).ravel()
    y_test = ordinal_encoder.fit_transform(pd.DataFrame(y_test)).ravel()
    
    knn_pipeline.fit(X_train, y_train)
    prknn_pipeline.fit(X_train, y_train)
    
    y_pred_knn = knn_pipeline.predict(X_test)
    y_pred_prknn = prknn_pipeline.predict(X_test)

    # print(np.unique(y_pred_knn))
    # print(np.unique(y_pred_prknn))

    knn_accuracy_scores.append(accuracy_score(y_test, y_pred_knn))
    knn_f1_scores.append(f1_score(y_test, y_pred_knn, average='micro'))

    prknn_accuracy_scores.append(accuracy_score(y_test, y_pred_prknn))
    prknn_f1_scores.append(f1_score(y_test, y_pred_prknn, average='micro'))


100%|██████████| 20/20 [00:22<00:00,  1.14s/it]


In [55]:
knn_accuracy_scores = np.array(knn_accuracy_scores)
prknn_accuracy_scores = np.array(prknn_accuracy_scores)

print(np.mean(knn_accuracy_scores))
print(np.mean(prknn_accuracy_scores))

knn_f1_scores = np.array(knn_f1_scores)
prknn_f1_scores = np.array(prknn_f1_scores)

print(np.mean(knn_f1_scores))
print(np.mean(prknn_f1_scores))

0.24835
0.25815
0.24835
0.25815
