In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler

from sklearn.impute import KNNImputer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [2]:
RANDOM_STATE = 33
TEST_SIZE = 0.4
TARGET = 'Stay'

In [3]:
df_train= pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

In [4]:
df_train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [5]:
df_test.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,318439,21,c,3,Z,3,gynecology,S,A,2.0,17006,2.0,Emergency,Moderate,2,71-80,3095.0
1,318440,29,a,4,X,2,gynecology,S,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4018.0
2,318441,26,b,2,Y,3,gynecology,Q,D,4.0,17006,2.0,Emergency,Moderate,3,71-80,4492.0
3,318442,6,a,6,X,3,gynecology,Q,F,2.0,17006,2.0,Trauma,Moderate,3,71-80,4173.0
4,318443,28,b,11,X,2,gynecology,R,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4161.0


In [6]:
df_train.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

In [23]:
# remove hospital code (this has high assocation with other variables and high cardinality)

vars1 = [
    'Hospital_region_code',
    'Available Extra Rooms in Hospital',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Bed Grade',
    'Type of Admission',
    'Severity of Illness',
    'Visitors with Patient',
    'Age',
    'Admission_Deposit',
]

vars1 = [
    # OHE features
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Type of Admission',

    # ordinal features
    'Bed Grade',
    'Severity of Illness',
    'Age',
    'Admission_Deposit',
    # 'Stay'

    # numeric features
    'Visitors with Patient',
    'Age',
    'Admission_Deposit',
    'Available Extra Rooms in Hospital'
]


In [24]:
vars1_with_target = vars1.copy()
vars1_with_target.append(TARGET)


df_train[vars1_with_target].head()


Unnamed: 0,Hospital_region_code,Department,Ward_Type,Ward_Facility_Code,Type of Admission,Bed Grade,Severity of Illness,Age,Admission_Deposit,Visitors with Patient,Age.1,Admission_Deposit.1,Available Extra Rooms in Hospital,Stay
0,Z,radiotherapy,R,F,Emergency,2.0,Extreme,51-60,4911.0,2,51-60,4911.0,3,0-10
1,Z,radiotherapy,S,F,Trauma,2.0,Extreme,51-60,5954.0,2,51-60,5954.0,2,41-50
2,X,anesthesia,S,E,Trauma,2.0,Extreme,51-60,4745.0,2,51-60,4745.0,2,31-40
3,Y,radiotherapy,R,D,Trauma,2.0,Extreme,51-60,7272.0,2,51-60,7272.0,2,41-50
4,Y,radiotherapy,S,D,Trauma,2.0,Extreme,51-60,5558.0,2,51-60,5558.0,2,41-50


In [9]:
df_train['Bed Grade'].unique()

array([ 2.,  3.,  4.,  1., nan])

Encodings:
-   
    HRC - 1hot
    AERIH - none
    Department - 1hot
    WT - 1hot
    WFC - 1hot
    bed grade - ordinalencoder
    ToA - 1hot
    Severity of illness - ordinal
    VwP - none
    Age - ordinal
    Admission_depo (none)

    stay - ordinal

    all X encodings need to be standardized to ensure scales are the same

In [10]:
onehot_features = [
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Type of Admission',
]

ordinal_features = [
    'Bed Grade',
    'Severity of Illness',
    'Age',
    'Admission_Deposit',
    # 'Stay'
]

numeric_features = [
    'Visitors with Patient',
    'Age',
    'Admission_Deposit',
    'Available Extra Rooms in Hospital'
]

In [25]:
df_train_vars1 = df_train[vars1_with_target]
x_test = df_test[vars1]

In [26]:
col_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), onehot_features),
        ('ordinal', OrdinalEncoder(), ordinal_features)
    ]
)

# all features then need to be scaled and then imputed

knn_pipeline = Pipeline(
    [
        ('col_transform', col_transformer),
        ('scaler', MinMaxScaler()),
        ('knn_imputer', KNNImputer(n_neighbors=50)),
        ('knn', KNeighborsClassifier(n_neighbors=50))
    ]
)

knn_pipeline_list = [
        ('col_transform', col_transformer),
        ('scaler', MinMaxScaler()),
        ('knn_imputer', KNNImputer(n_neighbors=50)),
        ('knn', KNeighborsClassifier(n_neighbors=50))
    ]

knn_pipeline = Pipeline(knn_pipeline_list)





In [13]:
X_train = df_train.drop('Stay', axis=1)
y_train = df_train[['Stay']]

y_train = OneHotEncoder().fit_transform(y_train).toarray()

In [14]:
y_train

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], shape=(318438, 11))

In [15]:
knn_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('col_transform', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,n_neighbors,50
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_neighbors,50
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [16]:
# knn_pipeline.predict(x_test)

Now pipline has been defined, perform cross validation to find ideal n neighbors for imputation and fitting, starting with fitting

In [17]:
validation_ks = np.array([5,10,50])
validation_ks

array([ 5, 10, 50])

In [18]:
X_train = df_train_vars1.drop(TARGET, axis=1)
y_train = df_train_vars1[[TARGET]] 

In [19]:
scoring = ['precision_macro', 'recall_macro', 'accuracy', 'f1_macro']

In [20]:
def knn_validate(predictor, X, y, scoring_average, test_size = TEST_SIZE, random_state = RANDOM_STATE):

    X_train_, X_val_, y_train_, y_val_ = train_test_split(X, y, test_size=test_size, random_state=random_state)

    if isinstance(predictor, KNeighborsClassifier):
        if not scoring_average or scoring_average == 'binary':
            raise ValueError('Multiclass knn target requires non binary average')

    predictor.fit(X_train_, y_train_)
    y_pred = predictor.predict(X_val_)

    print(pd.crosstab(y_val_, y_pred))

    # this code sucks
    f1 = f1_score(y_val_, y_pred, average=scoring_average)
    accuracy = accuracy_score(y_val_, y_pred)
    recall = recall_score(y_val_, y_pred, average=scoring_average)
    preciscion = precision_score(y_val_, y_pred, average=scoring_average)

    dict = {}

    dict['f1_score'] = f1
    dict['accuracy_score'] = accuracy
    dict['recall_score'] = recall
    dict['precision_score'] = preciscion

    return dict 


In [None]:
scoring_lists = {}

col_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_features),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_features)
    ]
)

onehot = OneHotEncoder()
y_train_1hot = onehot.fit_transform(y_train).toarray()

for n_neighbors in tqdm(validation_ks):
    knn_pipeline_list = [
        ('col_transform', col_transformer),
        ('scaler', MinMaxScaler()),
        ('knn_imputer', KNNImputer(n_neighbors=50)),
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
    ]

    print('Initializing pipeline')
    knn_pipeline = Pipeline(knn_pipeline_list)
    print('Pipeline initialized')

    # knn_wrapped_model = TransformedTargetRegressor(regressor=knn_pipeline, transformer=OneHotEncoder())

    print('Cross validating...')
    # knn_scores = cross_validate(knn_pipeline, X_train, y_train_1hot, scoring=scoring, cv=5)
    knn_scores = knn_validate(predictor=knn_pipeline, X=X_train, y=y_train_1hot, random_state=RANDOM_STATE, test_size=TEST_SIZE, scoring_average='macro')
    print('Cross validation complete')

    print('Adding scores')
    for score in knn_scores.keys():
        if not score in scoring_lists:
            scoring_lists[score] = []
            
        scoring_lists[score].append(knn_scores[score])
    
    print('Scores added')
    print(f'n_neighbors {n_neighbors} completed', end='\n')

  0%|          | 0/3 [00:00<?, ?it/s]


Initializing pipeline
Pipeline initialized
Cross validating...


ValueError: Selected columns, ['Bed Grade', 'Severity of Illness', 'Age', 'Admission_Deposit'], are not unique in dataframe