In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler


from sklearn.impute import KNNImputer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

In [46]:
df_train= pd.read_csv('train_data.csv')

In [47]:
df_train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [48]:
df_train.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

In [49]:
# remove hospital code (this has high assocation with other variables and high cardinality)

vars1 = [
    'Hospital_region_code',
    'Available Extra Rooms in Hospital',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Bed Grade',
    'Type of Admission',
    'Severity of Illness',
    'Visitors with Patient',
    'Age',
    'Admission_Deposit',
    'Stay'
]

In [50]:
df_train[vars1].head()


Unnamed: 0,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,Z,3,radiotherapy,R,F,2.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,Z,2,radiotherapy,S,F,2.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,X,2,anesthesia,S,E,2.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,Y,2,radiotherapy,R,D,2.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,Y,2,radiotherapy,S,D,2.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [51]:
df_train['Bed Grade'].unique()

array([ 2.,  3.,  4.,  1., nan])

Encodings:
-   
    HRC - 1hot
    AERIH - none
    Department - 1hot
    WT - 1hot
    WFC - 1hot
    bed grade - ordinalencoder
    ToA - 1hot
    Severity of illness - ordinal
    VwP - none
    Age - ordinal
    Admission_depo (none)

    stay - ordinal

    all X encodings need to be standardized to ensure scales are the same

In [52]:
onehot_features = [
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Type of Admission',
]

ordinal_features = [
    'Bed Grade',
    'Severity of Illness',
    'Age',
    'Admission_Deposit',
    # 'Stay'
]

numeric_features = [
    'Visitors with Patient',
    'Age',
    'Admission_Deposit',
]

In [53]:
df_train_vars1 = df_train[vars1]

In [57]:
col_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), onehot_features),
        ('ordinal', OrdinalEncoder(), ordinal_features)
    ]
)

# all features then need to be scaled and then imputed

knn_pipeline = Pipeline(
    [
        ('col_transform', col_transformer),
        ('scaler', MinMaxScaler()),
        ('knn_imputer', KNNImputer(n_neighbors=50)),
        ('knn', KNeighborsClassifier(n_neighbors=50))
    ]
)





In [55]:
X_train = df_train.drop('Stay', axis=1)
y_train = df_train['Stay']

In [58]:
knn_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('col_transform', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,n_neighbors,50
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_neighbors,50
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,
