In [15]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder

In [16]:
# Training set; the target column `readmitted_NO` is split off from it in a later cell.
data = pd.read_csv("input/TRAIN.csv")
# Test set. index_col=47 selects the CSV column at 0-based position 47 as the row index
# NOTE(review): positional index is brittle if the column order of TEST.csv changes — confirm.
test = pd.read_csv("input/TEST.csv", index_col=47)

# Preview the first rows of the test frame.
test.head()

Unnamed: 0_level_0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Caucasian,Female,[80-90),?,2,3,7,6,MC,Emergency/Trauma,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,[40-50),?,1,1,7,2,CP,?,...,No,No,Down,No,No,No,No,No,Ch,Yes
2,Caucasian,Male,[50-60),?,3,1,1,1,CP,?,...,No,No,No,No,No,No,No,No,No,Yes
3,Caucasian,Female,[50-60),?,1,1,7,3,HM,Family/GeneralPractice,...,No,No,No,No,No,No,No,No,Ch,Yes
4,Caucasian,Male,[80-90),?,2,3,7,5,MC,Emergency/Trauma,...,No,No,Down,No,No,No,No,No,Ch,Yes


In [17]:
# Medication columns; combined with `cat_col` and passed to process_data as
# columns to label-encode.
medication_features = [
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'examide',
    'citoglipton',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

# Remaining categorical columns that are label-encoded.
cat_col = [
    'race',
    'gender',
    'age',
    'weight',
    'payer_code',
    'medical_specialty',
    'max_glu_serum',
    'A1Cresult',
    'change',
    'diabetesMed',
]

# Numeric count columns (not referenced by the other cells visible here).
num_cols = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]

# Name of the prediction target in the training data.
target_col = ['readmitted_NO']

In [18]:
from copy import deepcopy
def process_data(df, labelencoder=None, cat_cols=None, encode=True):
    """Clean a copy of ``df`` and convert every column to float.

    The frame must contain the columns ``medical_specialty``, ``diag_1``,
    ``diag_2``, ``diag_3``, ``max_glu_serum``, ``A1Cresult``, ``age`` and
    ``weight``. Processing steps:

    * ``"?"`` and missing values become the string ``"NaN"``;
    * ``medical_specialty`` is split on ``-`` into ``medical_field`` (text
      before the first dash, ``"NaN"`` when there is no dash) and
      ``medical_specialty`` (the rest, with further dashes replaced by
      ``"and"``);
    * ``diag_1``..``diag_3`` keep only their first run of digits
      (``"250.83"`` -> ``"250"``, ``"428"`` -> ``"428"``);
    * ``age`` and ``weight`` keep the lower bound of their ``[lo-hi)`` bucket;
    * the categorical columns are label-encoded and the whole frame is cast
      to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    labelencoder : dict, optional
        Mapping of column name to a fitted encoder exposing ``transform``.
        With ``encode=True`` newly fitted encoders are stored in this dict
        (a fresh dict is created when omitted). With ``encode=False`` the
        encoders in it are applied.
    cat_cols : list of str, optional
        Columns to label-encode in addition to the derived ones
        (``medical_field``, ``diag_1``..``diag_3``, ``weight``, ``age``).
        The caller's list is not modified.
    encode : bool, default True
        ``True`` fits a new ``LabelEncoder`` per column; ``False`` reuses the
        encoders in ``labelencoder``.

    Returns
    -------
    (pandas.DataFrame, dict)
        The processed float frame and the encoder mapping.

    Raises
    ------
    ValueError
        From the final ``astype(float)`` when a column still holds
        non-numeric text (e.g. a categorical column that was not encoded).
    """
    # Fresh containers per call: mutable defaults would be shared between calls.
    if labelencoder is None:
        labelencoder = {}
    requested_cols = [] if cat_cols is None else list(cat_cols)
    derived_cols = ['medical_field', 'diag_1', 'diag_2', 'diag_3', 'weight', 'age']
    # dict.fromkeys drops duplicates while keeping first-seen order. Encoding a
    # column twice would overwrite its stored encoder with one fitted on the
    # already-encoded integers, making it useless for raw data later.
    columns_to_encode = list(dict.fromkeys(requested_cols + derived_cols))

    df = deepcopy(df)
    df = df.replace("?", 'NaN').replace(np.nan, 'NaN')

    specialty = (
        df['medical_specialty']
        .str.replace("&", 'and', regex=False)
        .str.replace("/", 'or', regex=False)
    )

    # Capture the leading run of digits. The previous pattern r'(\d+).' had an
    # unescaped dot that swallowed the last digit of codes without a decimal
    # part ('428' -> '42') and rejected single-digit codes entirely.
    for diag_col in ('diag_1', 'diag_2', 'diag_3'):
        df[diag_col] = df[diag_col].str.extract(r'(\d+)', expand=False)

    # Split "Field-Specialty" labels; building plain lists also works for an
    # empty frame.
    fields, specialties = [], []
    for label in specialty:
        if not isinstance(label, str):
            # Non-text entries are treated as missing on both sides.
            fields.append("NaN")
            specialties.append("NaN")
            continue
        head, dash, tail = label.partition('-')
        if dash:
            fields.append(head)
            specialties.append(tail.replace('-', 'and'))
        else:
            # No dash: the whole label is the specialty, the field is unknown.
            fields.append("NaN")
            specialties.append(head)
    df['medical_field'] = fields
    df['medical_specialty'] = specialties

    df['max_glu_serum'] = df['max_glu_serum'].replace("None", "NaN")
    df['A1Cresult'] = df['A1Cresult'].replace("None", "NaN")
    # Keep the lower bound of "[lo-hi)" buckets; anything else becomes missing.
    df['age'] = df['age'].str.extract(r'\[(\d+)-', expand=False)
    df['weight'] = df['weight'].str.extract(r'\[(\d+)-', expand=False)
    # The extractions above reintroduce real NaNs; normalise them again.
    df = df.replace(np.nan, 'NaN')

    if encode:
        for col in columns_to_encode:
            labelencoder[col] = LabelEncoder().fit(df[col])
            df[col] = labelencoder[col].transform(df[col])
    else:
        for col in columns_to_encode:
            try:
                df[col] = labelencoder[col].transform(df[col])
            except Exception:
                # Best effort: missing encoder or unseen label. Report the
                # column and keep it as text; the float cast below still
                # fails loudly if that text is not numeric.
                print(col)
                df[col] = df[col].astype(str)
    df = df.astype(float)
    return df, labelencoder

In [19]:
# Separate the target from the training features.
y = data.readmitted_NO
X = data.drop(columns=["readmitted_NO"])

# Stack train and test under the outer index keys 'X' and 'test' so both are
# cleaned and label-encoded in a single pass with shared encoders.
d = pd.concat([X, test], keys=['X', 'test'], axis=0, sort=False)
all_categoricals = cat_col + medication_features
processed_Xandtest, le = process_data(d, cat_cols=all_categoricals, encode=True)


In [20]:
# Feature subset used by the model in the following cells.
best_cols = ['num_lab_procedures', 'num_medications', 'number_diagnoses']

# Undo the concat: pull each part back out by its outer index key.
processed_X, processed_test = (
    processed_Xandtest.loc[part] for part in ('X', 'test')
)


In [21]:
# Restrict both frames to the selected feature columns.
processed_X, processed_test = (
    processed_X.loc[:, best_cols],
    processed_test.loc[:, best_cols],
)

In [22]:
from sklearn.cluster import KMeans

# KMeans initialisation is random; a fixed seed keeps the cluster assignment
# (and therefore the written predictions) reproducible across runs.
KMEANS_SEED = 42

clf = KMeans(n_clusters=2, algorithm='elkan', random_state=KMEANS_SEED)
# KMeans is unsupervised: `fit` ignores any `y`, so only the features are passed.
clf.fit(processed_X)
# NOTE(review): cluster ids (0/1) are arbitrary and are not aligned with the
# values of `readmitted_NO`; confirm the mapping before treating them as labels.
pred = clf.predict(processed_test)

In [23]:
# Sanity check: display both shapes to compare the number of predictions with the number of test rows.
pred.shape, test.shape

((30530,), (30530, 47))

In [26]:
# Attach the predictions to the test frame. This mutates `test` in place, so
# re-running earlier cells afterwards will see the extra `target` column.
test['target'] = pred
# Write only the `target` column; index=True keeps the test index as the row identifier.
test[['target']].to_csv("output.csv", index=True)