In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [5]:
df_TFIDF40 = pd.read_csv('input_ICD9_TFIDF_40.csv')
df_TFIDF40.rename(columns={'Label': 'ICD9'}, inplace=True)
df_TM5 = pd.read_csv('input_ICD9_TM_5.csv')
df_TM20 = pd.read_csv('input_ICD9_TM_20.csv')
df_TM30 = pd.read_csv('input_ICD9_TM_30.csv')
df_TM30.rename(columns={'top_icd': 'ICD9'}, inplace=True)
df_TM39 = pd.read_csv('input_ICD9_TM_39.csv')

df_list = [df_TFIDF40, df_TM5, df_TM20, df_TM30, df_TM39]
files_list = ['TFIDF_40', 'TM_5', 'TM_20', 'TM_30', 'TM_39']

In [6]:
# Some data quality checks
# Label is consistent
print([True for df in df_list if 'ICD9' in df.columns])
print([df.shape for df in df_list])
print([df['ICD9'].value_counts() for df in df_list])

[True, True, True, True, True]
[(11537, 41), (11537, 6), (11537, 21), (11537, 31), (11537, 40)]
[414    3502
38     3184
410    3175
424    1676
Name: ICD9, dtype: int64, 414    3502
38     3184
410    3175
424    1676
Name: ICD9, dtype: int64, 414    3502
38     3184
410    3175
424    1676
Name: ICD9, dtype: int64, 414    3502
38     3184
410    3175
424    1676
Name: ICD9, dtype: int64, 414    3502
38     3184
410    3175
424    1676
Name: ICD9, dtype: int64]


In [8]:
def get_classification_metrics_rf(df: pd.DataFrame, label_col:str):
    '''
    Get accuracy and F1 metrics from Random Forest
    '''
    # Train test split
    X = df.drop(columns=[label_col])
    y = df[label_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Random Forest Classifer
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    rf_prob = rf.predict_proba(X_test)
    rf_acc = balanced_accuracy_score(y_test, rf_pred)
    rf_f1 = f1_score(y_test, rf_pred, average = 'weighted')
    rf_auc = roc_auc_score(y_test, rf_prob, multi_class='ovr', average='macro')
    
    # Construct results
    results = dict()
    results['pred'], results['pred_prob'] = rf_pred, rf_prob
    results['acc'], results['f1'], results['auc'] = rf_acc, rf_f1, rf_auc
    results['model'] = rf
        
    return results

In [48]:
def F1(pred, true, clabel): # Accuracy / F1 / Precision / Recall Output
    TP,FP,FN=0,0,0 
    for i in range(len(pred)):
        if pred[i] == true[i] and pred[i] == clabel: # only for minority class.
            TP+=1
        if pred[i] == clabel and true[i] != clabel:
            FP+=1
        if pred[i] != clabel and true[i] == clabel:
            FN+=1
    if TP==0:
        precision=0
        recall=0
        f1=0
    else:
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        f1 = 2*TP/(2*TP+FP+FN)

    return precision,recall,f1


def get_classification_metrics_km(df: pd.DataFrame, label_col:str): #PY Double check
    '''
    Get accuracy and F1 metrics from Kmeans
    '''
    # Train test split
    X = df.drop(columns=[label_col])
    y = df[label_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Kmeans Clustering
    km = KMeans()
    km.fit(X_train)
    km_pred = km.predict(X_test)
    
    ICD9_CODE_map = {
    '414':  0, #chronic heart
    '38':  1, #sepsis
    '410': 2, #heart attack
    '424': 3, #diseases of endocardium
    }
    
    y_train_km = y_train.map(ICD9_CODE_map)
    y_test_km = y_test.map(ICD9_CODE_map)
    pred = kmeans.labels_
    true = y_test_km
    
    acc=0
    max = 0
    predTemp=[-1,-1,-1,-1]
    predNew=[-1]*len(pred)
    predAssign = pd.Series(pred)
    predFinal=[-1]*len(pred)

    for i in range(4):
        predTemp[i]=0
        for j in range(4):
            if j!=i:
                predTemp[j]=1
            else:
                continue
            for k in range(4):
                if k!=i and k!=j:
                    predTemp[k]=2
                else:
                    continue
                for l in range(4):
                    if l!=i and l!=k and l!=j:
                        predTemp[l]=3
                        pred_map = {
                            0: predTemp[0],
                            1: predTemp[1],
                            2: predTemp[2],
                            3: predTemp[3],
                        }
                        predNew = predAssign.map(pred_map)
                        predNew = predNew.values
                        acc = accuracy_score(true, predNew)
                        if acc > max: 
                            max = acc
                            predFinal = predNew  
                    else:
                        continue

    #Assign new class to pred.
    precision_c0,recall_c0,f1_c0=F1(predFinal,true.tolist(),0)
    precision_c1,recall_c1,f1_c1=F1(predFinal,true.tolist(),1)
    precision_c2,recall_c2,f1_c2=F1(predFinal,true.tolist(),2)
    precision_c3,recall_c3,f1_c3=F1(predFinal,true.tolist(),3)

    preAvg=(precision_c0+precision_c1+precision_c2+precision_c3)/4
    reAvg=(recall_c0+recall_c1+recall_c2+recall_c3)/4
    f1Avg=(f1_c0+f1_c1+f1_c2+f1_c3)/4
    
    # Weighted F1 (PY double check)
    f1Weighted = np.average([f1_c0, f1_c1, f1_c2, f1_c3], weights=[len(predFinal==0), len(predFinal==1), len(predFinal==2), len(predFinal==3)])
    
    # Accuracy
    bal_acc = balanced_accuracy_score(true, predFinal)
    
    # Results
    results = dict()
    results['pred'] = rf_pred, pred
    results['acc'], results['f1'], results['f1_weighted'] = bal_acc, f1Avg, f1Weighted
    results['model'] = km
    
    return results    

In [9]:
rf_results_list = [get_classification_metrics_rf(df, 'ICD9') for df in df_list]

In [10]:
rf_results_list

[{'pred': array([ 38, 410, 410, ...,  38, 414, 414], dtype=int64),
  'pred_prob': array([[0.9 , 0.03, 0.04, 0.03],
         [0.  , 0.99, 0.01, 0.  ],
         [0.04, 0.45, 0.37, 0.14],
         ...,
         [0.95, 0.02, 0.01, 0.02],
         [0.09, 0.3 , 0.59, 0.02],
         [0.  , 0.07, 0.76, 0.17]]),
  'acc': 0.8106023066881761,
  'f1': 0.8133501566252203,
  'auc': 0.9475385019950848,
  'model': RandomForestClassifier()},
 {'pred': array([ 38, 410, 414, ...,  38, 424, 414], dtype=int64),
  'pred_prob': array([[1.  , 0.  , 0.  , 0.  ],
         [0.  , 0.81, 0.14, 0.05],
         [0.  , 0.01, 0.83, 0.16],
         ...,
         [0.98, 0.  , 0.  , 0.02],
         [0.03, 0.25, 0.14, 0.58],
         [0.  , 0.  , 0.71, 0.29]]),
  'acc': 0.6268615502979251,
  'f1': 0.6597444348864806,
  'auc': 0.8712997244654279,
  'model': RandomForestClassifier()},
 {'pred': array([ 38, 410, 414, ...,  38, 414, 424], dtype=int64),
  'pred_prob': array([[0.99, 0.01, 0.  , 0.  ],
         [0.25, 0.34, 0.2

In [49]:
km_results_list = [get_classification_metrics_km(df, 'ICD9') for df in df_list]

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [73]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

d = pd.read_csv('input_ICD9_TFIDF_40.csv')
d.rename(columns={'Label': 'ICD9'}, inplace=True)

ICD9_CODE_map = {
    424:  0,
    414:  1,
    410: 2,
    38: 3,
}

d["ICD9"] = d["ICD9"].map(ICD9_CODE_map)

#X = d.values
#X = X[:,1:len(d.columns)]
X = d.drop(columns=['ICD9'])
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)

pred = kmeans.labels_
true = d["ICD9"].values
pd.DataFrame(kmeans.labels_).to_csv("kmeanlabels.csv")

def F1(pred, true, clabel): # Accuracy / F1 / Precision / Recall Output
    TP,FP,FN=0,0,0 
    for i in range(len(pred)):
        if pred[i] == true[i] and pred[i] == clabel: # only for minority class.
            TP+=1
        if pred[i] == clabel and true[i] != clabel:
            FP+=1
        if pred[i] != clabel and true[i] == clabel:
            FN+=1
    if TP==0:
        precision=0
        recall=0
        f1=0
    else:
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        f1 = 2*TP/(2*TP+FP+FN)

    return precision,recall,f1

#4 classes
#pred - kmeans.labels_ - 0/1/2/3
#true - ICD9 

acc=0
max = 0
predTemp=[-1,-1,-1,-1]
predNew=[-1]*len(pred)
predAssign = pd.Series(pred)
predFinal=[-1]*len(pred)

from sklearn.metrics import accuracy_score

for i in range(4):
    predTemp[i]=0
    for j in range(4):
        if j!=i:
            predTemp[j]=1
        else:
            continue
        for k in range(4):
            if k!=i and k!=j:
                predTemp[k]=2
            else:
                continue
            for l in range(4):
                if l!=i and l!=k and l!=j:
                    predTemp[l]=3
                    pred_map = {
                        0: predTemp[0],
                        1: predTemp[1],
                        2: predTemp[2],
                        3: predTemp[3],
                    }
                    predNew = predAssign.map(pred_map)
                    predNew = predNew.values
                    acc = accuracy_score(true, predNew)
                    if acc > max: 
                        max = acc
                        predFinal = predNew  
                else:
                    continue
 


#Assign new class to pred.
precision_c0,recall_c0,f1_c0=F1(predFinal,true,0)
precision_c1,recall_c1,f1_c1=F1(predFinal,true,1)
precision_c2,recall_c2,f1_c2=F1(predFinal,true,2)
precision_c3,recall_c3,f1_c3=F1(predFinal,true,3)

preAvg=(precision_c0+precision_c1+precision_c2+precision_c3)/4
reAvg=(recall_c0+recall_c1+recall_c2+recall_c3)/4
f1Avg=(f1_c0+f1_c1+f1_c2+f1_c3)/4


from sklearn.metrics import balanced_accuracy_score
print("Balanced Acc:",round(balanced_accuracy_score(true, predFinal),2))

from sklearn.metrics import accuracy_score
print("Acc:",round(accuracy_score(true, predFinal),2))
print("Avg. precision:",round(preAvg,2))
print("Avg. recall:",round(reAvg,2))
print("Avg. F1:",round(f1Avg,2))

from sklearn.metrics import f1_score
print("weighted F1:",round(f1_score(true, predFinal, average='weighted'),2))

Balanced Acc: 0.48
Acc: 0.49
Avg. precision: 0.49
Avg. recall: 0.48
Avg. F1: 0.44
weighted F1: 0.42


In [72]:
d

Unnamed: 0,valve,cardiac,chest,aortic,test,elevation,stress,coronary,heparin,catheterization,...,sepsis,diarrhea,micu,male,started,plavix,exertion,left,severe,ICD9
0,0.086525,0.045055,0.000000,0.152823,0.000000,0.000000,0.000000,0.200528,0.000000,0.056237,...,0.0,0.000000,0.000000,0.055850,0.000000,0.000000,0.070841,0.086614,0.000000,0
1,0.000000,0.000000,0.065119,0.000000,0.000000,0.092086,0.000000,0.000000,0.082676,0.000000,...,0.0,0.000000,0.000000,0.030352,0.098400,0.050072,0.000000,0.023535,0.000000,2
2,0.000000,0.133376,0.047296,0.000000,0.000000,0.000000,0.000000,0.029681,0.045036,0.033296,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.041942,0.025640,0.000000,1
3,0.000000,0.124666,0.018420,0.000000,0.000000,0.000000,0.000000,0.000000,0.035079,0.000000,...,0.0,0.000000,0.000000,0.025756,0.027834,0.000000,0.000000,0.079887,0.000000,2
4,0.000000,0.098397,0.087232,0.000000,0.000000,0.185034,0.000000,0.182477,0.000000,0.122819,...,0.0,0.000000,0.000000,0.040658,0.000000,0.000000,0.000000,0.063053,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11532,0.000000,0.000000,0.020775,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.046481,0.050865,0.000000,0.000000,0.000000,0.000000,0.045049,0.000000,3
11533,0.000000,0.000000,0.080510,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.045033,0.000000,0.000000,0.030414,0.000000,0.000000,0.065469,0.033391,3
11534,0.000000,0.028020,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.082418,3
11535,0.000000,0.042008,0.037241,0.000000,0.074883,0.000000,0.073181,0.093485,0.000000,0.052435,...,0.0,0.000000,0.000000,0.052074,0.000000,0.000000,0.000000,0.040379,0.000000,1


In [47]:
d = pd.read_csv('input_ICD9_TFIDF_40.csv')
d.columns = list(range(0,40)) + ['ICD9']
d.columns

Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            9,     10,     11,     12,     13,     14,     15,     16,     17,
           18,     19,     20,     21,     22,     23,     24,     25,     26,
           27,     28,     29,     30,     31,     32,     33,     34,     35,
           36,     37,     38,     39, 'ICD9'],
      dtype='object')