In [97]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report 
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

# Data set

In [98]:
df=pd.read_excel("Training_Data.xlsx")

In [99]:
df

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Gender,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,ChronicObstructivePulmonaryDisease,...,BetaBlockers,Diuretics,TotalMedicine,CardiacTroponin,Hemoglobin,SerumSodium,SerumCreatinine,BNP,NT-proBNP,ReadmissionWithin_90Days
0,4200412,199171333,Home,Male,White,DM,,Anemia,,COPD,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,Yes
1,4055894,26704337,Home,Male,White,DM,CKD,Anemia,Depression,COPD,...,1,5,8,0.0,0.00,0.0,1.540000,0.0,0.0,No
2,4867407,60388216,Home,Male,White,DM,CKD,Anemia,,COPD,...,1,1,2,0.0,10.20,0.0,0.000000,0.0,0.0,No
3,4058064,274642265,Hospice - Home,Female,White,DM,,Anemia,,COPD,...,0,0,0,0.0,0.00,132.0,0.000000,0.0,0.0,No
4,4150623,70000001557327,SNF,Female,White,,,Anemia,,COPD,...,0,0,0,0.0,7.26,0.0,0.000000,0.0,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,40004415567,Home Health,Female,White,DM,CKD,Anemia,,COPD,...,2,3,6,0.0,0.00,0.0,1.076667,0.0,0.0,Yes
8477,4042227,14347947026,SNF,Male,White,DM,CKD,,Depression,COPD,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,Yes
8478,4603405,67117733,Hospice,Female,White,,,,,,...,1,0,1,0.0,0.00,0.0,0.000000,0.0,0.0,No
8479,4033677,68564389,Home,Female,White,DM,CKD,,Depression,,...,1,1,5,0.0,0.00,0.0,1.690000,0.0,0.0,Yes


In [100]:
#Checking total numbers of columns
len(df.columns)

57

In [101]:
dataCols = df.columns.values.tolist()

In [102]:
type(dataCols)
dataCols.pop()
#removing the last column

'ReadmissionWithin_90Days'

In [103]:
len(dataCols)

56

# Preprocessing 

In [104]:
print(df.isnull().sum())

PatientId                                0
EncounterId                              0
DischargeDisposision                     0
Gender                                   0
Race                                    93
DiabetesMellitus                      3857
ChronicKidneyDisease                  3906
Anemia                                3002
Depression                            5108
ChronicObstructivePulmonaryDisease    3954
Age                                      0
ChronicDiseaseCount                      0
LengthOfStay                             0
EmergencyVisit                           0
InpatientVisit                           0
OutpatientVisit                          0
TotalVisits                              0
BMIMin                                   0
BMIMax                                   0
BMIMedian                                0
BMIMean                                  0
BPDiastolicMin                           0
BPDiastolicMax                           0
BPDiastolic

In [105]:
def preprocess(df):
    df['EncounterId']= pd.to_numeric(df['EncounterId'],errors = "coerce")
    df["EncounterId"].isnull().sum()
    df['EncounterId'].fillna(0,inplace= True)
    
    #For race
    rmode= df['Race'].mode()
    df['Race'].fillna(rmode [0], inplace=True) #adding 0 so it applies to the entire col
    #for others 
    df['DiabetesMellitus'].fillna("0", inplace=True)
    df['ChronicKidneyDisease'].fillna("0", inplace=True)
    df['Anemia'].fillna("0", inplace=True)
    df['Depression '].fillna("0", inplace=True)
    df['ChronicObstructivePulmonaryDisease'].fillna("0", inplace=True)
    
    le = LabelEncoder()
    cols = ['Race','DischargeDisposision','Gender','DiabetesMellitus','ChronicKidneyDisease','Anemia','Depression ','ChronicObstructivePulmonaryDisease','ReadmissionWithin_90Days']
    
    
    df[cols] = df[cols].apply(le.fit_transform)
    return df    

In [106]:
df = preprocess(df)
df

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Gender,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,ChronicObstructivePulmonaryDisease,...,BetaBlockers,Diuretics,TotalMedicine,CardiacTroponin,Hemoglobin,SerumSodium,SerumCreatinine,BNP,NT-proBNP,ReadmissionWithin_90Days
0,4200412,1.991713e+08,7,1,6,1,0,1,0,1,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,1
1,4055894,2.670434e+07,7,1,6,1,1,1,1,1,...,1,5,8,0.0,0.00,0.0,1.540000,0.0,0.0,0
2,4867407,6.038822e+07,7,1,6,1,1,1,0,1,...,1,1,2,0.0,10.20,0.0,0.000000,0.0,0.0,0
3,4058064,2.746423e+08,11,0,6,1,0,1,0,1,...,0,0,0,0.0,0.00,132.0,0.000000,0.0,0.0,0
4,4150623,7.000000e+13,19,0,6,0,0,1,0,1,...,0,0,0,0.0,7.26,0.0,0.000000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,4.000442e+10,8,0,6,1,1,1,0,1,...,2,3,6,0.0,0.00,0.0,1.076667,0.0,0.0,1
8477,4042227,1.434795e+10,19,1,6,1,1,0,1,1,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,1
8478,4603405,6.711773e+07,10,0,6,0,0,0,0,0,...,1,0,1,0.0,0.00,0.0,0.000000,0.0,0.0,0
8479,4033677,6.856439e+07,7,0,6,1,1,0,1,0,...,1,1,5,0.0,0.00,0.0,1.690000,0.0,0.0,1


In [107]:
print(df.isnull().sum())

PatientId                             0
EncounterId                           0
DischargeDisposision                  0
Gender                                0
Race                                  0
DiabetesMellitus                      0
ChronicKidneyDisease                  0
Anemia                                0
Depression                            0
ChronicObstructivePulmonaryDisease    0
Age                                   0
ChronicDiseaseCount                   0
LengthOfStay                          0
EmergencyVisit                        0
InpatientVisit                        0
OutpatientVisit                       0
TotalVisits                           0
BMIMin                                0
BMIMax                                0
BMIMedian                             0
BMIMean                               0
BPDiastolicMin                        0
BPDiastolicMax                        0
BPDiastolicMedian                     0
BPDiastolicMean                       0


# First model before GA

In [108]:
X = df.drop('ReadmissionWithin_90Days',axis=1)
y = df['ReadmissionWithin_90Days']
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2 , random_state = 60)

In [125]:
rm = RandomForestClassifier(n_estimators = 10, max_depth=25, criterion = "gini", min_samples_split=10)
rm.fit(X_train, y_train)
pred = rm.predict(X_test)
print ("Accuracy : " , accuracy_score(y_test,pred)*100)  
print("Report : \n", classification_report(y_test, pred))
print("F1 Score : ",f1_score(y_test, pred, average='macro')*100)

#For comparison 
OriginalAccuracyRFC = accuracy_score(y_test,pred)*100

Accuracy :  70.71302298173246
Report : 
               precision    recall  f1-score   support

           0       0.74      0.88      0.80      1163
           1       0.56      0.33      0.41       534

    accuracy                           0.71      1697
   macro avg       0.65      0.60      0.61      1697
weighted avg       0.68      0.71      0.68      1697

F1 Score :  60.90458066044317


In [126]:
lg = lgb.LGBMClassifier()
lg.fit(X_train, y_train)
pred=lg.predict(X_test)
lg.score(X_test,y_test)
print ("Accuracy : " , accuracy_score(y_test,pred)*100)  
print("Report : \n", classification_report(y_test, pred))
print("F1 Score : ",f1_score(y_test, pred, average='macro')*100)
#For comparison 
OriginalAccuracyLGB = accuracy_score(y_test,pred)*100

Accuracy :  70.59516794342959
Report : 
               precision    recall  f1-score   support

           0       0.74      0.87      0.80      1163
           1       0.55      0.34      0.42       534

    accuracy                           0.71      1697
   macro avg       0.65      0.61      0.61      1697
weighted avg       0.68      0.71      0.68      1697

F1 Score :  61.290509854474216


# GA

In [127]:
def Population(row,col):
    #print("Function: population")
    Allel = [True,False]
    popu = np.random.choice(Allel,size=(Size_rows,col))
    
    return popu    

In [128]:
def Sort(population, accuracy):
    #print("Function: Sorting")
    index_list=[]
    sort_Acc= sorted(accuracy,reverse = True)
    for i in range(len(accuracy)):
        for j in range(len(accuracy)):
            if sort_Acc[i] == accuracy[j]:
                index_list.append(j)
                
    ind_chromo = 0
    for i in range(len(accuracy)):
        population[i],population[index_list[i]] = population[index_list[i]],population[i]
        ind_chromo = ind_chromo + 1
        
    return population

In [129]:
def crossover(population,size):
    #print("Function: crossover")
    split = np.random.randint(10,56)
    # split = 56//2
    size = size//2
    if (size % 2 == 1):
        size+=1
    population = population[:size]   # splitting half the features which gave lower accuraices
    for i in range(0,size,2):
        if i+1 == (size-1):
            break
        # crossover
        c1 = population[i,:split]
        c2 = population[i,split:]
        c3 = population[i+1,:split]
        c4 = population[i+1,split:]
        #joining the cross values
        child1 = np.concatenate((c1,c4),axis=0)
        child2 = np.concatenate((c2,c3),axis=0)
        child1 = child1.reshape(56,1)
        child2 = child2.reshape(56,1)
        S1 = np.concatenate((child1,child2),axis=0)
        S1 = S1.reshape(2,56)
        l = np.concatenate((population,S1),axis=0)

    l = l[:size]
    l = l = np.concatenate((population,l),axis=0)

    return l

In [130]:
def mutation(newPopu):
    #print("Function: mutation")
    for r in range(len(newPopu)):
        random = np.random.randint(56,size = 15)
        for i in random:
            if newPopu[r][i] == False:
                newPopu[r][i] = True
            else:
                newPopu[r][i] = False
    return newPopu

In [144]:
Size_rows = 50
population = Population(Size_rows,56)

for i in range(50):
    accuracy = []
    for i in range(0,Size_rows):

#         =======> LGB CLASSIFIER <=======
#         x = X.loc[:,S_features[i]] #X is training data set
#         X_train, X_test, y_train, y_test = train_test_split(x, y , test_size = 0.2 , random_state = 60)
        
#         lg = lgb.LGBMClassifier()
#         lg.fit(X_train, y_train)
#         pred=lg.predict(X_test)
#         lg.score(X_test,y_test)
#         accuracy.append(accuracy_score(y_test,pred)*100)
        
#       =======> RFC CLASSIFIER <======= 
        x = X.loc[:,population[i]] #X is training data set
        X_train, X_test, y_train, y_test = train_test_split(x, y , test_size = 0.2 , random_state = 60)
        rm = RandomForestClassifier(n_estimators = 20, max_depth=25, criterion = "entropy", min_samples_split=10)
        rm.fit(X_train, y_train)
        pred=rm.predict(X_test)
        accuracy.append(accuracy_score(y_test,pred)*100)

        #Saving latest accuracy incase the  
        ongoing_Acc = accuracy_score(y_test,pred)*100   
        ongoing_max_acc = max(accuracy)
        
    
    print('max : ',max(accuracy))
    
#   CONDITION TO TERMINATE WHEN THRESHOLD/DESIRED ACCURACY IS ACHIEVED
    if max(accuracy) >= 73:
        print("Report : \n", classification_report(y_test, pred))
        print("F1 Score : ",f1_score(y_test, pred, average='macro')*100)
        
        
        
        break

#   CALLING SORT FUNCTION TO SORT THE ACCURICIES (Get the highest ones abovoe and lower ones below)
    population = Sort(population,accuracy)

#   CALLING CROSS OVER FUNCTION (SINGLE POINT CROSSOVER)
    new_population = crossover(population,Size_rows)

#   CALLING MUTATION FUNCTION IF THE RANDOM NUMBER IS ABOVE 0.5
    n = np.random.uniform(0.0,1.0)
    if n > 0.5:
        new_population = mutation(new_population)

#   UPDATING POPULATION FOR NEXT ITERATION
    population = new_population


max :  72.00942840306423
max :  71.65586328815556
max :  72.12728344136711
max :  72.30406599882146
max :  72.18621096051857
max :  71.95050088391278
max :  72.775486152033
max :  72.24513847967
max :  72.24513847967
max :  72.42192103712433
max :  72.30406599882146
max :  72.06835592221567
max :  72.42192103712433
max :  72.12728344136711
max :  71.89157336476134
max :  72.24513847967
max :  72.30406599882146
max :  72.65763111373012
max :  71.53800824985268
max :  72.00942840306423
max :  71.53800824985268
max :  71.89157336476134
max :  71.59693576900412
max :  71.3022981732469
max :  71.77371832645846
max :  72.00942840306423
max :  71.65586328815556
max :  71.4201532115498
max :  71.77371832645846
max :  72.12728344136711
max :  72.3629935179729
max :  71.77371832645846
max :  71.89157336476134
max :  71.95050088391278
max :  72.3629935179729
max :  71.89157336476134
max :  72.3629935179729
max :  73.36476134354743
Report : 
               precision    recall  f1-score   support



In [145]:
print("Comparison:")
print("Result:")
print("=============================================================")
print("Original Accuracy: ",OriginalAccuracyRFC)
print("heighest GA Accuracy: ",ongoing_max_acc)
print("=============================================================")

Comparison:
Result:
Original Accuracy:  70.71302298173246
heighest GA Accuracy:  73.36476134354743


In [146]:
# import pickle
# filename = 'GA.pkl'
# pickle.dump(rm, open(filename, 'wb'))

In [None]:
S