In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report 
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

# Loading Dataset

In [2]:
df=pd.read_excel("Training_Data.xlsx")

In [3]:
df

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Gender,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,ChronicObstructivePulmonaryDisease,...,BetaBlockers,Diuretics,TotalMedicine,CardiacTroponin,Hemoglobin,SerumSodium,SerumCreatinine,BNP,NT-proBNP,ReadmissionWithin_90Days
0,4200412,199171333,Home,Male,White,DM,,Anemia,,COPD,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,Yes
1,4055894,26704337,Home,Male,White,DM,CKD,Anemia,Depression,COPD,...,1,5,8,0.0,0.00,0.0,1.540000,0.0,0.0,No
2,4867407,60388216,Home,Male,White,DM,CKD,Anemia,,COPD,...,1,1,2,0.0,10.20,0.0,0.000000,0.0,0.0,No
3,4058064,274642265,Hospice - Home,Female,White,DM,,Anemia,,COPD,...,0,0,0,0.0,0.00,132.0,0.000000,0.0,0.0,No
4,4150623,70000001557327,SNF,Female,White,,,Anemia,,COPD,...,0,0,0,0.0,7.26,0.0,0.000000,0.0,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,40004415567,Home Health,Female,White,DM,CKD,Anemia,,COPD,...,2,3,6,0.0,0.00,0.0,1.076667,0.0,0.0,Yes
8477,4042227,14347947026,SNF,Male,White,DM,CKD,,Depression,COPD,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,Yes
8478,4603405,67117733,Hospice,Female,White,,,,,,...,1,0,1,0.0,0.00,0.0,0.000000,0.0,0.0,No
8479,4033677,68564389,Home,Female,White,DM,CKD,,Depression,,...,1,1,5,0.0,0.00,0.0,1.690000,0.0,0.0,Yes


In [4]:
dataCols = df.columns.values.tolist()

In [5]:
type(dataCols)
dataCols.pop()
#removing the last column 

'ReadmissionWithin_90Days'

The column "ReadmissionWithin_90Days" was removed, because it is the column containing the labels

In [6]:
len(dataCols)

56

# Data Preprocessing 
### -> Converting to neumeric 
### ->Filling nan values
### -> Using Label Encoder where necessary

In [7]:
print(df.isnull().sum())

PatientId                                0
EncounterId                              0
DischargeDisposision                     0
Gender                                   0
Race                                    93
DiabetesMellitus                      3857
ChronicKidneyDisease                  3906
Anemia                                3002
Depression                            5108
ChronicObstructivePulmonaryDisease    3954
Age                                      0
ChronicDiseaseCount                      0
LengthOfStay                             0
EmergencyVisit                           0
InpatientVisit                           0
OutpatientVisit                          0
TotalVisits                              0
BMIMin                                   0
BMIMax                                   0
BMIMedian                                0
BMIMean                                  0
BPDiastolicMin                           0
BPDiastolicMax                           0
BPDiastolic

In [8]:
def preprocess(df):
    df['EncounterId']= pd.to_numeric(df['EncounterId'],errors = "coerce")
    df["EncounterId"].isnull().sum()
    df['EncounterId'].fillna(0,inplace= True)
    
    #For race
    rmode= df['Race'].mode()
    df['Race'].fillna(rmode [0], inplace=True) #adding 0 so it applies to the entire col
    #for others 
    df['DiabetesMellitus'].fillna("0", inplace=True)
    df['ChronicKidneyDisease'].fillna("0", inplace=True)
    df['Anemia'].fillna("0", inplace=True)
    df['Depression '].fillna("0", inplace=True)
    df['ChronicObstructivePulmonaryDisease'].fillna("0", inplace=True)
    
    le = LabelEncoder()
    cols = ['Race','DischargeDisposision','Gender','DiabetesMellitus','ChronicKidneyDisease','Anemia','Depression ','ChronicObstructivePulmonaryDisease','ReadmissionWithin_90Days']
    
    
    df[cols] = df[cols].apply(le.fit_transform)
    return df    

In [9]:
df = preprocess(df)
df

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Gender,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,ChronicObstructivePulmonaryDisease,...,BetaBlockers,Diuretics,TotalMedicine,CardiacTroponin,Hemoglobin,SerumSodium,SerumCreatinine,BNP,NT-proBNP,ReadmissionWithin_90Days
0,4200412,1.991713e+08,7,1,6,1,0,1,0,1,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,1
1,4055894,2.670434e+07,7,1,6,1,1,1,1,1,...,1,5,8,0.0,0.00,0.0,1.540000,0.0,0.0,0
2,4867407,6.038822e+07,7,1,6,1,1,1,0,1,...,1,1,2,0.0,10.20,0.0,0.000000,0.0,0.0,0
3,4058064,2.746423e+08,11,0,6,1,0,1,0,1,...,0,0,0,0.0,0.00,132.0,0.000000,0.0,0.0,0
4,4150623,7.000000e+13,19,0,6,0,0,1,0,1,...,0,0,0,0.0,7.26,0.0,0.000000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,4.000442e+10,8,0,6,1,1,1,0,1,...,2,3,6,0.0,0.00,0.0,1.076667,0.0,0.0,1
8477,4042227,1.434795e+10,19,1,6,1,1,0,1,1,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,1
8478,4603405,6.711773e+07,10,0,6,0,0,0,0,0,...,1,0,1,0.0,0.00,0.0,0.000000,0.0,0.0,0
8479,4033677,6.856439e+07,7,0,6,1,1,0,1,0,...,1,1,5,0.0,0.00,0.0,1.690000,0.0,0.0,1


In [10]:
print(df.isnull().sum())

PatientId                             0
EncounterId                           0
DischargeDisposision                  0
Gender                                0
Race                                  0
DiabetesMellitus                      0
ChronicKidneyDisease                  0
Anemia                                0
Depression                            0
ChronicObstructivePulmonaryDisease    0
Age                                   0
ChronicDiseaseCount                   0
LengthOfStay                          0
EmergencyVisit                        0
InpatientVisit                        0
OutpatientVisit                       0
TotalVisits                           0
BMIMin                                0
BMIMax                                0
BMIMedian                             0
BMIMean                               0
BPDiastolicMin                        0
BPDiastolicMax                        0
BPDiastolicMedian                     0
BPDiastolicMean                       0


# Model before using PSO  (For comparing)

In [11]:
X = df.drop('ReadmissionWithin_90Days',axis=1)
y = df['ReadmissionWithin_90Days']
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2 , random_state = 60)

In [12]:
rm = RandomForestClassifier(n_estimators = 10, max_depth=25, criterion = "gini", min_samples_split=10)
rm.fit(X_train, y_train)
pred = rm.predict(X_test)
print ("Accuracy : " , accuracy_score(y_test,pred)*100)  
print("Report : \n", classification_report(y_test, pred))
print("F1 Score : ",f1_score(y_test, pred, average='macro')*100)

#For comparison 
OriginalAccuracyRFC = accuracy_score(y_test,pred)*100

Accuracy :  70.18267530936949
Report : 
               precision    recall  f1-score   support

           0       0.74      0.86      0.80      1163
           1       0.54      0.35      0.43       534

    accuracy                           0.70      1697
   macro avg       0.64      0.61      0.61      1697
weighted avg       0.68      0.70      0.68      1697

F1 Score :  61.300409222837146


In [13]:
lg = lgb.LGBMClassifier()
lg.fit(X_train, y_train)
pred=lg.predict(X_test)
lg.score(X_test,y_test)
print ("Accuracy : " , accuracy_score(y_test,pred)*100)  
print("Report : \n", classification_report(y_test, pred))
print("F1 Score : ",f1_score(y_test, pred, average='macro')*100)
#For comparison 
OriginalAccuracyLGB = accuracy_score(y_test,pred)*100

Accuracy :  70.59516794342959
Report : 
               precision    recall  f1-score   support

           0       0.74      0.87      0.80      1163
           1       0.55      0.34      0.42       534

    accuracy                           0.71      1697
   macro avg       0.65      0.61      0.61      1697
weighted avg       0.68      0.71      0.68      1697

F1 Score :  61.290509854474216


# Particle swarm optimization (PSO)


## Function for population

In [14]:
def Population(rows):
    #print("Function: population")
    Allel = [1,0]
    popu = np.random.choice(Allel,size=(rows,len(dataCols)))
    return popu

In [15]:
#Testing the population function 
testpopu=Population(1)
testpopu

array([[1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1]])

## function for column selection

In [16]:
def SelectionOfColumn(data,population):
    population = population.reshape(56)
    population_size = len(population)
    population = population.astype(bool)
    CS_df = data.loc[:,population] #column selected df
    return CS_df

In [17]:
#For testing purposes
SelectionOfColumn(X,testpopu)

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,Age,LengthOfStay,...,BPSystolicMedian,TemperatureMin,TemperatureMean,HeartRateMin,RespiratoryRateMean,ARBs,Diuretics,TotalMedicine,CardiacTroponin,NT-proBNP
0,4200412,1.991713e+08,7,6,1,0,1,0,58,3,...,0.0,0.00,0.000000,0,0.000000,0,0,0,0.0,0.0
1,4055894,2.670434e+07,7,6,1,1,1,1,80,6,...,106.0,97.30,98.266667,57,17.833333,0,5,8,0.0,0.0
2,4867407,6.038822e+07,7,6,1,1,1,0,63,8,...,0.0,0.00,0.000000,0,0.000000,0,1,2,0.0,0.0
3,4058064,2.746423e+08,11,6,1,0,1,0,73,4,...,182.0,97.70,97.700000,0,22.000000,0,0,0,0.0,0.0
4,4150623,7.000000e+13,19,6,0,0,1,0,85,3,...,122.5,96.98,97.854286,0,17.000000,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,4.000442e+10,8,6,1,1,1,0,59,21,...,121.5,96.20,97.822222,58,32.885714,1,3,6,0.0,0.0
8477,4042227,1.434795e+10,19,6,1,1,0,1,86,19,...,0.0,0.00,0.000000,0,0.000000,0,0,0,0.0,0.0
8478,4603405,6.711773e+07,10,6,0,0,0,0,94,12,...,146.5,96.98,98.220000,0,14.904762,0,0,1,0.0,0.0
8479,4033677,6.856439e+07,7,6,1,1,0,1,77,3,...,153.0,97.34,97.970000,0,16.875000,0,1,5,0.0,0.0


In [18]:
def functionOfFitness(selectedcols,y):
    X_train, X_test, y_train, y_test = train_test_split(selectedcols, y , test_size = 0.2 , random_state = 70)
    rm = RandomForestClassifier(n_estimators = 20, max_depth=25, criterion = "entropy", min_samples_split=10)
    rm.fit(X_train, y_train)
    pred=rm.predict(X_test)
    a = accuracy_score(y_test,pred)*100
    return a

In [19]:
def calculationOfVelocity(population_best,global_best,x):
    r1 = np.random.uniform(0, 1)
    r2 = np.random.uniform(0, 1)
    c1 = 2
    c2 = 3
    velocities = []
    for i in range(len(population_best)):   
        val = (c1*r1)*(population_best[i] - x[i]) + (c2*r2)*(global_best - x[i])
        velocities.append(val)
    return velocities

In [20]:
rows = 30 # row size of poluation
#making populaion of rows 
populations = Population(rows)
    
accuracy = []

#calling fitness function
for i in range(0,rows):
    ColSelected = SelectionOfColumn(X,populations[i])    
    accuracy.append(functionOfFitness(ColSelected,y))

popu_best = populations
var = populations
for l in range(30):
#   making population and global best
    
    old_population = populations
    maxAcc = max(accuracy)
    i = np.where(accuracy == maxAcc)
    global_best = popu_best[i[0][0]]

            
    print("Max Accuracy: ",max(accuracy))
   
    #Termination condition        
    if max(accuracy) >= 73.5:
        print("Highest Accuracy is: ",max(accuracy))
        bestfeautres= global_best
        maxacc = max(accuracy)
        break

    
    # Calling Velocities
    Vel = calculationOfVelocity(popu_best,global_best,var)
    
    # updation of posi
    populations = populations + Vel
    for i in range(len(populations)):   
        m = max(populations[i])
        populations[i] =populations[i] / m    
    
    
    for i in range(len(populations)):
        for j in range(56):
            if (populations[i][j] > 0.5).all():
                populations[i][j] = 1
            else:
                populations[i][j] = 0
                
    var = populations
    newpopulations = populations
      
    newAccuracies = []
    for i in range(0,rows):
        ColSelected = SelectionOfColumn(X,newpopulations[i])     
        newAccuracies.append(functionOfFitness(ColSelected,y))

    # comparison     
    for i in range(0,rows): 
        if newAccuracies[i] > accuracy[i]:
            populations[i] = newpopulations[i]
            accuracy[i] = newAccuracies[i]
             

    

Max Accuracy:  72.53977607542723
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  72.95226870948733
Max Accuracy:  73.36476134354743
Max Accuracy:  73.36476134354743
Max Accuracy:  73.36476134354743
Max Accuracy:  73.36476134354743
Max Accuracy:  74.1308190925162
Highest Accuracy is:  74.1308190925162


# Comparing accuracy before and after PSO

In [23]:
print("Comparison:")
print("Result:")
print("=============================================================")
print("Original Accuracy: ",OriginalAccuracyRFC)
print("PSO highest accuracy:",maxacc)
print("=============================================================")


Comparison:
Result:
Original Accuracy:  70.18267530936949
PSO highest accuracy: 74.1308190925162


In [24]:
print("Highest Accuracy features:")
SelectionOfColumn(X,bestfeautres)

Highest Accuracy features:


Unnamed: 0,DischargeDisposision,ChronicKidneyDisease,Anemia,LengthOfStay,InpatientVisit,TotalVisits,BMIMin,BMIMedian,BMIMean,BPDiastolicMin,...,PulseRateMean,RespiratoryRateMax,RespiratoryRateMedian,RespiratoryRateMean,ACEInhibitors,ARBs,Diuretics,Hemoglobin,SerumSodium,SerumCreatinine
0,7,0,1,3,8,45,51.48,51.48,51.48,0,...,0.000000,0,0.0,0.000000,0,0,0,0.00,0.0,0.000000
1,7,1,1,6,6,13,31.74,31.74,31.74,46,...,0.000000,22,17.5,17.833333,2,0,5,0.00,0.0,1.540000
2,7,1,1,8,2,5,0.00,0.00,0.00,0,...,0.000000,0,0.0,0.000000,0,0,1,10.20,0.0,0.000000
3,11,0,1,4,4,12,20.55,20.55,20.55,115,...,0.000000,22,22.0,22.000000,0,0,0,0.00,132.0,0.000000
4,19,0,1,3,8,13,0.00,0.00,0.00,38,...,91.476190,20,17.0,17.000000,0,0,0,7.26,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,8,1,1,21,4,7,43.41,43.41,43.41,44,...,0.000000,107,26.0,32.885714,0,1,3,0.00,0.0,1.076667
8477,19,1,0,19,2,4,0.00,0.00,0.00,0,...,0.000000,0,0.0,0.000000,0,0,0,0.00,0.0,0.000000
8478,10,0,0,12,2,2,0.00,0.00,0.00,60,...,111.863636,22,16.0,14.904762,0,0,0,0.00,0.0,0.000000
8479,7,1,0,3,6,6,0.00,0.00,0.00,69,...,79.666667,18,17.0,16.875000,3,0,1,0.00,0.0,1.690000
