In [2]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
#import pixiedust

# Algorithms

# <center>Neural Network Class</center>

In [3]:
class Layer:
    """
        A fully connected layer.

        The bias is folded into the weight matrix: column 0 of W multiplies a
        row of ones that is stacked on top of the layer input.
    """

    def __init__(self, inputs, neurons, activation, derivative):
        """
            Creates a layer in the Neural Network

            Inputs:
                inputs     : the number of nodes in previous layer
                neurons    : the number of neurons in current layer
                activation : the activation function to be used
                derivative : the derivative function used in backprop
        """
        self.act = activation
        self.div = derivative

        # One row per neuron; the extra column (index 0) holds the bias weight.
        self.W = np.random.randn(neurons, inputs + 1)

    def feedFoward(self, A_prev):
        """
            Computes the output of the current layer and caches A_prev, Z and A
            on the instance.

            Inputs:
                A_prev : outputs from the previous layer, a 2-D array whose
                         second dimension is used as the number of columns of
                         the bias row

            Outputs:
                A      : output from current Layer
        """
        _, n_cols = A_prev.shape
        self.A_prev = A_prev

        # Prepend a row of ones so that column 0 of W acts as the bias.
        with_bias = np.vstack([np.ones((1, n_cols)), A_prev])

        self.Z = self.W @ with_bias
        self.A = self.act(self.Z)
        return self.A

    def backprop(self, dA, alpha):
        """
            Propagates the error of this layer back to its input.

            Inputs:
                dA    : the error at the output of this layer
                alpha : the learning rate (accepted but not used here)

            Outputs:
                dA_prev : the error at the input of this layer
        """
        # The derivative is evaluated on the cached layer input.
        local_slope = self.div(self.A_prev)

        # Drop the bias column before sending the error backwards.
        weights_without_bias = self.W[:, 1:].T
        return (weights_without_bias @ dA) * local_slope

    def set_weights(self, W):
        """Replaces the weight matrix (bias column included) with W."""
        self.W = W
        




class Neural_Network:
    """
        A feed-forward network built from Layer objects and trained with
        full-batch gradient descent.
    """

    def __init__(self, param, func, deriv):
        """
            Initialises NN

            Inputs:
                param : the dimensions of the NN (one entry per layer,
                        input layer included)
                func  : the activation function that will be used
                deriv : derivative of func
        """
        # number of entries in param (input layer included)
        self.s = len(param)

        # activation function and its derivative, shared by every layer
        self.func = func
        self.deriv = deriv

        # per-layer errors and activations cached by train / predict
        self.delta = []
        self.A_list = []

        # one Layer per pair of consecutive sizes in param
        self.Layers = [
            Layer(param[k], param[k + 1], self.func, self.deriv)
            for k in range(len(param) - 1)
        ]

    def predict(self, X):
        """
            Makes prediction off the dataset given.

            X is transposed before being fed to the first layer, and every
            intermediate activation (the transposed input included) is stored
            in self.A_list.
        """
        signal = X.T
        self.A_list = [signal]

        for layer in self.Layers:
            signal = layer.feedFoward(signal)
            self.A_list.append(signal)

        return signal

    def train(self, X, y, alpha, epoch):
        """
            Runs `epoch` full-batch gradient-descent updates.

            Inputs:
                X     : the training data passed to predict
                y     : the targets subtracted from the network output
                alpha : the learning rate
                epoch : the number of passes over the data
        """
        for _ in range(epoch):
            # error at the output layer
            error = self.predict(X) - y
            self.delta = [error]

            # walk from the last layer down to layer 1, collecting the errors
            for idx in reversed(range(1, len(self.Layers))):
                error = self.Layers[idx].backprop(error, alpha)
                self.delta.insert(0, error)

            # weight-update matrices, one per layer
            gradients = self.Deltas()

            for layer, gradient in zip(self.Layers, gradients):
                #TODO: Regularise here
                layer.W = layer.W - (alpha * 1 / len(y)) * gradient

    def Deltas(self):
        """
            Builds one update matrix per stored error by multiplying the error
            with the bias-augmented activation that fed the matching layer.
        """
        updates = []

        for idx, error in enumerate(self.delta):
            layer_input = self.A_list[idx]
            _, n_cols = layer_input.shape
            augmented = np.vstack([np.ones((1, n_cols)), layer_input])
            updates.append(error @ augmented.T)

        return updates

    def setWeights(self, W, index):
        """Overwrites the weight matrix of the layer at position `index`."""
        self.Layers[index].set_weights(W)
          

# <center>Logistic Model</center>

# Logistic Regression

Using the definition of the dot product, the gradient-descent update for logistic regression can be written compactly as:
$$\theta = \theta - \frac{\alpha}{m}(h(\theta X^{T}) - y) \cdot X + \lambda \theta$$

where:

$\alpha$ : the learning rate

$m$ : the number of training examples (rows of $X$)

$\theta$ : the initial learning parameters

$X$ : the data given in the design matrix format

$\lambda$ : the regularisation term

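$h(\theta X^{T})$ : the sigmoid function $h(z) = \frac{1}{1+e^{-z}}$ applied element-wise to $\theta X^{T}$, i.e. the predicted probabilities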

In [4]:
def LogisticRegression(X,y,theta0,alpha,tol,L,num_iter):
    """
        computes theta values by iterating a gradient-descent style update

        One update is always performed; after that, up to num_iter further
        updates are made while the norm of the last step is larger than tol.

        inputs:
            X        : data in the form of the design matrix
            y        : the labels associated with the data
            theta0   : the intial guess on the learning parameters
            alpha    : the learning rate
            tol      : the margin of error
            L        : the regularisation parameter(lambda)
            num_iter : the number of times algorithms must loop

        outputs:
            theta    : the learning parameters given the model
    """

    def _updated(theta):
        # Regularisation vector: a copy of theta with its first entry zeroed,
        # so the first parameter is left out of the L term.
        penalty = np.copy(theta)
        penalty[0] = 0
        predicted = 1/(1+np.exp(-(theta @ X.T)))
        # NOTE(review): the L term is added rather than subtracted - confirm
        # this is the intended sign for the regularisation.
        return theta - alpha*(predicted - y) @ X + L*penalty

    previous = theta0
    current = _updated(previous)
    passes = 1

    while passes <= num_iter and np.linalg.norm(current-previous) > tol:
        passes += 1
        previous = current
        current = _updated(previous)

    return current

In [5]:
def y_subset(y,value):

    """
        builds the one-vs-rest labels for a single class

        inputs:
            y        : the true labels associated with the data
            value    : the label that is treated as the positive class

        outputs:
            y_subset : a list with 1 where y equals value and 0 elsewhere
    """

    return [1 if y[pos]==value else 0 for pos in range(len(y))]

In [6]:
def One_vs_all(X,y,Theta0,alpha,tol,L,num_iter):

    """
        computes the theta matrix by fitting one logistic regression per class

        inputs:
            X        : data in the form of the design matrix
            y        : the labels associated with the data
            Theta0   : the intial guesses, one row per unique label in y
            alpha    : the learning rate
            tol      : the margin of error
            L        : the regularisation parameter(lambda)
            num_iter : the number of times algorithms must loop

        outputs:
            theta    : array with one row of learned parameters per class,
                       ordered like np.unique(y)
    """

    classes = np.unique(y)

    # Row k of Theta0 seeds the classifier for the k-th unique label.
    fitted = [
        LogisticRegression(X,y_subset(y,label),Theta0[row,:],alpha,tol,L,num_iter)
        for row, label in enumerate(classes)
    ]

    return np.array(fitted)

In [7]:
class LogisticModel:
    """
        Logistic-regression classifier trained by an iterative
        gradient-descent style update, either as a single binary model or as
        a one-vs-all set of binary models.
    """

    def __init__(self,multiclass):

        """
            intialises the Logistic model class

            Inputs:
                multiclass: specifies if the model is a one-vs-all multiclass
                            classifier (truthy) or a binary classifier (falsy)
        """

        # learned parameters; None until train has been called
        self.W = None
        self.multiclass = multiclass

    def train(self,X,y,Theta0,alpha,tol,L,num_iter):

        """
        fits the model and stores the learned parameters in self.W

        inputs:
            X        : data in the form of the design matrix
            y        : the labels associated with the data
            Theta0   : the intial guess on the learning parameters; a matrix
                       with one row per class when multiclass, otherwise a
                       single parameter vector
            alpha    : the learning rate
            tol      : the margin of error
            L        : the regularisation parameter(lambda)
            num_iter : the number of times algorithms must loop
        """

        if self.multiclass:
            self.W = self.One_vs_all(X,y,Theta0,alpha,tol,L,num_iter)
        else:
            # The binary branch used to reference the undefined names `Self`
            # and `theta0`, so it always raised NameError.
            self.W = self.LogisticRegression(X,y,Theta0,alpha,tol,L,num_iter)

    def LogisticRegression(self,X,y,theta0,alpha,tol,L,num_iter):
        """
            computes theta values by iterating a gradient-descent style update

            One update is always performed; after that, up to num_iter further
            updates are made while the norm of the last step is larger than tol.

            inputs:
                X        : data in the form of the design matrix
                y        : the labels associated with the data
                theta0   : the intial guess on the learning parameters
                alpha    : the learning rate
                tol      : the margin of error
                L        : the regularisation parameter(lambda)
                num_iter : the number of times algorithms must loop

            outputs:
                theta    : the learning parameters given the model
        """

        def gradient_step(theta):
            # Copy of theta with the first entry zeroed, so the first
            # parameter is left out of the L term.
            V = np.copy(theta)
            V[0] = 0
            predicted = 1/(1+np.exp(-(theta @ X.T)))
            # NOTE(review): the L term is added rather than subtracted -
            # confirm this is the intended sign for the regularisation.
            return theta - alpha*(predicted - y) @ X + L*V

        i = 1
        theta_new = gradient_step(theta0)

        while np.linalg.norm(theta_new-theta0) > tol and i <= num_iter:
            i += 1
            theta0 = theta_new
            theta_new = gradient_step(theta0)

        return theta_new

    def One_vs_all(self,X,y,Theta0,alpha,tol,L,num_iter):

        """
        computes the theta matrix by fitting one logistic regression per class

        inputs:
            X        : data in the form of the design matrix
            y        : the labels associated with the data
            Theta0   : the intial guesses, one row per unique label in y
            alpha    : the learning rate
            tol      : the margin of error
            L        : the regularisation parameter(lambda)
            num_iter : the number of times algorithms must loop

        outputs:
            theta    : array with one row of learned parameters per class,
                       ordered like np.unique(y)
        """

        outcomes = np.unique(y)
        Param = []

        for i, value in enumerate(outcomes):
            y_sub = self.y_subset(y,value)
            temp_param = self.LogisticRegression(X,y_sub,Theta0[i,:],alpha,tol,L,num_iter)
            Param.append(temp_param)

        return np.array(Param)

    def y_subset(self,y,value):

        """
            builds the one-vs-rest labels for a single class

            inputs:
                y        : the true labels associated with the data
                value    : the label that is treated as the positive class

            outputs:
                y_subset : a list with 1 where y equals value and 0 elsewhere
        """

        return [1 if y[i]==value else 0 for i in range(len(y))]

### <center>Confusion Matrix</center>

In [8]:
def confusion_matrix(output,y):
    """
        builds the confusion matrix of a set of predictions

        inputs:
            output : the predicted labels, indexable by position
            y      : the true labels, indexable by position

        outputs:
            accuracy   : the fraction of the len(y) samples on the diagonal,
                         i.e. predicted correctly (nan when y is empty)
            con_matrix : DataFrame whose rows are the predicted labels and
                         whose columns are the true labels
    """

    # Labels seen in y come first, in sorted order. Labels that only occur in
    # the predictions are appended so a prediction outside the true label set
    # gets its own row instead of raising ValueError.
    outcomes = list(np.unique(y))
    unseen = [label for label in np.unique(output) if label not in outcomes]
    outcomes.extend(unseen)

    matrix = np.zeros((len(outcomes),len(outcomes)))

    for i in range(len(y)):
        row = outcomes.index(output[i])
        col = outcomes.index(y[i])
        matrix[row,col] += 1

    # The diagonal holds the correct predictions, so this is accuracy.
    total = len(y)
    accuracy = np.trace(matrix)/total if total else float("nan")

    con_matrix = pd.DataFrame(data = matrix,index= outcomes,columns=outcomes)
    return accuracy,con_matrix

# Cleaning the data

In [91]:
# Load the raw training data from a CSV path relative to the working directory.
Dataset = pd.read_csv("Data/host_train.csv")

# Preview the first 10 rows.
Dataset.head(10)

Unnamed: 0,case_id,Hospital,Hospital_type,Hospital_city,Hospital_region,Available_Extra_Rooms_in_Hospital,Department,Ward_Type,Ward_Facility,Bed_Grade,patientid,City_Code_Patient,Type of Admission,Illness_Severity,Patient_Visitors,Age,Admission_Deposit,Stay_Days
0,1,8,2,3,2,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,2,5,2,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,4,1,0,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,1,2,1,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,1,2,1,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50
5,6,23,0,6,0,2,anesthesia,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,4449.0,11-20
6,7,32,5,9,1,1,radiotherapy,S,B,3.0,31397,7.0,Emergency,Extreme,2,51-60,6167.0,0-10
7,8,23,0,6,0,4,radiotherapy,Q,F,3.0,31397,7.0,Trauma,Extreme,2,51-60,5571.0,41-50
8,9,1,3,10,1,2,gynecology,R,B,4.0,31397,7.0,Trauma,Extreme,2,51-60,7223.0,51-60
9,10,10,4,1,0,2,gynecology,S,E,3.0,31397,7.0,Trauma,Extreme,2,51-60,6056.0,31-40


In [92]:
# Summarise the columns: dtype and non-null count of each.
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
case_id                              318438 non-null int64
Hospital                             318438 non-null int64
Hospital_type                        318438 non-null int64
Hospital_city                        318438 non-null int64
Hospital_region                      318438 non-null int64
Available_Extra_Rooms_in_Hospital    318438 non-null int64
Department                           318438 non-null object
Ward_Type                            318438 non-null object
Ward_Facility                        318438 non-null object
Bed_Grade                            318325 non-null float64
patientid                            318438 non-null int64
City_Code_Patient                    313906 non-null float64
Type of Admission                    318438 non-null object
Illness_Severity                     318438 non-null object
Patient_Visitors                     318438 non-null i

In [93]:
# Drop the case ID and patient ID columns.
# errors='ignore' keeps this cell idempotent: running it again after the
# columns are already gone is a no-op instead of a KeyError.
Dataset = Dataset.drop(columns=['case_id','patientid'],errors='ignore')
Dataset.head(10)

Unnamed: 0,Hospital,Hospital_type,Hospital_city,Hospital_region,Available_Extra_Rooms_in_Hospital,Department,Ward_Type,Ward_Facility,Bed_Grade,City_Code_Patient,Type of Admission,Illness_Severity,Patient_Visitors,Age,Admission_Deposit,Stay_Days
0,8,2,3,2,3,radiotherapy,R,F,2.0,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,5,2,2,radiotherapy,S,F,2.0,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,10,4,1,0,2,anesthesia,S,E,2.0,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,26,1,2,1,2,radiotherapy,R,D,2.0,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,26,1,2,1,2,radiotherapy,S,D,2.0,7.0,Trauma,Extreme,2,51-60,5558.0,41-50
5,23,0,6,0,2,anesthesia,S,F,2.0,7.0,Trauma,Extreme,2,51-60,4449.0,11-20
6,32,5,9,1,1,radiotherapy,S,B,3.0,7.0,Emergency,Extreme,2,51-60,6167.0,0-10
7,23,0,6,0,4,radiotherapy,Q,F,3.0,7.0,Trauma,Extreme,2,51-60,5571.0,41-50
8,1,3,10,1,2,gynecology,R,B,4.0,7.0,Trauma,Extreme,2,51-60,7223.0,51-60
9,10,4,1,0,2,gynecology,S,E,3.0,7.0,Trauma,Extreme,2,51-60,6056.0,31-40


In [94]:
# Departments: bind the working name and count the rows per department.
# NOTE: formatedData is an alias, not a copy - both names refer to the same
# DataFrame object, so column assignments made through formatedData are also
# visible through Dataset.
formatedData = Dataset
Dataset['Department'].value_counts()

gynecology            249486
anesthesia             29649
radiotherapy           28516
TB & Chest disease      9586
surgery                 1201
Name: Department, dtype: int64

In [95]:
"""
    Replacing Department values with integers

    0 : gynecology
    1 : anesthesia
    2 : radiotherapy
    3 : TB & Chest disease
    4 : surgery
"""

formatedData['Department'] = formatedData['Department'].replace(["gynecology","anesthesia","radiotherapy","TB & Chest disease","surgery"],[0,1,2,3,4])

# formatedData = formatedData.replace(to_replace="gynecology",value=0)
# formatedData = formatedData.replace(to_replace="anesthesia",value=1)
# formatedData = formatedData.replace(to_replace="radiotherapy",value=2)
# formatedData = formatedData.replace(to_replace="TB & Chest disease",value=3)
# formatedData = formatedData.replace(to_replace="surgery",value=4)

formatedData.head()

Unnamed: 0,Hospital,Hospital_type,Hospital_city,Hospital_region,Available_Extra_Rooms_in_Hospital,Department,Ward_Type,Ward_Facility,Bed_Grade,City_Code_Patient,Type of Admission,Illness_Severity,Patient_Visitors,Age,Admission_Deposit,Stay_Days
0,8,2,3,2,3,2,R,F,2.0,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,5,2,2,2,S,F,2.0,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,10,4,1,0,2,1,S,E,2.0,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,26,1,2,1,2,2,R,D,2.0,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,26,1,2,1,2,2,S,D,2.0,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [96]:
# Ward type: number of rows per ward-type value.
Dataset['Ward_Type'].value_counts()

R    127947
Q    106165
S     77794
P      5046
T      1477
U         9
Name: Ward_Type, dtype: int64

In [97]:
"""
    replacing Ward Type with integers

    0 : P
    1 : Q
    2 : R
    3 : S
    4 : T
    5 : U
"""
formatedData['Ward_Type'] = formatedData['Ward_Type'].replace(['P','Q','R','S','T','U'],[0,1,2,3,4,5])

# formatedData = formatedData.replace(to_replace="P",value=0)
# formatedData = formatedData.replace(to_replace="Q",value=1)
# formatedData = formatedData.replace(to_replace="R",value=2)
# formatedData = formatedData.replace(to_replace="S",value=3)
# formatedData = formatedData.replace(to_replace="T",value=4)
# formatedData = formatedData.replace(to_replace="U",value=5)

formatedData.head()

Unnamed: 0,Hospital,Hospital_type,Hospital_city,Hospital_region,Available_Extra_Rooms_in_Hospital,Department,Ward_Type,Ward_Facility,Bed_Grade,City_Code_Patient,Type of Admission,Illness_Severity,Patient_Visitors,Age,Admission_Deposit,Stay_Days
0,8,2,3,2,3,2,2,F,2.0,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,5,2,2,2,3,F,2.0,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,10,4,1,0,2,1,3,E,2.0,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,26,1,2,1,2,2,2,D,2.0,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,26,1,2,1,2,2,3,D,2.0,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [98]:
# Ward facility: number of rows per facility value.
formatedData['Ward_Facility'].value_counts()

F    112753
E     55351
D     51809
C     35463
B     35156
A     27906
Name: Ward_Facility, dtype: int64

In [99]:
"""
    replacing Ward Facility values with integers

    0 : A
    1 : B
    2 : C
    3 : D
    4 : E
    5 : F

"""
formatedData['Ward_Facility'] = formatedData['Ward_Facility'].replace(["A","B","C","D","E","F"],[0,1,2,3,4,5])
# formatedData = formatedData.replace(to_replace="A",value=0)
# formatedData = formatedData.replace(to_replace="B",value=1)
# formatedData = formatedData.replace(to_replace="C",value=2)
# formatedData = formatedData.replace(to_replace="D",value=3)
# formatedData = formatedData.replace(to_replace="E",value=4)
# formatedData = formatedData.replace(to_replace="F",value=5)

formatedData.head()

Unnamed: 0,Hospital,Hospital_type,Hospital_city,Hospital_region,Available_Extra_Rooms_in_Hospital,Department,Ward_Type,Ward_Facility,Bed_Grade,City_Code_Patient,Type of Admission,Illness_Severity,Patient_Visitors,Age,Admission_Deposit,Stay_Days
0,8,2,3,2,3,2,2,5,2.0,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,5,2,2,2,3,5,2.0,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,10,4,1,0,2,1,3,4,2.0,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,26,1,2,1,2,2,2,3,2.0,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,26,1,2,1,2,2,3,3,2.0,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [100]:
# Type of admission: number of rows per admission type.
formatedData['Type of Admission'].value_counts()

Trauma       152261
Emergency    117676
Urgent        48501
Name: Type of Admission, dtype: int64

In [101]:
"""
    Replacing Type of Admission values with integers

    0 : Trauma
    1 : Emergency
    2 : Urgent
"""
formatedData['Type of Admission'] = formatedData['Type of Admission'].replace(["Trauma","Emergency","Urgent"],[0,1,2])
# formatedData = formatedData.replace(to_replace="Trauma",value=0)
# formatedData = formatedData.replace(to_replace="Emergency",value=1)
# formatedData = formatedData.replace(to_replace="Urgent",value=2)

formatedData.head()

Unnamed: 0,Hospital,Hospital_type,Hospital_city,Hospital_region,Available_Extra_Rooms_in_Hospital,Department,Ward_Type,Ward_Facility,Bed_Grade,City_Code_Patient,Type of Admission,Illness_Severity,Patient_Visitors,Age,Admission_Deposit,Stay_Days
0,8,2,3,2,3,2,2,5,2.0,7.0,1,Extreme,2,51-60,4911.0,0-10
1,2,2,5,2,2,2,3,5,2.0,7.0,0,Extreme,2,51-60,5954.0,41-50
2,10,4,1,0,2,1,3,4,2.0,7.0,0,Extreme,2,51-60,4745.0,31-40
3,26,1,2,1,2,2,2,3,2.0,7.0,0,Extreme,2,51-60,7272.0,41-50
4,26,1,2,1,2,2,3,3,2.0,7.0,0,Extreme,2,51-60,5558.0,41-50


In [102]:
# Illness severity: number of rows per severity level.
formatedData['Illness_Severity'].value_counts()

Moderate    175843
Minor        85872
Extreme      56723
Name: Illness_Severity, dtype: int64

In [103]:
"""
    Replacing Illness Severity values with integers

    0 : Moderate
    1 : Minor
    2 : Extreme
"""

formatedData['Illness_Severity'] = formatedData['Illness_Severity'].replace(["Moderate","Minor","Extreme"],[0,1,2])

# formatedData = formatedData.replace(to_replace="Moderate",value=0)
# formatedData = formatedData.replace(to_replace="Minor",value=1)
# formatedData = formatedData.replace(to_replace="Extreme",value=2)

formatedData.head()

Unnamed: 0,Hospital,Hospital_type,Hospital_city,Hospital_region,Available_Extra_Rooms_in_Hospital,Department,Ward_Type,Ward_Facility,Bed_Grade,City_Code_Patient,Type of Admission,Illness_Severity,Patient_Visitors,Age,Admission_Deposit,Stay_Days
0,8,2,3,2,3,2,2,5,2.0,7.0,1,2,2,51-60,4911.0,0-10
1,2,2,5,2,2,2,3,5,2.0,7.0,0,2,2,51-60,5954.0,41-50
2,10,4,1,0,2,1,3,4,2.0,7.0,0,2,2,51-60,4745.0,31-40
3,26,1,2,1,2,2,2,3,2.0,7.0,0,2,2,51-60,7272.0,41-50
4,26,1,2,1,2,2,3,3,2.0,7.0,0,2,2,51-60,5558.0,41-50


In [104]:
# Age brackets: number of rows per bracket.
formatedData['Age'].value_counts()

41-50     63749
31-40     63639
51-60     48514
21-30     40843
71-80     35792
61-70     33687
11-20     16768
81-90      7890
0-10       6254
91-100     1302
Name: Age, dtype: int64

In [105]:
"""
    replacing Age brackets with integers

    0 : 0-10
    1 : 11-20
    2 : 21-30
    3 : 31-40
    4 : 41-50
    5 : 51-60
    6 : 61-70
    7 : 71-80
    8 : 81-90
    9 : 91-100
"""

formatedData["Age"] = formatedData["Age"].replace(['0-10','11-20','21-30','31-40','41-50','51-60','61-70','71-80','81-90','91-100'],[0,1,2,3,4,5,6,7,8,9])

formatedData.head()

Unnamed: 0,Hospital,Hospital_type,Hospital_city,Hospital_region,Available_Extra_Rooms_in_Hospital,Department,Ward_Type,Ward_Facility,Bed_Grade,City_Code_Patient,Type of Admission,Illness_Severity,Patient_Visitors,Age,Admission_Deposit,Stay_Days
0,8,2,3,2,3,2,2,5,2.0,7.0,1,2,2,5,4911.0,0-10
1,2,2,5,2,2,2,3,5,2.0,7.0,0,2,2,5,5954.0,41-50
2,10,4,1,0,2,1,3,4,2.0,7.0,0,2,2,5,4745.0,31-40
3,26,1,2,1,2,2,2,3,2.0,7.0,0,2,2,5,7272.0,41-50
4,26,1,2,1,2,2,3,3,2.0,7.0,0,2,2,5,5558.0,41-50


In [106]:
# Stay_Days: number of rows per length-of-stay bracket.
formatedData['Stay_Days'].value_counts()

21-30                 87491
11-20                 78139
31-40                 55159
51-60                 35018
0-10                  23604
41-50                 11743
71-80                 10254
More than 100 Days     6683
81-90                  4838
91-100                 2765
61-70                  2744
Name: Stay_Days, dtype: int64

In [108]:
"""
    replacing each class of stay into integers

    0  : 0-10
    1  : 11-20
    2  : 21-30
    3  : 31-40
    4  : 41-50
    5  : 51-60
    6  : 61-70
    7  : 71-80
    8  : 81-90
    9  : 91-100
    10 : More than 100 Days
"""

formatedData['Stay_Days'] = formatedData['Stay_Days'].replace(['0-10','11-20','21-30','31-40','41-50','51-60','61-70','71-80','81-90','91-100','More than 100 Days'],[0,1,2,3,4,5,6,7,8,9,10])

formatedData.head()

TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'

In [75]:
# Check the dtype and non-null count of each column after encoding.
formatedData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 16 columns):
Hospital                             318438 non-null int64
Hospital_type                        318438 non-null int64
Hospital_city                        318438 non-null int64
Hospital_region                      318438 non-null int64
Available_Extra_Rooms_in_Hospital    318438 non-null int64
Department                           318438 non-null int64
Ward_Type                            318438 non-null int64
Ward_Facility                        318438 non-null int64
Bed_Grade                            318325 non-null float64
City_Code_Patient                    313906 non-null float64
Type of Admission                    318438 non-null int64
Illness_Severity                     318438 non-null int64
Patient_Visitors                     318438 non-null int64
Age                                  318438 non-null object
Admission_Deposit                    318438 non-null float