# *Import Libraries:*

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split



# *Preprocessing:*

In [2]:
df = pd.read_csv('Dataset\Dataset.csv')
df.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


In [3]:
df.shape

(480, 17)

In [4]:
df['Class'].unique()

array(['M', 'L', 'H'], dtype=object)

In [5]:
# convert 'Class' into binary classification
df['Class'] = df['Class'].replace(['H', 'M'], 'A') # accept
df.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,A
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,A
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,A


In [6]:
df.dtypes

gender                      object
NationalITy                 object
PlaceofBirth                object
StageID                     object
GradeID                     object
SectionID                   object
Topic                       object
Semester                    object
Relation                    object
raisedhands                  int64
VisITedResources             int64
AnnouncementsView            int64
Discussion                   int64
ParentAnsweringSurvey       object
ParentschoolSatisfaction    object
StudentAbsenceDays          object
Class                       object
dtype: object

In [7]:
categorical_attr = df.columns[df.dtypes == 'object']
categorical_attr

Index(['gender', 'NationalITy', 'PlaceofBirth', 'StageID', 'GradeID',
       'SectionID', 'Topic', 'Semester', 'Relation', 'ParentAnsweringSurvey',
       'ParentschoolSatisfaction', 'StudentAbsenceDays', 'Class'],
      dtype='object')

In [8]:
# Converting Categorical values to scaler values
le = LabelEncoder()
df[categorical_attr] = df[categorical_attr].apply(le.fit_transform, axis=0)
df.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,4,4,2,1,0,7,0,0,15,16,2,20,1,1,1,0
1,1,4,4,2,1,0,7,0,0,20,20,3,25,1,1,1,0
2,1,4,4,2,1,0,7,0,0,10,7,0,30,0,0,0,1
3,1,4,4,2,1,0,7,0,0,30,25,5,35,0,0,0,1
4,1,4,4,2,1,0,7,0,0,40,50,12,50,0,0,0,0


In [9]:
# Modify the class column --> 1 means accept
swap = {"Class": {1:0, 0:1}} 
df = df.replace(swap)
df.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,4,4,2,1,0,7,0,0,15,16,2,20,1,1,1,1
1,1,4,4,2,1,0,7,0,0,20,20,3,25,1,1,1,1
2,1,4,4,2,1,0,7,0,0,10,7,0,30,0,0,0,0
3,1,4,4,2,1,0,7,0,0,30,25,5,35,0,0,0,0
4,1,4,4,2,1,0,7,0,0,40,50,12,50,0,0,0,1


In [10]:
# X: Features, y: Classes
X = np.array(df.iloc[:, :-1])
y = np.array(df['Class'])

In [11]:
# Deviding Dataset to training and validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=12)

In [12]:
# scale = StandardScaler()
# X_train = scale.fit_transform(X_train)
# X_val = scale.fit_transform(X_val)

In [13]:
print('Number of dataset: ', len(X))
print('Number of train set: ', len(X_train))
print('Number of validation set: ', len(X_val))

Number of dataset:  480
Number of train set:  384
Number of validation set:  96


In [14]:
y_train = y_train.reshape(y_train.shape[0], -1) #for making it (biz, 1)
y_val =  y_val.reshape(y_val.shape[0], -1)

In [15]:
y_train = y_train.T
y_val = y_val.T
X_train = X_train.T
X_val = X_val.T
print("X_train: ",X_train.shape)
print("y_train: " ,y_train.shape)
print("X_val: " ,X_val.shape)
print("y_val:" ,y_val.shape)

X_train:  (16, 384)
y_train:  (1, 384)
X_val:  (16, 96)
y_val: (1, 96)


# *Implementing Model:*

### 1) Initialization

In [16]:
# initialize parameters randomly
# using Xavier initialization
def initialize_parameters_random(layer_dims):
    np.random.seed(1)
    parameters = {}
    for l in range(1, len(layer_dims)):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l-1])
        parameters['b' + str(l)] = np.random.randn(layer_dims[l], 1) * np.sqrt(2 / layer_dims[l-1])
        
    return parameters

In [17]:
params = initialize_parameters_random([3,5,2])

print("W1: ", params["W1"])
print("b1: ", params["b1"])

W1:  [[ 1.32627244 -0.49949702 -0.43125043]
 [-0.87607521  0.70660237 -1.87919848]
 [ 1.42463284 -0.62152283  0.26049433]
 [-0.20361006  1.19380613 -1.68209785]
 [-0.26325254 -0.31357907  0.92571887]]
b1:  [[-0.89805746]
 [-0.14078704]
 [-0.7167684 ]
 [ 0.03446738]
 [ 0.47586663]]


In [18]:
# initialize parameters with zero
def initialize_parameters_with_zeros(layer_dims):
    parameters = {}
    for l in range(1, len(layer_dims)):
        parameters['W' + str(l)] = np.zeros((layer_dims[l], layer_dims[l-1]))
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        
    return parameters

In [19]:
params = initialize_parameters_with_zeros([3,5,2])

print("W2: ", params["W2"])
print("b2: ", params["b2"])

W2:  [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
b2:  [[0.]
 [0.]]


### 2) Forward Propagation

In [20]:
# calculate Z
def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    return Z, cache

In [21]:
# sigmoid function
def sigmoid(Z):
    A =  1 / (1 + np.exp(-Z))
    activation_cache = Z
    return A, activation_cache

In [22]:
# relu function
def relu(Z):
    A = np.maximum(0, Z)
    activation_cache = Z
    return A, activation_cache

In [23]:
# calculate A
def linear_activation_forward(A_prev, W, b, activation):
    Z, linear_cache = linear_forward(A_prev, W, b)
    
    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        A, activation_cache = relu(Z)

    cache = (linear_cache, activation_cache)
    
    return A, cache

In [24]:
# implement forward propagation
def model_forward(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2 # number of layers
    
    for l in range(1, L): # first L-1 layers with relu function
        A_prev = A 
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation = "relu")
        caches.append(cache)
       
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation = "sigmoid")
    
    caches.append(cache)
   
    return AL, caches

In [25]:
# calculate loss and cost
# cost: cross entropy cost
def compute_cost(AL, Y):
    m = Y.shape[1]
    
    # limit A to prevent divide by zero in log(AL) and log(1-AL)
    AL = np.clip(AL, 0.00000000000001, 0.9999999999999)

    loss = np.multiply(np.log(AL), Y) + np.multiply((1 - Y), np.log(1 - AL))
    cost = -np.sum(loss) / m
    
    return cost

In [51]:
# calculate accuracy
def compute_accuracy(AL, Y, threshold):
    prediction = np.zeros((1, AL.shape[1]))
    for i in range(AL.shape[1]):
        prediction[0,i] = 1 if AL[0, i] > threshold else 0
    
    m = Y.shape[1]
    accuracy = (np.sum(np.absolute(prediction - Y)) / m) * 100
    
    return accuracy

### 3) Backward Propagation

In [27]:
# derivative of relu function
def relu_backward(cache_activation):
    Z = cache_activation
    dZ = np.greater(Z, 0.).astype(np.float32)
    return dZ

In [28]:
# derivative of sigmoid function
def sigmoid_backward(cache_activation):
    Z = cache_activation
    s, _ = sigmoid(Z)
    dZ = s * (1 - s)
    return dZ

In [29]:
# calculate dA, dW, db
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 1, keepdims = True) / m
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

In [62]:
# calculate dZ
def linear_activation_backward(dA, cache, activation):
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        dZ = relu_backward(activation_cache)
        
    elif activation == "sigmoid":
        dZ = sigmoid_backward(activation_cache)
    
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
       
    return dA_prev, dW, db

In [58]:
# implement backward propagation
def model_backward(AL, Y, caches):
    grads = {}
    L = len(caches) # the number of layersm = AL.shape[1]
    Y = Y.reshape(AL.shape) # Y is the same shape as AL
    
    # Initializing the backpropagation
    AL = np.clip(AL, 0.00000000000001, 0.9999999999999)
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    
    current_cache = caches[-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, 'sigmoid')
    
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l + 1)], grads["db" + str(l + 1)] = linear_activation_backward(grads["dA" + str(l + 1)], current_cache, 'relu')
        
    return grads

In [59]:
# Update Parameters
def update_parameters(params, grads, learning_rate):
    parameters = params.copy()
    L = len(parameters) // 2 # number of layers

    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
        
    return parameters

### 4) Model Definition

In [60]:
def model(X, Y, layers_dims, learning_rate = 0.1, num_iterations = 1000, initializer = "Random", threshold = 0.5, print_cost = False):
    np.random.seed(1)
    costs = []
    
    # initialize parameters
    if initializer == "Zero":
        parameters = initialize_parameters_with_zeros(layers_dims)
    elif initializer == "Random":
        parameters = initialize_parameters_random(layers_dims)
    
    # gradient descent
    for i in range(0, num_iterations):

        # Forward propagation
        AL, caches = model_forward(X, parameters)
        
        # Compute cost
        cost = compute_cost(AL, Y)
        
        # Backward propagation
        grads = model_backward(AL, Y, caches)
 
        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)
          
        # Comupe accuracy
        accuracy = compute_accuracy(AL, Y, threshold)
        
        # Print the cost every (num_iterations / 10) iterations
        if print_cost and i % (num_iterations / 10) == 0 or i == num_iterations - 1:
            print("Iteration {}) \t\t Loss: {} \t\t Accuracy: {}".format(i, cost, accuracy))
            
    return parameters

# *Training and Evaluating the models:*

### *2 Layer Model (No hidden layer)*

LINEAR ~> SIGMOID

In [61]:
layers_dims = [X_train.shape[0], 1]
parameters = model(X_train, y_train, layers_dims, print_cost = True)

Iteration 0) 		 Loss: 9.228926881913276 		 Accuracy: 56.25
Iteration 100) 		 Loss: 22.808499265782274 		 Accuracy: 72.65625
Iteration 200) 		 Loss: 22.914865352228656 		 Accuracy: 72.39583333333334
Iteration 300) 		 Loss: 22.966122341203132 		 Accuracy: 72.39583333333334
Iteration 400) 		 Loss: 22.995392234200693 		 Accuracy: 72.39583333333334
Iteration 500) 		 Loss: 23.012101601175374 		 Accuracy: 72.39583333333334
Iteration 600) 		 Loss: 23.024043803908437 		 Accuracy: 72.39583333333334
Iteration 700) 		 Loss: 23.033719997470282 		 Accuracy: 72.39583333333334
Iteration 800) 		 Loss: 23.04157824919456 		 Accuracy: 72.39583333333334
Iteration 900) 		 Loss: 23.047529030756937 		 Accuracy: 72.39583333333334
Iteration 999) 		 Loss: 23.05267637734867 		 Accuracy: 72.39583333333334


### *3 Layer Model (One hidden layer)*

LINEAR ~> RELU ~> LINEAR ~> SIGMOID

### *5 Layer Model (Three hidden layer)*

LINEAR ~> RELU ~> LINEAR ~> RELU ~> LINEAR ~> RELU ~> LINEAR ~> SIGMOID

In [35]:
print(parameters)

{'W1': array([[-0.01646702, -0.11274307, -0.11261294, -0.03398006, -0.07472652,
        -0.0122396 , -0.13460496, -0.01295575, -0.01009116, -1.15598967,
        -1.34752605, -0.93072921, -1.05851138, -0.01393229, -0.01510418,
        -0.01458333]]), 'b1': array([[-0.0254514]])}


In [50]:
a = np.array([[0.55, 0.9, 0.2]])

for i in range(3):
    a = 1 if a[0,i] > 0.5 else 0

print(a)

TypeError: 'int' object is not subscriptable