In [31]:
import pandas as pd

In [32]:
df = pd.read_csv('Admission.csv')

In [33]:
df.drop(columns=['ses', 'Gender_Male', 'Race'], inplace=True)

In [34]:
df.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [35]:
df['rank'].unique()

array([3, 1, 4, 2])

## One-Hot Encoding

In [36]:
from sklearn.preprocessing import OneHotEncoder

In [37]:
ohe = OneHotEncoder()
one_hot_encoder = ohe.fit_transform(df[['rank']])

In [38]:
encoded_df = pd.DataFrame(one_hot_encoder.toarray(),
                          columns=ohe.get_feature_names_out())
dfe = pd.concat([df, encoded_df], axis=1)
dfe.drop(columns=['rank'], inplace=True)

In [39]:
dfe.head()

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,0.0,0.0,1.0,0.0
1,1,660,3.67,0.0,0.0,1.0,0.0
2,1,800,4.0,1.0,0.0,0.0,0.0
3,1,640,3.19,0.0,0.0,0.0,1.0
4,0,520,2.93,0.0,0.0,0.0,1.0


## Normalizando os Dados

In [40]:
from sklearn.preprocessing import StandardScaler

In [41]:
scaler = StandardScaler()

In [42]:
dfe[['gre', 'gpa']] = scaler.fit_transform(dfe[['gre', 'gpa']])

In [43]:
dfe

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,-1.800263,0.579072,0.0,0.0,1.0,0.0
1,1,0.626668,0.736929,0.0,0.0,1.0,0.0
2,1,1.840134,1.605143,1.0,0.0,0.0,0.0
3,1,0.453316,-0.525927,0.0,0.0,0.0,1.0
4,0,-0.586797,-1.209974,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
395,0,0.279964,1.605143,0.0,1.0,0.0,0.0
396,0,-0.240093,-0.920570,0.0,0.0,1.0,0.0
397,0,-1.106854,-1.999259,0.0,1.0,0.0,0.0
398,0,0.973373,0.684310,0.0,1.0,0.0,0.0


## Gradient Descent

- Using Mean Squared Error (MSE)

$\frac{1}{2m}\sum_{u}(y^u - ŷ^u)^2$

- **Weighted Sum (z)**:For a single neuron, the first step is calculating a weighted sum of inputs, $z = \sum(w_i * x_i)$
- **Activation ($f(z)$)**: The weighted sum is then passed through as activation function, such as the sigmoid function, to produce the neuron's output, $ŷ = f(z)$
- **Loss Function (L)**: Finally, a loss function is used to compare the neuron's output, ŷ, to the true target value, y, resulting in an error, L(y, ŷ). A common choice is the mean squared error (MSE), $L = \frac{1}{2} (y - ŷ)^2$

**The chain rule of backpropagation**
- To update the weights, we must find the derivative of the loss function with respect to each weight ($\frac{\partial L}{\partial w_i}$) to perform gradient descent. The chain rule is used to break this complex calculation into smaller parts

$\frac{\partial L}{\partial w_i} = \frac{\partial L}{\partial ŷ} \cdot \frac{\partial ŷ}{\partial z} \cdot \frac{\partial z}{\partial w_i}$

- Combining the terms:

$\frac{\partial L}{\partial w_i} = (ŷ - y) \cdot f'(z) \cdot x_i$

Algorithm for updating the weights with gradient descent:

- Set the weight step to zero $\Delta w_i = 0$
- For each record in the training data:
    - Make a forward pass through the network, calculating the output $ŷ = f(\sum_{i}w_{i}x_{i})$
    - Calculate the error term for the output unit, $\delta = (y - ŷ) * f' (\sum_{i}w_{i}x_{i})$
    - Update the weight step $\Delta w_i = \Delta w_i + \delta x_i$
- Update the weights $w_i = w_i + \eta w_i/m$ where $\eta$ is the learning rate and $m$ is the number of records. Here we're averaging the weight steps to help reduce any large variation in the training data
- Repeat for $e$ epochs

- **Sigmoid**: $f(h) = \frac{1}{1 + e^{-h}}$
- Gradient of sigmoid: $f'(h) = f(h)(1-f(h))$
- Where $h$ is the input to the output unit: $h = \sum_{i} w_i x_i$

In [44]:
import numpy as np

- We'll initialize the weights from a normal distribution centered at 0. A good value for the scale is $\frac{1}{\sqrt{n}}$ where $n$ is the number of input units. This keeps the input to the sigmoid low for increasing numbers of input units

In [45]:
targets = dfe['admit']

In [46]:
features = dfe.drop(columns=['admit'])

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
features_train, features_test, targets_train, targets_test = train_test_split(
    features, targets, test_size=0.2, random_state=163)

In [49]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [50]:
def update_weights(weights, features, targets, learning_rate):
    del_w = np.zeros(weights.shape)

    # Loop through all records, x is the input, y is the target
    for x, y in zip(features.values, targets):
        # Calculate the output of f(h) by passing h (the dot product
        # of x and weights) into the activation function (sigmoid).
        output = sigmoid(np.dot(weights, x))

        # Calculate the error by subtracting the network output
        # from the target (y)
        error = y - output

        # Calculate the error term by multiplying the error by the gradient.
        # Recall that the gradient of the sigmoid f(h) is f(h) * (1 - f(h))
        ## The error term is a gradient that quantifies the contribution of a
        ## specific neuron to the overall network error. It's the partial
        ## derivative of the neuron's loss function to the input of a given
        ## neuron's activation function
        error_term = error * output * (1 - output)

        # Update the weight step by multiplying the error term by the input (x)
        # and adding to the current weight step.
        del_w += error_term * x

    n_records = features.shape[0]
    # Update the weights by adding the learning rate times the change in
    # weights divided by the number of records.
    weights += learning_rate * del_w / n_records

    return weights

In [51]:
def gradient_descent(features, targets, epochs=1000, learning_rate=0.5):
    np.random.seed(42)

    # Initialize loss and weights
    last_loss = None
    n_features = features.shape[1]
    weights = np.random.normal(scale=1/np.sqrt(n_features), size=n_features)

    # Repeatedly update the weights based on the number of epochs
    for e in range(epochs):
        weights = update_weights(weights, features, targets, learning_rate)

        # Printing out the MSE on the training set every 10 epochs.
        if e % (epochs / 10) == 0:
            out = sigmoid(np.dot(features, weights))
            loss = np.mean((out - targets) ** 2)
            if last_loss and last_loss < loss:
                print("Train loss: ", loss, "  WARNING - Loss Increasing")
            else:
                print("Train loss: ", loss)
            last_loss = loss

    return weights

In [52]:
weights = gradient_descent(features_train, targets_train)
test_out = sigmoid(np.dot(features_test, weights))
predictions = test_out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))

Train loss:  0.26565187897249887
Train loss:  0.20490879644045248
Train loss:  0.1962097123388084
Train loss:  0.19390028872667187
Train loss:  0.1929998165315287
Train loss:  0.19257469383929685
Train loss:  0.1923521145827145
Train loss:  0.19222782241854391
Train loss:  0.19215520304243788
Train loss:  0.1921112979874254
Prediction accuracy: 0.675


## Multilayer Perceptron

In [53]:
# Number of records and input units
n_records, n_inputs = features_train.shape

# Number of hidden units
n_hidden = 2

weights_input_to_hidden = np.random.normal(0, np.sqrt(n_inputs),
                                           size=(n_inputs, n_hidden))

$h_j = \sum_i w_{ij}x_i$

- Example:

[$x_1$ $x_2$ $x_3$] $\cdot \begin{pmatrix} w_{11} & w_{12} \\ w_{21} & w_{22} \\ w_{31} & w_{32} \end{pmatrix}$

In [54]:
hidden_units = np.dot(features_train, weights_input_to_hidden)

In [55]:
def forward_pass(x, weights_input_to_hidden, weights_hidden_to_output):
    """
    Make a forward pass through the network
    """
    # Calculate the input to the hidden layer.
    hidden_layer_in = np.dot(x, weights_input_to_hidden)

    # Calculate the hidden layer output.
    hidden_layer_out = sigmoid(hidden_layer_in)

    print('Hidden-layer Output:')
    print(hidden_layer_out)

    # Calculate the input to the output layer.
    output_layer_in = np.dot(hidden_layer_out, weights_hidden_to_output)

    # Calculate the output of the network.
    output_layer_out = sigmoid(output_layer_in)

    print('Output-layer Output:')
    print(output_layer_out)

    return hidden_layer_out, output_layer_out

In [56]:
# Network size
N_input = 4
N_hidden = 3
N_output = 2

# Make some fake data
np.random.seed(42)
x = np.random.randn(N_input)
weights_input_to_hidden = np.random.normal(0, scale=0.1, size=(N_input, N_hidden))
weights_hidden_to_output = np.random.normal(0, scale=0.1, size=(N_hidden, N_output))

# Run forward_pass with fake data
hidden_layer_out, output_layer_out = forward_pass(x, weights_input_to_hidden, weights_hidden_to_output)

Hidden-layer Output:
[0.41492192 0.42604313 0.5002434 ]
Output-layer Output:
[0.49815196 0.48539772]


## Backpropagation

In [57]:
import numpy as np

def sigmoid(x):
    """
    Calculate sigmoid
    """
    return 1/(1+np.exp(-x))

def forward_pass(x, weights_input_to_hidden, weights_hidden_to_output):
    """
    Make a forward pass through the network
    """
    # Calculate the input to the hidden layer.
    hidden_layer_in = np.dot(x, weights_input_to_hidden)
    # Calculate the hidden layer output.
    hidden_layer_out = sigmoid(hidden_layer_in)

    # Calculate the input to the output layer.
    output_layer_in = np.dot(hidden_layer_out, weights_hidden_to_output)
    # Calculate the output of the network.
    output_layer_out = sigmoid(output_layer_in)

    return hidden_layer_out, output_layer_out

def backward_pass(x, target, learnrate, hidden_layer_out, \
                  output_layer_out, weights_hidden_to_output):
    """
    Make a backward pass through the network
    """
    # TODO: Calculate output error
    error = target - output_layer_out

    # Calculate error term for output layer
    output_error_term = error * output_layer_out * (1 - output_layer_out)

    # Calculate error term for hidden layer
    hidden_error_term = weights_hidden_to_output * output_error_term * \
    hidden_layer_out * (1 - hidden_layer_out)

    # Calculate change in weights for hidden layer to output layer
    delta_w_h_o = learnrate * output_error_term * hidden_layer_out

    # Calculate change in weights for input layer to hidden layer
    delta_w_i_h = learnrate * hidden_error_term * x[:, None]

    return delta_w_h_o, delta_w_i_h

# Create data to run through the network
x = np.array([0.5, 0.1, -0.2])
target = 0.6
learnrate = 0.5
weights_input_to_hidden = np.array([
    [0.5, -0.6],
    [0.1, -0.2],
    [0.1, 0.7]
])
weights_hidden_to_output = np.array([0.1, -0.3])

# Forward pass
hidden_layer_out, output_layer_out = forward_pass(
    x, weights_input_to_hidden, weights_hidden_to_output
)

# Backward pass
delta_w_h_o, delta_w_i_h = backward_pass(
    x, target, learnrate, hidden_layer_out, output_layer_out, \
    weights_hidden_to_output
)

print('Change in weights for hidden layer to output layer:')
print(delta_w_h_o)
print('Change in weights for input layer to hidden layer:')
print(delta_w_i_h)

Change in weights for hidden layer to output layer:
[0.00804047 0.00555918]
Change in weights for input layer to hidden layer:
[[ 1.77005547e-04 -5.11178506e-04]
 [ 3.54011093e-05 -1.02235701e-04]
 [-7.08022187e-05  2.04471402e-04]]


## Backpropagation on the csv data

**General Algorithm for updating the weights with backpropagation**

- Set the weights steps for each layer to zero
    - The input to hidden weights:
$\Delta w_{ij} = 0$
    - The hidden to output weights:
$\Delta W_{j} = 0$

- For each record in the training data:
    - Make a forward pass through the network, calculating the output ŷ
    - Calculate the error gradient in the output unit,
$\delta^o = (y - ŷ)f'(z)$ where $z = \sum_j W_j a_j$, the input to the output unit
    - Propagate the errors to the hidden layer
$\delta^h_j = \delta^o W_J f1(h_j)$
    - Update the weights steps, where
$\eta$ is the learning rate:
    - $\Delta W_j = \eta \delta^o a_j$
    - $\Delta w_{ij} = \eta \delta^h_j a_j$
- Update the weights, where $m$ is the number of records:
    - $W_j = W_j + \Delta W_j / m$
    - $w_{ij} = w_{ij} + \Delta w_{ij} / m$
- Repeat for $e$ epochs

In [58]:
import numpy as np

def sigmoid(x):
    """
    Calculate sigmoid
    """
    return 1 / (1 + np.exp(-x))

def forward_pass(x, weights_input_to_hidden, weights_hidden_to_output):
    """
    Make a forward pass through the network
    """
    # Calculate the input to the hidden layer.
    hidden_layer_in = np.dot(x, weights_input_to_hidden)
    # Calculate the hidden layer output.
    hidden_layer_out = sigmoid(hidden_layer_in)

    # Calculate the input to the output layer.
    output_layer_in = np.dot(hidden_layer_out, weights_hidden_to_output)
    # Calculate the output of the network.
    output_layer_out = sigmoid(output_layer_in)

    return hidden_layer_out, output_layer_out


def backward_pass(x, target, learnrate, hidden_layer_out,
                  output_layer_out, weights_hidden_to_output):
    """
    Make a backward pass through the network
    """
    # Calculate output error
    error = target - output_layer_out

    # Calculate error term for output layer
    output_error_term = error * output_layer_out * (1 - output_layer_out)

    # Calculate error term for hidden layer
    hidden_error_term = np.dot(output_error_term, weights_hidden_to_output) * \
                    hidden_layer_out * (1 - hidden_layer_out)

    # Calculate change in weights for hidden layer to output layer
    delta_w_h_o = learnrate * output_error_term * hidden_layer_out

    # Calculate change in weights for input layer to hidden layer
    delta_w_i_h = learnrate * hidden_error_term * x[:, None]

    return delta_w_h_o, delta_w_i_h

def update_weights(weights_input_to_hidden, weights_hidden_to_output,
                   features, targets, learnrate):
    """
    Complete a single epoch of gradient descent and return updated weights
    """
    delta_w_i_h = np.zeros(weights_input_to_hidden.shape)
    delta_w_h_o = np.zeros(weights_hidden_to_output.shape)

    # Loop through all records, x is the input, y is the target
    for x, y in zip(features.values, targets):
        ## Forward pass ##

        # Calculate the output using the forward_pass function.
        hidden_layer_out, output_layer_out = forward_pass(
            x, weights_input_to_hidden, weights_hidden_to_output
        )

        ## Backward pass ##

        # Calculate the change in weights using the backward_pass
        # function.
        delta_w_h_o, delta_w_i_h = backward_pass(x, y, learnrate,
            hidden_layer_out, output_layer_out, weights_hidden_to_output
        )

    n_records = features.shape[0]
    # Update weights  (don't forget division by n_records or number
    # of samples). Pay attention to the order of variables returned by
    # backward_pass
    weights_input_to_hidden += delta_w_i_h / n_records
    weights_hidden_to_output += delta_w_h_o / n_records

    return weights_input_to_hidden, weights_hidden_to_output

def gradient_descent(features, targets, epochs=2000, learnrate=0.9):
    """
    Perform the complete gradient descent process on a given dataset
    """
    # Use to same seed to make debugging easier
    np.random.seed(11)

    # Initialize loss and weights
    last_loss = None
    n_features = features.shape[1]
    n_hidden = 2
    weights_input_hidden = np.random.normal(scale=1 / n_features ** .5,
                                        size=(n_features, n_hidden))
    weights_hidden_output = np.random.normal(scale=1 / n_features ** .5,
                                         size=n_hidden)

    # Repeatedly update the weights based on the number of epochs
    for e in range(epochs):
        weights_input_hidden, weights_hidden_output = update_weights(
            weights_input_hidden, weights_hidden_output, features,
            targets, learnrate
        )

        # Printing out the MSE on the training set every 10 epochs.
        if e % (epochs / 10) == 0:
            hidden_output = sigmoid(np.dot(features, weights_input_hidden))
            out = sigmoid(np.dot(hidden_output,
                                 weights_hidden_output))
            loss = np.mean((out - targets) ** 2)
            if last_loss and last_loss < loss:
                print("Train loss: ", loss, "  WARNING - Loss Increasing")
            else:
                print("Train loss: ", loss)
            last_loss = loss

    return weights_input_hidden, weights_hidden_output

def calculate_accuracy(features, targets, weights_input_hidden,
                       weights_hidden_output):
    """
    Given features, targets, and weights for both hidden and output
    layers, calculate the accuracy of predictions
    """
    hidden_out = sigmoid(np.dot(features, weights_input_hidden))
    output_out = sigmoid(np.dot(hidden_out, weights_hidden_output))
    predictions = output_out > 0.5
    accuracy = np.mean(predictions == targets)
    return accuracy

In [59]:
# Calculate accuracy on test data
weights_input_hidden, weights_hidden_output = gradient_descent(
    features, targets)
accuracy = calculate_accuracy(features_test, targets_test,
                              weights_input_hidden, weights_hidden_output)
print("Prediction accuracy: {:.3f}".format(accuracy))

Train loss:  0.27657563216621484
Train loss:  0.2744848089432056
Train loss:  0.2724533230248846
Train loss:  0.27047939337474697
Train loss:  0.2685613549484122
Train loss:  0.26669765507826015
Train loss:  0.2648868492881651
Train loss:  0.26312759674181774
Train loss:  0.2614186554785868
Train loss:  0.2597588775505114
Prediction accuracy: 0.338
