Maxime Marchand
# Astrophysics and Data Science : Project 5
## Neural Network from scratch, part 1

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

## The dataset

In this part, we load the dataset, comment on its properties and prepare it to be used with the network. Each feature is linearly normalized to the range $[-1, 1]$ using the following formula :

\begin{equation}
\hat{y}_i = \frac{2(y_i - y_{min})}{y_{max} - y_{min}} - 1
\end{equation}

Where $\hat{y}_i$ is the normalized value of the element $y_i$.

In [None]:
# Read the comma-separated data file (one sample per row, class label in the last column)
inFileName = 'Data/wheat.dat'
data = np.loadtxt(inFileName, delimiter=',')

# Shuffle the rows in place so that the later train/validation split is random
np.random.shuffle(data)

# Features and class labels. Both are views on `data`, so the in-place
# normalization below also rescales the feature columns of `data`.
vectors = data[:, :-1]
classes = data[:, -1]

# Rescale every feature column to [-1, 1] at once (see eq. above)
minim = vectors.min(axis=0)
maxim = vectors.max(axis=0)
vectors[:] = 2 * (vectors - minim) / (maxim - minim) - 1

Let us make a corner plot to have a look at the data.

In [None]:
# List of the quantities described in the dataset, in column order
# (see https://machinelearningmastery.com/standard-machine-learning-datasets/)
data_info = ['Area', 'Perimeter', 'Compactness', 'Length of kernel',
             'Width of kernel', 'Asymmetry coefficient', 'Length of kernel groove']

# Boolean arrays to select the different classes (label stored in the last column)
cl1 = data[:, -1] == 1.0
cl2 = data[:, -1] == 2.0
cl3 = data[:, -1] == 3.0

# One row/column of panels per feature
n_features = len(vectors[0])

fig, ax = plt.subplots(nrows=n_features, ncols=n_features, figsize=(20, 15), dpi=300)
fig.subplots_adjust(hspace=.25, wspace=.35)

# Plotting the data: panel (i, j) shows feature i against feature j, for j < i only
for i in range(n_features):
    y = vectors[:, i]
    for j in range(0, i):
        x = vectors[:, j]
        for mask, colour, label in ((cl1, 'b', 'class 1'), (cl2, 'r', 'class 2'), (cl3, 'g', 'class 3')):
            ax[i, j].scatter(x[mask], y[mask], c=colour, alpha=0.3, label=label)

        # The features were normalized to [-1, 1]; keep a small margin around that range
        ax[i, j].set_xlim([-1.1, 1.1])
        ax[i, j].set_ylim([-1.1, 1.1])

# Adding legend (taken from the first populated panel)
ax[1, 0].legend(bbox_to_anchor=(1, 1));

# Adding axis labels on the bottom row and on the first column
for i in range(n_features):
    ax[n_features-1, i].set_xlabel(data_info[i])
    ax[i, 0].set_ylabel(data_info[i])

# Removing the diagonal and the upper triangle, which were left empty
for i in range(n_features):
    for j in range(i, n_features):
        fig.delaxes(ax[i, j])

We now create a variable targets, where we convert the values of the classes variable using the following definition :

\begin{eqnarray}
1 & \longrightarrow & [1, 0, 0] \\
2 & \longrightarrow & [0, 1, 0] \\
3 & \longrightarrow & [0, 0, 1]
\end{eqnarray}

We also split the dataset so that 80% of the data will be used for training, the rest for validation.

In [None]:
# One-hot encoding of the classes: class k (1, 2 or 3) sets column k-1 of its row to 1
targets = np.zeros(shape=(len(classes), 3))

for row, label in enumerate(classes):
    targets[row, int(label) - 1] = 1.0

# Number of samples used for training (80 % of the dataset, rounded down)
training_percent = int(len(vectors) * 0.8)

# Training and validation sets are stored in two dictionaries sharing the same keys.
# The first `training_percent` samples are used for training, the rest for validation.
full_set = {'vectors': vectors, 'classes': classes, 'targets': targets}

training_set = {key: values[:training_percent] for key, values in full_set.items()}

validation_set = {key: values[training_percent:] for key, values in full_set.items()}

Our data is ready. :-)

### Creating the neural network
We now implement a function that creates the neural network. It is implemented as a list of layers.

In [None]:
def generate_neural_network(num_neurons_input, num_hidden_layers, num_neurons_hidden, num_neurons_output):
    """
    Builds an untrained network as a list of layers (one dictionary per layer)

    Every layer, including the input one, holds the following entries :
        'I'  : input vector of the layer, with a trailing 1 used for the bias
        'W'  : weight matrix (one row per neuron, last column is the bias),
               drawn from a normal distribution of mean 0 and standard
               deviation 1/sqrt(len(I)/2)
        'Y'  : neuron activations (zeros)
        'O'  : layer outputs (zeros)
        'dl' : back-propagated error (zeros)
        'DW' : weight gradient, same shape as 'W' (zeros)

    PARAMETERS
        num_neurons_input  : Number of neurons in the input layer
        num_hidden_layers  : Number of hidden layers in the network
        num_neurons_hidden : Number of neurons in the hidden layers (same number for each hidden layer)
        num_neurons_output : Number of neurons in the output layer

    RETURNS
        The network : [input layer, hidden layers ..., output layer]
    """

    def build_layer(num_inputs, num_neurons):
        # Layer with `num_neurons` neurons fed by `num_inputs` values plus one bias entry
        inputs = np.hstack([np.zeros(num_inputs), np.ones(1)])
        weights = np.random.normal(loc=0.0, scale=1/np.sqrt(len(inputs)/2), size=(num_neurons, len(inputs)))
        return {
            'I': inputs,
            'W': weights,
            'Y': np.zeros(shape=num_neurons),
            'O': np.zeros(shape=num_neurons),
            'dl': np.zeros(shape=num_neurons),
            'DW': np.zeros(shape=weights.shape),
        }

    # Number of neurons of each layer, from the input layer to the output layer
    widths = [num_neurons_input] + [num_neurons_hidden] * num_hidden_layers + [num_neurons_output]

    layers = []
    num_inputs = num_neurons_input   # the input layer receives the raw data vector
    for num_neurons in widths:
        layers.append(build_layer(num_inputs, num_neurons))
        num_inputs = num_neurons     # the next layer is fed by the outputs of this one

    return layers

### Forward propagation
We now implement all the functions that are used in order to do the forward propagation.

##### Neuron activation
The neuron activation $Y_l$ for the layer $l$ is given by : $Y_l = W_l \cdot I_l$.

In [None]:
def neuron_activation(network, layer):
    """
    Computes the activation Y = W . I of one layer

    PARAMETERS
        network : The network (list of layers)
        layer   : Index of the layer in the network

    RETURNS
        The matrix product of the layer weights 'W' with the layer input 'I'
    """
    current = network[layer]
    return np.matmul(current['W'], current['I'])

##### Softmax function (provided)
The softmax function is given by :
\begin{equation}
S(Y_{L-1}) = \frac{1}{\sum_{m=0}^{M-1} \exp(Y_{L-1, m})} \cdot \begin{bmatrix} \exp(Y_{L-1, 0}) \\ \vdots \\ \exp(Y_{L-1, M-1})  \end{bmatrix}
\end{equation}

In [None]:
def softmax(x, derivative=False):
    """
    Softmax function

    PARAMETERS
        x          : np.array of activations
        derivative : If True, returns S(x) * (1 - S(x)) element-wise instead of S(x)

    RETURNS
        np.array with the same shape as x
    """
    # Shifting by the maximum does not change the result and avoids overflows in exp
    exp_shifted = np.exp(x - x.max())
    probabilities = exp_shifted / np.sum(exp_shifted, axis=0)

    if not derivative:
        return probabilities

    return probabilities * (1 - probabilities)

In [None]:
# Quick check: two equal inputs give S = [0.5, 0.5], so the derivative branch
# S * (1 - S) is expected to return [0.25, 0.25]
softmax(np.array([1.25, 1.25]), True)

##### Transfert functions

The sigmoid function and its derivative are given by :

\begin{equation}
f(x) = \frac{1}{1+\exp(-x)}
\hspace{3cm}
f'(x) = \frac{\exp(-x)}{(1+\exp(-x))^2}
\end{equation}

The ReLU function and its derivative are given by :

\begin{equation}
f(x) = \left\{ \begin{array}{cc} 0 & x < 0 \\ x & x \geq 0 \end{array}  \right. 
\hspace{3cm}
f'(x) = \left\{ \begin{array}{cc} 0 & x < 0 \\ 1 & x \geq 0 \end{array}  \right. 
\end{equation}

The leaky ReLU function (named `ReLU_dying` in the code) and its derivative are given by :

\begin{equation}
f(x) = \left\{ \begin{array}{rc} 0.01\cdot x & x < 0 \\ x & x \geq 0 \end{array}  \right. 
\hspace{3cm}
f'(x) = \left\{ \begin{array}{rc} 0.01 & x < 0 \\ 1 & x \geq 0 \end{array}  \right. 
\end{equation}

In [None]:
def transfert_sigmoid(network, layer, derivative=False):
    """
    Sigmoid transfer function applied to the activations 'Y' of a layer

    Warning : this function should not be called directly, use `transfert` instead

    PARAMETERS
        network    : The network (list of layers)
        layer      : Index of the layer in the network
        derivative : If True, returns f'(Y) instead of f(Y)

    RETURNS
        np.array with the same shape as network[layer]['Y']
    """
    Y = network[layer]['Y']

    # exp(-|Y|) lies in (0, 1] and can never overflow, unlike exp(-Y) for very
    # negative Y (which made the derivative evaluate to inf/inf = NaN)
    decay = np.exp(-np.abs(Y))
    sigmoid = np.where(Y >= 0, 1 / (1 + decay), decay / (1 + decay))

    if not derivative:
        return sigmoid

    # f'(x) = exp(-x) / (1 + exp(-x))^2 = f(x) * (1 - f(x))
    return sigmoid * (1 - sigmoid)

In [None]:
def transfert_ReLU(network, layer, derivative=False):
    """
    ReLU transfer function applied to the activations 'Y' of a layer

    Warning : this function should not be called directly, use `transfert` instead

    The stored activations network[layer]['Y'] are left untouched : a new array
    is returned. They must stay intact because the derivative is evaluated on
    them after the forward pass; clamping them in place would make every
    derivative equal to 1.

    PARAMETERS
        network    : The network (list of layers)
        layer      : Index of the layer in the network
        derivative : If True, returns f'(Y) instead of f(Y)

    RETURNS
        np.array with the same shape as network[layer]['Y']
    """
    Y = network[layer]['Y']
    lower = Y < 0.0

    if not derivative:
        return np.where(lower, 0.0, Y)

    return np.where(lower, 0.0, 1.0)

In [None]:
def transfert_ReLU_dying(network, layer, derivative=False):
    """
    Leaky ReLU transfer function (slope 0.01 for negative values) applied to
    the activations 'Y' of a layer

    Warning : this function should not be called directly, use `transfert` instead

    The stored activations network[layer]['Y'] are left untouched : a new array
    is returned, so that the derivative can still be evaluated on the true
    activations after the forward pass.

    PARAMETERS
        network    : The network (list of layers)
        layer      : Index of the layer in the network
        derivative : If True, returns f'(Y) instead of f(Y)

    RETURNS
        np.array with the same shape as network[layer]['Y']
    """
    Y = network[layer]['Y']
    lower = Y < 0.0

    if not derivative:
        return np.where(lower, 0.01 * Y, Y)

    return np.where(lower, 0.01, 1.0)

In [None]:
def transfert(network, layer, derivative=False, mode='sig'):
    """
    Applies the selected transfer function (or its derivative) to a layer

    PARAMETERS
        network    : The network (list of layers)
        layer      : Index of the layer in the network
        derivative : If True, the derivative of the transfer function is returned
        mode       : 'sig', 'ReLU' or 'ReLU_dying'

    RETURNS
        The value returned by the selected transfer function

    RAISES
        ValueError if mode is not one of the keywords above
    """
    if mode not in ('sig', 'ReLU', 'ReLU_dying'):
        raise ValueError("{} is not a valid keyword. (choose between 'sig', 'ReLU' or 'ReLU_dying')".format(mode))

    # Keyword -> implementation
    implementations = {
        'sig': transfert_sigmoid,
        'ReLU': transfert_ReLU,
        'ReLU_dying': transfert_ReLU_dying,
    }
    return implementations[mode](network, layer, derivative)

In [None]:
def forward_propagation(network, entry, transfert_mode):
    """
    Propagates one data vector through the network

    For every layer, the entries 'I' (input with a trailing 1 for the bias),
    'Y' (activation) and 'O' (output) are overwritten. The output layer uses
    the softmax function, every other layer uses the selected transfer function.

    PARAMETERS
        network        : The network (list of layers)
        entry          : Data vector fed to the first layer
        transfert_mode : Keyword of the transfer function (see `transfert`)

    RETURNS
        The updated network
    """
    bias = np.ones(1)

    def feed(index, incoming):
        # Stores the input of the layer (with its bias entry) and the resulting activation
        network[index]['I'] = np.hstack([incoming, bias])
        network[index]['Y'] = neuron_activation(network, layer=index)

    # First layer : fed with the data
    feed(0, entry)
    network[0]['O'] = transfert(network, layer=0, derivative=False, mode=transfert_mode)

    # Following layers : fed with the output of the previous layer
    for index in range(1, len(network)-1):
        feed(index, network[index-1]['O'])
        network[index]['O'] = transfert(network, layer=index, derivative=False, mode=transfert_mode)

    # Output layer : softmax instead of the transfer function
    feed(-1, network[-2]['O'])
    network[-1]['O'] = softmax(network[-1]['Y'])

    return network

### Cost function
We now implement the cost function and its gradient.

The cost function $C : \mathbb{R}^n \longrightarrow \mathbb{R}$ is given by the euclidean distance between the output vector $O = (O_0, \dots, O_{n-1})\in\mathbb{R}^n$ and the target vector $t = (t_0, t_1, ..., t_{n-1}) \in \mathbb{R}^n$ :
    
\begin{equation}
    C = \sqrt{\sum_{i=0}^{n-1} (O_i - t_i)^2}
\end{equation}
    
The gradient $\nabla C \in \mathbb{R}^n$ is thus given by :
    
\begin{equation}
    %\nabla C = \begin{bmatrix} 2\cdot(O_0 - t_0) \\ \vdots \\ 2\cdot(O_{n-1} - t_{n-1}) \end{bmatrix}
    \nabla C = \frac{1}{\sqrt{\sum_{i=0}^{n-1} (O_i - t_i)^2}} \cdot \begin{bmatrix} (O_0 - t_0) \\ \vdots \\ (O_{n-1} - t_{n-1}) \end{bmatrix}
\end{equation}

In [None]:
def cost(result, target):
    """
    Euclidean distance between the network output and the target

    PARAMETERS
        result : np.array, output of the network
        target : np.array, expected output

    RETURNS
        The square root of the sum of the squared differences
    """
    difference = result - target
    return np.sqrt(np.square(difference).sum())

In [None]:
def cost_gradient(result, target):
    """
    Gradient of the cost function (euclidean distance) with respect to the output

    PARAMETERS
        result : np.array, output of the network
        target : np.array, expected output

    RETURNS
        (result - target) / ||result - target||, or a vector of zeros when the
        result is exactly equal to the target
    """
    difference = result - target
    distance = np.sqrt(np.sum(difference**2))   # same expression as in `cost`

    # The euclidean distance is not differentiable at 0 : dividing would give
    # 0/0 = NaN and contaminate every weight. The cost is already minimal
    # there, so a zero gradient (no update) is returned instead.
    if distance == 0:
        return np.zeros_like(difference, dtype=float)

    return difference / distance

### Backward propagation
We now implement all the functions that are used in order to do the backward propagation in our network.

##### Back propagated error $\delta_l$
We now want to compute the back-propagated error $\delta_l = \frac{\partial C}{\partial Y_l}$ as a function of $\delta_{l+1}$. Let us first compute the back-propagated error of the last layer, $\delta_{L-1} = \frac{\partial C}{\partial Y_{L-1}}$ :

\begin{equation}
\frac{\partial C}{\partial Y_{L-1}} = 
\begin{bmatrix} 
    \frac{\partial C}{\partial Y_{L-1, 0}} \\ \vdots \\ \frac{\partial C}{\partial Y_{L-1, n-1}} 
\end{bmatrix} = 
\begin{bmatrix} 
    \frac{\partial C}{\partial S_0(Y_{L-1})} \frac{\partial S_0(Y_{L-1})}{\partial Y_{L-1}} \\ \vdots \\
    \frac{\partial C}{\partial S_{n-1}(Y_{L-1})} \frac{\partial S_{n-1}(Y_{L-1})}{\partial Y_{L-1}}
\end{bmatrix}
\end{equation}

Where we used the definition of the transfer function of the output layer, which is the softmax function : $O(Y_{L-1}) = S(Y_{L-1})$. Note that each component only keeps the diagonal term $\frac{\partial S_i}{\partial Y_{L-1, i}}$ of the softmax Jacobian, which is what `softmax(..., derivative=True)` returns. In order to compute the back-propagated error for a layer $l$, it is easier to start from a concrete example. Let us compute $\delta_l$ as a function of $\delta_{l+1}$, between two layers of size $3$ and $2$ as illustrated in the figure below.

<img src='images/05_neuron.png' width=200>

The activation of the layer $l+1$ is given by $Y_{l+1} = W_{l+1} \cdot I_{l+1}$ with :

\begin{equation}
W_{l+1} = 
\begin{bmatrix} 
\omega_{11} & \omega_{12} & \omega_{13} & {b_1} \\ 
\omega_{21} & \omega_{22} & \omega_{23} & {b_2}
\end{bmatrix}
\quad ; \quad
I_{l+1} = 
\begin{bmatrix}
f_1(Y_{l, 1}) \\ 
f_2(Y_{l, 2}) \\ 
f_3(Y_{l, 3}) \\ 
1 
\end{bmatrix}
\end{equation}

We have $\delta_{l}$ given by :

\begin{equation}
\delta_l = \frac{\partial C}{\partial Y_l} = \frac{\partial Y_{l+1}}{\partial Y_l} \frac{\partial C}{\partial Y_{l+1}} = \frac{\partial Y_{l+1}}{\partial Y_l} \cdot \delta_{l+1}
\end{equation}

In this example, $\delta_{l+1} \in \mathbb{R}^{2}$, and thus $\frac{\partial Y_{l+1}}{\partial Y_l} \in Mat_{3, 2}(\mathbb{R})$. Let us now write $Y_{l+1}$ explicitly :

\begin{equation}
Y_{l+1} =
\begin{bmatrix}
Y_{l+1, 1} \\ Y_{l+1, 2}
\end{bmatrix} =
\begin{bmatrix}
\omega_{11} f_1(Y_{l, 1}) + \omega_{12} f_2(Y_{l, 2}) + \omega_{13} f_3(Y_{l, 3}) + {b_1} \\
\omega_{21} f_1(Y_{l, 1}) + \omega_{22} f_2(Y_{l, 2}) + \omega_{23} f_3(Y_{l, 3}) + {b_2} \\
\end{bmatrix}
\end{equation}

The partial derivatives are :

\begin{eqnarray}
\frac{\partial Y_{l+1, 1}}{\partial Y_{l, 1}} = \omega_{11} f_1'(Y_{l, 1}) & \quad ; \quad &
\frac{\partial Y_{l+1, 2}}{\partial Y_{l, 1}} = \omega_{21} f_1'(Y_{l, 1}) \\
\frac{\partial Y_{l+1, 1}}{\partial Y_{l, 2}} = \omega_{12} f_2'(Y_{l, 2}) & \quad ; \quad &
\frac{\partial Y_{l+1, 2}}{\partial Y_{l, 2}} = \omega_{22} f_2'(Y_{l, 2}) \\
\frac{\partial Y_{l+1, 1}}{\partial Y_{l, 3}} = \omega_{13} f_3'(Y_{l, 3}) & \quad ; \quad &
\frac{\partial Y_{l+1, 2}}{\partial Y_{l, 3}} = \omega_{23} f_3'(Y_{l, 3})
\end{eqnarray}

In matrix form, $\dfrac{\partial Y_{l+1}}{\partial Y_l}$ is thus given by :

\begin{equation}
\frac{\partial Y_{l+1}}{\partial Y_l} = 
\begin{bmatrix}
\omega_{11} f_1'(Y_{l, 1}) & \omega_{21} f_1'(Y_{l, 1}) \\
\omega_{12} f_2'(Y_{l, 2}) & \omega_{22} f_2'(Y_{l, 2}) \\
\omega_{13} f_3'(Y_{l, 3}) & \omega_{23} f_3'(Y_{l, 3})
\end{bmatrix}
\end{equation}

As one can observe, it corresponds to the weight matrix $W_{l+1}$ without its last column (the bias values), transposed, and where each row has been multiplied by the derivative of the transfer function of the corresponding neuron in the layer $l$.

In [None]:
def backward_propagation(network, target, transfert_mode):
    """
    Computes the back-propagated error 'dl' of every layer

    Must be called after `forward_propagation`, as it relies on the entries
    'Y' and 'O' stored in the layers.

    PARAMETERS
        network        : The network (list of layers)
        target         : Expected output of the network
        transfert_mode : Keyword of the transfer function (see `transfert`)

    RETURNS
        The updated network
    """
    last = len(network) - 1
    if last < 0:
        return network   # Nothing to do for an empty network

    # Output layer : gradient of the cost times the derivative of the softmax.
    # NOTE(review): softmax(derivative=True) only provides the diagonal terms
    # S_i * (1 - S_i) of the softmax Jacobian - confirm this is intended.
    output_layer = network[-1]
    gradC = cost_gradient(output_layer['O'], target)
    derivS = softmax(output_layer['Y'], derivative=True)
    output_layer['dl'] = gradC * derivS

    # Remaining layers, from the last but one down to the first
    for l in range(last - 1, -1, -1):
        derivatives = transfert(network, layer=l, derivative=True, mode=transfert_mode)

        # Matrix dY_{l+1}/dY_{l} : transposed weights of the next layer without
        # the bias column, each row scaled by the derivative of its neuron
        jacobian = network[l+1]['W'][:, :-1].T.copy()
        jacobian *= derivatives[:, None]

        network[l]['dl'] = jacobian @ network[l+1]['dl']

    return network

##### Weights update

Let us now compute the matrix $\Delta W_l = \frac{\partial C}{\partial W_l}$ as a function of $\delta_l$. Let $m$ and $n$ be respectively the number of neurons in the layers $l-1$ and $l$. Thus, $\Delta W_{l}$ will be a matrix of $n$ rows and $m+1$ columns. The element $(\Delta W_l)_{ij}$ is given by :

\begin{equation}
\Big(\Delta W_l\Big)_{ij} = \frac{\partial C}{\partial W_{l, ij}} = \frac{\partial Y_{l,i}}{\partial W_{l,ij}} \frac{\partial C}{\partial Y_{l,i}} = \frac{\partial Y_{l,i}}{\partial W_{l,ij}} \delta_{l,i}
\end{equation}

Let us write $W_{l, ij} \equiv \omega_{lij}$. Thus :

\begin{equation}
\Big(\Delta W_l\Big)_{ij} = \frac{\partial}{\partial \omega_{lij}} \left( \sum_{k=0}^{m} \omega_{lik} I_{lk} \right)  \delta_{l,i} = \sum_{k=0}^{m} I_{lk} \frac{\partial\omega_{lik}}{\partial\omega_{lij}} \delta_{l, i} = I_{l, j} \cdot \delta_{l, i}
\end{equation}


Where $\frac{\partial\omega_{lik}}{\partial\omega_{lij}}$ is the Kronecker delta $\delta_{kj}$. The matrix $\Delta W_l$ is thus the outer product of $\delta_l$ and $I_l$ :

\begin{equation}
\Delta W_l = \delta_l \cdot I_l^T = 
\begin{bmatrix}
I_{l, 0} \delta_{l, 0}    & I_{l, 1} \delta_{l, 0} & \cdots     & I_{l, m} \delta_{l, 0}   \\
I_{l, 0} \delta_{l, 1}    & I_{l, 1} \delta_{l, 1} & \cdots     & I_{l, m} \delta_{l, 1}   \\
\vdots                    & \vdots                 & \ddots     & \vdots                     \\
I_{l, 0} \delta_{l, n-1}  & \cdots                 & \cdots     & I_{l, m} \delta_{l, n-1}
\end{bmatrix}
\end{equation}

In [None]:
def compute_delta_weights(network):
    """
    Computes the weight gradient 'DW' of every layer

    For each layer, DW[i, j] = dl[i] * I[j], i.e. the outer product of the
    back-propagated error 'dl' with the layer input 'I'.

    PARAMETERS
        network : The network (list of layers)

    RETURNS
        The updated network
    """
    for current in network:
        current['DW'] = np.outer(current['dl'], current['I'])

    return network

### Training functions

In [None]:
def update_network(network, data, target, transfert_mode):
    """
    Runs one forward and one backward pass for a single sample, and computes
    the weight gradients. The weights themselves are not modified here.

    PARAMETERS
        network        : The network (list of layers)
        data           : Data vector fed to the network
        target         : Expected output of the network for this data vector
        transfert_mode : Keyword of the transfer function (see `transfert`)

    RETURNS
        The updated network
    """
    # forward_propagation initializes the input layer with the data itself,
    # so no separate initialization is needed here
    network = forward_propagation(network, data, transfert_mode)
    network = backward_propagation(network, target, transfert_mode)
    network = compute_delta_weights(network)

    return network

In [None]:
def train_network(network, num_epoch, learning_rate, training_set, transfert_mode):
    """
    Trains the network, updating the weights after every sample

    PARAMETERS
        network        : The network (list of layers)
        num_epoch      : Number of passes over the training set
        learning_rate  : Factor applied to the weight gradients 'DW'
        training_set   : Dictionary with the entries 'vectors' and 'targets'
        transfert_mode : Keyword of the transfer function (see `transfert`)

    RETURNS
        network  : The trained network
        vec_cost : np.array with the mean cost of each epoch
    """
    samples = training_set['vectors']
    expected = training_set['targets']
    num_samples = len(samples)

    vec_cost = np.empty(num_epoch)  # Evolution of the cost function, one value per epoch

    for iepoch in tqdm(range(num_epoch)):
        epoch_cost = 0

        for isample, sample in enumerate(samples):
            network = update_network(network, sample, expected[isample], transfert_mode)
            epoch_cost += cost(network[-1]['O'], expected[isample])

            # Gradient descent step on every weight matrix
            for current in network:
                current['W'] = current['W'] - learning_rate * current['DW']

        vec_cost[iepoch] = epoch_cost/num_samples

    return network, vec_cost

Wheat classification seems to work very well with 1 hidden layer of 14 neurons, 100 epochs, a learning rate of 0.05 and the sigmoid transfer function.

In [None]:
# Sizes deduced from the training set
data_dim  = len(training_set['vectors'][0])          # Dimension of the data (7 in the case of wheat database)
class_dim = len(np.unique(training_set['classes']))  # Number of distinct classes found in the training set (3 in the case of wheat database)

# Architecture of the network
# (the input layer created by generate_neural_network also carries weights)
num_neurons_input  = data_dim
num_hidden_layers  = 1
num_neurons_hidden = 14
num_neurons_output = class_dim

# Training parameters
num_epochs     = 100     # Good results with 100
learning_rate  = 0.05    # Good results with 0.05
transfert_mode = 'sig'   # Choose between 'sig', 'ReLU' and 'ReLU_dying'

# Create a randomly initialized network, then train it on the training set.
# vec_cost holds the mean cost of each epoch.
network = generate_neural_network(num_neurons_input, num_hidden_layers, num_neurons_hidden, num_neurons_output)
network, vec_cost = train_network(network, num_epochs, learning_rate, training_set, transfert_mode)

In [None]:
# Evolution of the mean cost over the epochs
fig, ax = plt.subplots()
# NOTE(review): the euclidean distance between a softmax output and a one-hot
# target can reach sqrt(2), so values above 1 would be cut by this range
ax.set_ylim(0, 1)
ax.set_xlabel("Epoch")
ax.set_ylabel("Cost function")
ax.plot(vec_cost);

### Running on validation sample

In [None]:
def class_from_output(output):
    """
    Converts the output vector of the network to a class

    The class is the (1-based) position of the largest element of the output.
    If several elements share the largest value, the first one is selected.

    PARAMETERS
        output : np.array

    RETURNS
        1.0 if the largest element is the first one  (e.g. [1, 0, 0])
        2.0 if the largest element is the second one (e.g. [0, 1, 0])
        3.0 if the largest element is the third one  (e.g. [0, 0, 1])
    """
    # np.argmax always returns a single index, even in case of ties
    return float(np.argmax(output) + 1)

## Confusion matrix and Accuracy
The accuracy $A$ is computed with the confusion matrix $M$ by :

\begin{equation}
A = \frac{\textrm{Tr}(M)}{\sum_{i, j} M_{ij}}
\end{equation}

In [None]:
# We test the network with the validation data set
predicted_values = np.empty(len(validation_set['vectors']))

for i, entry in enumerate(validation_set['vectors']):
    # The transfer function must be the one the network was trained with
    network    = forward_propagation(network, entry, transfert_mode=transfert_mode)
    net_output = network[-1]['O']                        # Predicted value from network
    predicted_values[i] = class_from_output(net_output)  # Storing the predicted class

# Confusion matrix : CM[c-1, p-1] is the number of samples of class c predicted as class p
# (assumes the classes are labelled 1, 2, ..., class_dim)
CM = np.zeros(shape=(class_dim, class_dim))
class_labels = range(1, class_dim + 1)

for c in class_labels:     # c - class
    for p in class_labels: # p - prediction
        sel = (predicted_values == p) & (validation_set['classes'] == c)
        CM[c-1, p-1] = np.count_nonzero(sel)

print("Confusion matrix \n")
print("{:<10} :".format("Predicted") + "".join(" {:>6}".format(p) for p in class_labels))
print("Classes")
for i, row in enumerate(CM):
    print("{:^12}".format(i+1) + "".join(" {:>6}".format(count) for count in row))

# Accuracy : correctly classified samples (diagonal) over all the samples
print("\nAccuracy : {:.2f}%".format(100*np.trace(CM)/np.sum(CM)))