<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-functions" data-toc-modified-id="Load-functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load functions</a></span></li><li><span><a href="#Loading-loaders" data-toc-modified-id="Loading-loaders-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Loading loaders</a></span></li><li><span><a href="#Training-a-first-simple-NN" data-toc-modified-id="Training-a-first-simple-NN-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Training a first simple NN</a></span></li></ul></div>

# CNN Notebook

## Load functions

In [1]:
import models.cnn_model
from models.cnn_preprocessing import main_preprocessing_cnn
from models.cnn_train_test import train, test
from models.cnn_model_statistics import main_cnn_stats_model
from models.loading_model_data import main_loading_model_data

import torch.nn as nn 
import torch.nn.functional as F
import torch
import numpy as np

## Training a first simple NN

In [2]:
class classifier_GD_1(nn.Module):
    # Observed accuracy on test set with V4 and taxonomy level 1 is around 35% with these parameters

    def __init__(self, n_out_features: int):
        super(classifier_GD_1, self).__init__()
        # PARAMETERS
        self.out_channel_1 = out_channel_1
        self.out_channel_2 = out_channel_2
        self.kernel_size_1 = kernel_size_1
        self.max_pool_stride_1 = max_pool_stride_1
        self.max_pool_stride_2 = max_pool_stride_2
        self.ratio_fc_1 = ratio_fc_1
        # COPIED PARAMETERS
        self.kernel_size_max_pool_1 = self.kernel_size_1
        self.kernel_size_2 = self.kernel_size_1
        # SIZE COMPUTATION
        self.L_out_conv_1 = 300 - self.kernel_size_1 + 1
        self.L_out_max_pool_1 = int((self.L_out_conv_1 - self.kernel_size_1) // self.max_pool_stride_1) + 1
        self.L_out_conv_2 = self.L_out_max_pool_1 - self.kernel_size_2 + 1
        self.L_out_max_pool_2 = int((self.L_out_conv_2 - self.kernel_size_2) // self.max_pool_stride_2) + 1
        self.L_out_fc_1 = int(self.out_channel_2 * self.L_out_max_pool_2 * self.ratio_fc_1)
        # Layers
        self.conv1 = nn.Conv1d(in_channels=4, out_channels=self.out_channel_1,
                               kernel_size=self.kernel_size_1, padding=0)
        self.bn1 = nn.BatchNorm1d(self.out_channel_1)
        self.ReLU1 = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=self.out_channel_1, out_channels=self.out_channel_2,
                               kernel_size=self.kernel_size_2, padding=0)
        self.bn2 = nn.BatchNorm1d(self.out_channel_2)
        self.ReLU2 = nn.ReLU()
        self.fc1 = nn.Linear(in_features=self.out_channel_2 * self.L_out_max_pool_2,
                             out_features=self.L_out_fc_1)
        self.ReLU3 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=self.L_out_fc_1,
                             out_features=n_out_features)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.ReLU1(x)
        x = F.max_pool1d(x, kernel_size=self.kernel_size_1, stride=self.max_pool_stride_1)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.ReLU2(x)
        x = F.max_pool1d(x, kernel_size=self.kernel_size_2, stride=self.max_pool_stride_2)
        x = x.view(-1, self.out_channel_2 * self.L_out_max_pool_2)
        x = self.fc1(x)
        x = self.ReLU3(x)
        x = F.dropout(x, p=0.5)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [3]:
train_loader, test_loader, dict_class_to_id, dict_id_to_class = main_preprocessing_cnn(selected_primer='V4', taxonomy_level=1)
X_train, X_test, y_train, y_test = main_loading_model_data(sequence_origin='DairyDB', primers_origin='DairyDB', selected_primer='V4', taxonomy_level=1)
n_out_features = len(dict_class_to_id)

out_channel_1 = 8
out_channel_2 = 8
kernel_size_1 = 4
max_pool_stride_1 = 8
max_pool_stride_2 = 8
ratio_fc_1 = 1 / 2

n_epochs = 10

conv_class = classifier_GD_1(n_out_features)

loss_fn = nn.CrossEntropyLoss()
learning_rate = 1e-3
optimizer_cl = torch.optim.Adam(conv_class.parameters(), lr=learning_rate)

loss_train, acc_train, loss_test, acc_test = train(conv_class, train_loader, test_loader, loss_fn, optimizer_cl, n_epochs=n_epochs)
_, _, y_test_torch, y_pred_torch = test(conv_class, test_loader, loss_fn)

Train - Loss: 0.0409 Acc: 0.3362
Test - Loss: 0.0314 Acc: 0.4516
Train - Loss: 0.0249 Acc: 0.5896
Test - Loss: 0.0199 Acc: 0.7074
Train - Loss: 0.0174 Acc: 0.7364
Test - Loss: 0.0169 Acc: 0.7404
Train - Loss: 0.0153 Acc: 0.7526
Test - Loss: 0.0150 Acc: 0.7590
Train - Loss: 0.0134 Acc: 0.7785
Test - Loss: 0.0132 Acc: 0.7814
Train - Loss: 0.0124 Acc: 0.7916
Test - Loss: 0.0124 Acc: 0.7878
Train - Loss: 0.0117 Acc: 0.8033
Test - Loss: 0.0119 Acc: 0.7941
Train - Loss: 0.0111 Acc: 0.8079
Test - Loss: 0.0117 Acc: 0.8011
Train - Loss: 0.0107 Acc: 0.8152
Test - Loss: 0.0114 Acc: 0.8032
Train - Loss: 0.0106 Acc: 0.8148
Test - Loss: 0.0109 Acc: 0.8197
Test - Loss: 0.0106 Acc: 0.8144


In [4]:
acc_test

array([0.45159574, 0.70744681, 0.74042553, 0.75904255, 0.78138298,
       0.78776596, 0.79414894, 0.80106383, 0.80319149, 0.81968085])

In [5]:
main_cnn_stats_model(y_train, y_test_torch, y_pred_torch, dict_id_to_class, loss_train, loss_test, acc_train, acc_test,
                         make_plot=True,
                         model_name='CNN - Aoki',
                         model_class=conv_class,
                         model_preprocessing='OHE of letters in 4 dimensions (k=1)',
                         sequence_origin='DairyDB',
                         primers_origin='DairyDB',
                         taxonomy_level=1,
                         selected_primer='V4')