# GRIDSEARCH CNN

On définit un réseau CNN dont quelques hyperparamètres sont réglables : le nombre de canaux des couches de convolution, le nombre de neurones de la couche cachée et le taux de dropout.

In [1]:
import os
import sys
sys.path.append('../src')
import time

from balance import load_balanced
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score, precision_score

import torch.nn as nn
import torch
from torch.utils.data import TensorDataset, DataLoader
from torchsummary import summary
from torch import optim
from sklearn.model_selection import GridSearchCV
from skorch import NeuralNetClassifier
import skorch


Bibliothèque utilisée pour sauvegarder le résultat du GridSearch.

In [2]:
import joblib

## Chargement du dataset

In [3]:
# Seed PyTorch's global RNG so that the shuffle below is reproducible
# from one "Restart & Run All" to the next.
SEED = 42
torch.manual_seed(SEED)

# Training set (file name suggests a SMOTE-balanced, perturbed version of MIT-BIH — confirm).
# A channel axis is inserted at position 1 so the signals can be fed to Conv1d.
X1, y1 = load_balanced.load('../data/processed/mitbih_train_smote_perturb_50000.csv')
X1 = np.expand_dims(X1, axis=1)

# Held-out test set, same layout as the training data.
X_test, y_test = load_balanced.load('../data/raw/mitbih_test.csv')
X_test = np.expand_dims(X_test, axis=1)

# Features as float32, labels as int64 (class indices).
# The labels are passed directly: wrapping an int64 tensor in the legacy
# ``torch.Tensor(...)`` constructor is redundant and fails on older PyTorch releases.
train_set = TensorDataset(torch.from_numpy(X1).float(), torch.from_numpy(y1).long())

# A single batch containing the whole dataset: the loader is only used to shuffle the samples.
dataloader = DataLoader(train_set, batch_size=X1.shape[0], shuffle=True)

# Shuffling is mandatory, otherwise the fit does not work.
X, y = next(iter(dataloader))

In [4]:

# Define the 1-D convolutional classifier whose hyper-parameters are tuned by the grid search
class myCNN(nn.Module):
    """1-D CNN classifier: three Conv1d -> ReLU -> BatchNorm1d -> MaxPool1d stages
    followed by a hidden linear layer with dropout and an output linear layer.

    Parameters
    ----------
    num_classes : int
        Size of the output layer (one score per class).
    n_channels_conv : int, default 64
        Number of output channels of each of the three convolutions.
    n_linear_hidden : int, default 32
        Number of units of the hidden linear layer.
    dropout : float, default 0.3
        Dropout probability applied after the hidden linear layer.
    input_length : int, default 187
        Number of time steps of the input signals. It is only used to size the
        first linear layer; the default reproduces the original ``6 * n_channels_conv``.

    Raises
    ------
    ValueError
        If ``input_length`` is too short to survive the three conv/pool stages.
    """

    def __init__(self, num_classes, n_channels_conv=64, n_linear_hidden=32, dropout=0.3,
                 input_length=187):
        super(myCNN, self).__init__()

        # The shape comments below hold for the default input_length=187.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=n_channels_conv, kernel_size=11, stride=1, padding=0)
        self.batch1 = nn.BatchNorm1d(num_features=n_channels_conv)
        self.MaxPool1 = nn.MaxPool1d(5, stride=5)  # shape: (batch_size, n, 35)

        self.conv2 = nn.Conv1d(in_channels=n_channels_conv, out_channels=n_channels_conv, kernel_size=7, stride=1, padding=0)  # shape: (batch_size, n, 29)
        self.batch2 = nn.BatchNorm1d(num_features=n_channels_conv)
        self.MaxPool2 = nn.MaxPool1d(2, stride=2)  # shape: (batch_size, n, 14)

        self.conv3 = nn.Conv1d(in_channels=n_channels_conv, out_channels=n_channels_conv, kernel_size=3, stride=1, padding=0)  # shape: (batch_size, n, 12)
        self.batch3 = nn.BatchNorm1d(num_features=n_channels_conv)
        self.MaxPool3 = nn.MaxPool1d(2, stride=2)  # shape: (batch_size, n, 6)

        # Derive the number of time steps left after the three stages from the layers
        # themselves, so the linear layer stays in sync with the kernel sizes above.
        n_steps = input_length
        for conv, pool in ((self.conv1, self.MaxPool1),
                           (self.conv2, self.MaxPool2),
                           (self.conv3, self.MaxPool3)):
            # Unpadded stride-1 convolution removes kernel_size - 1 steps; the pooling
            # (kernel == stride, no padding) keeps floor(length / kernel) steps.
            n_steps = (n_steps - conv.kernel_size[0] + 1) // pool.kernel_size
        if n_steps < 1:
            raise ValueError(
                "input_length=%r is too short for the three conv/pool stages" % (input_length,)
            )

        self.Lin1 = nn.Linear(n_steps * n_channels_conv, n_linear_hidden)
        self.dropout = nn.Dropout(dropout)
        self.Lin2 = nn.Linear(n_linear_hidden, num_classes)

    def forward(self, x):
        """Return the raw class scores (no softmax is applied) for a batch ``x``.

        ``x`` must have a single channel on axis 1, as required by ``conv1``.
        """
        # Functional ReLU / flatten: same maths as nn.ReLU() / nn.Flatten() but without
        # building a new module object on every forward pass.
        x = torch.relu(self.conv1(x))
        x = self.batch1(x)
        x = self.MaxPool1(x)

        x = torch.relu(self.conv2(x))
        x = self.batch2(x)
        x = self.MaxPool2(x)

        x = torch.relu(self.conv3(x))
        x = self.batch3(x)
        x = self.MaxPool3(x)

        # Keep the batch axis, merge (channels, time) into one feature axis.
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.Lin1(x))
        x = self.dropout(x)
        x = self.Lin2(x)

        return x



## NeuralNetClassifier pour utiliser le gridsearchcv de sklearn

In [5]:
# Wrap the PyTorch module in a skorch classifier so it exposes the scikit-learn
# estimator API; the ``module__`` / ``optimizer__`` prefixes route a value to the
# constructor of the module / optimizer.
net = NeuralNetClassifier(
    module=myCNN,
    module__num_classes=5,
    criterion=nn.CrossEntropyLoss,
    optimizer=optim.Adam,
    optimizer__lr=0.001,
    max_epochs=20,
    batch_size=50,
)

In [6]:
# Hyper-parameter grid explored by the search: 2 x 4 x 3 = 24 combinations.
params = dict(
    module__dropout=[0.2, 0.4],
    module__n_channels_conv=[32, 48, 64, 80],
    module__n_linear_hidden=[20, 30, 50],
)

In [7]:
# Exhaustive search over ``params`` with 3-fold cross-validation, each candidate
# being scored with the 'average_precision' metric.
gs = GridSearchCV(
    estimator=net,
    param_grid=params,
    scoring='average_precision',
    cv=3,
    verbose=1,
)

# ``fit`` returns the fitted search object itself, so ``grid_result`` and ``gs``
# refer to the same object.
grid_result = gs.fit(X, y)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.2398[0m       [32m0.9393[0m        [35m0.1819[0m  29.2199
      2        [36m0.1183[0m       [32m0.9671[0m        [35m0.0933[0m  23.9640
      3        [36m0.0928[0m       [32m0.9724[0m        [35m0.0815[0m  23.1193
      4        [36m0.0791[0m       [32m0.9778[0m        [35m0.0666[0m  23.1605
      5        [36m0.0696[0m       0.9733        0.0802  23.2004
      6        [36m0.0648[0m       [32m0.9791[0m        [35m0.0593[0m  23.2344
      7        [36m0.0585[0m       [32m0.9823[0m        [35m0.0532[0m  23.0594
      8        [36m0.0547[0m       0.9794        0.0607  23.0777
      9        [36m0.0513[0m       [32m0.9832[0m        [35m0.0506[0m  22.9469
     10        [36m0.0492[0m       0.9792        0.0640  23.1008
     11        [36m0.0457

In [8]:

# Persist the whole fitted GridSearchCV object to disk so the search does not
# have to be re-run.
# NOTE(review): the file name reads 'gridsearh' (missing 'c'); it is left unchanged
# because other code may load the file under this exact name — confirm before renaming.
joblib.dump(gs, 'gridsearh.pkl')

['gridsearh.pkl']

In [11]:
# Hyper-parameter combination that obtained the best mean cross-validated score.
gs.best_params_

{'module__dropout': 0.2,
 'module__n_channels_conv': 80,
 'module__n_linear_hidden': 50}

In [12]:
# Mean cross-validated score (metric given by ``scoring``) of the best combination.
gs.best_score_

0.9993186208418029

In [13]:
# Estimator refitted by the search with the best hyper-parameters.
gs.best_estimator_

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=myCNN(
    (conv1): Conv1d(1, 80, kernel_size=(11,), stride=(1,))
    (batch1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (MaxPool1): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
    (conv2): Conv1d(80, 80, kernel_size=(7,), stride=(1,))
    (batch2): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (MaxPool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv3): Conv1d(80, 80, kernel_size=(3,), stride=(1,))
    (batch3): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (MaxPool3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (Lin1): Linear(in_features=480, out_features=50, bias=True)
    (Lin2): Linear(in_features=50, out_features=5, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  ),
)

In [14]:
# Short alias for the best model, used for all the predictions below.
bm=  gs.best_estimator_


In [15]:
# Sanity check of the test-set layout: (n_samples, 1 channel, n_time_steps).
X_test.shape

(21892, 1, 187)

In [16]:
# Predicted class index for every test signal; the array is converted to a
# float32 tensor before being passed to the model.
y_pred = bm.predict(torch.from_numpy(X_test).float())

In [17]:
# Quick look at the predicted class indices.
y_pred

array([0, 0, 0, ..., 4, 4, 4], dtype=int64)

In [18]:
# Confusion matrix on the test set: true classes as rows, predicted classes as columns.
table = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
)
print(table)

# Per-class precision, recall and F1-score, plus their averages.
print(classification_report(y_true=y_test, y_pred=y_pred))

Predicted      0    1     2    3     4
True                                  
0.0        17819  151    40   89    19
1.0           77  474     2    2     1
2.0           40    8  1367   28     5
3.0           11    0     5  146     0
4.0           11    0     3    0  1594
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99     18118
         1.0       0.75      0.85      0.80       556
         2.0       0.96      0.94      0.95      1448
         3.0       0.55      0.90      0.68       162
         4.0       0.98      0.99      0.99      1608

    accuracy                           0.98     21892
   macro avg       0.85      0.93      0.88     21892
weighted avg       0.98      0.98      0.98     21892



In [21]:
# Mean cross-validated score of each of the 24 candidates, expressed in percent.
gs.cv_results_['mean_test_score']*100

array([99.84960545, 99.88809003, 99.89691713, 99.9133527 , 99.91528812,
       99.9253428 , 99.89935452, 99.90532495, 99.92051921, 99.86752374,
       99.92758545, 99.93186208, 99.84911886, 99.86183133, 99.88819839,
       99.87065757, 99.87207107, 99.90977058, 99.91299036, 99.90442414,
       99.92273839, 99.92063937, 99.91705817, 99.9275783 ])

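Tous les modèles donnent quasiment les mêmes performances (scores moyens compris entre 99,85 % et 99,93 %), car les données de validation croisée sont très proches des données d'entraînement. Cela s'explique probablement par le sur-échantillonnage (SMOTE + perturbations) appliqué avant le découpage en plis : des exemples synthétiques quasi identiques se retrouvent à la fois dans les plis d'entraînement et de validation.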

In [36]:
# Summary table of the grid search: one row per candidate, with its three
# hyper-parameters and the mean / standard deviation of its cross-validated
# score, both expressed in percent.
pd.DataFrame({'dropout': gs.cv_results_['param_module__dropout'],
              'conv':gs.cv_results_['param_module__n_channels_conv'],
              'n_linear_hidden': gs.cv_results_['param_module__n_linear_hidden'],
              'score': gs.cv_results_['mean_test_score']*100,
              'std':gs.cv_results_['std_test_score']*100})

Unnamed: 0,dropout,conv,n_linear_hidden,score,std
0,0.2,32,20,99.849605,0.017459
1,0.2,32,30,99.88809,0.010195
2,0.2,32,50,99.896917,0.016818
3,0.2,48,20,99.913353,0.018354
4,0.2,48,30,99.915288,0.004132
5,0.2,48,50,99.925343,0.011966
6,0.2,64,20,99.899355,0.008706
7,0.2,64,30,99.905325,0.012947
8,0.2,64,50,99.920519,0.017414
9,0.2,80,20,99.867524,0.059339


## Performances sur la seconde base de données (PTBDB)

In [37]:
# Load the abnormal recordings of the second database and insert the channel
# axis at position 1, as done for the training data.
X_anormal,y_anormal = load_balanced.load('../data/raw/ptbdb_abnormal.csv')
X_anormal= np.expand_dims(X_anormal,axis=1)

In [38]:
# Class index predicted by the best model for each abnormal recording.
y_anormal_pred = bm.predict(torch.from_numpy(X_anormal).float())

In [40]:
# Collapse the predictions to two labels, in place: class 0 stays 0 and every
# other class becomes 1 (presumably 0 = normal beat, 1 = any anomaly — confirm).
y_anormal_pred[y_anormal_pred>0]=1

In [44]:
# Number of abnormal recordings assigned to each of the two labels.
pd.Series(y_anormal_pred).value_counts()

0    9506
1    1000
Name: count, dtype: int64

In [43]:
# Same evaluation on the normal recordings of the second database.
X_normal, y_normal = load_balanced.load('../data/raw/ptbdb_normal.csv')
X_normal = np.expand_dims(X_normal, axis=1)  # insert the channel axis at position 1

# Predict the class index, then collapse to two labels: 0 stays 0, any other class becomes 1.
y_normal_pred = bm.predict(torch.from_numpy(X_normal).float())
y_normal_pred = np.where(y_normal_pred > 0, 1, y_normal_pred)

# Number of normal recordings assigned to each of the two labels.
pd.Series(y_normal_pred).value_counts()

0    3719
1     327
Name: count, dtype: int64