<h1><b>Hiperparametrização com o algoritmo BayesSearchCV</b></h1>
<p><b>Classificadores:</b>
<ol>
  <li>Extra Trees (ET)</li>
  <li>Logistic regression (LR)</li>
  <li>Multilayer Perceptron (MLP)</li>
  <li>Multinomial Naive Bayes (MNB)</li>
  <li>Passive Aggressive (PA)</li>
  <li>Stochastic Gradient Descent (SGD)</li>
  <li>Support Vector Machine (SVM)</li>
</ol>
<br><b>Base de dados:</b> PROMISE_exp.
<br><b>Natureza do problema:</b> Hiperparametrização com validação cruzada, utilizando 11 subclasses de Requisitos Não-Funcionais (RNF) disponíveis na base de dados:</p>
<table style="text-align:center;" align=left>
    <tr>
        <th>Tipo de Requisito:</th>
        <th>Classe:</th>
        <th>Quantidade:</th>
    </tr>
    <tr>
        <td>Disponibilidade</td>
        <td>A</td>
        <td>31</td>
    </tr>
    <tr>
        <td>Tolerância à Falha</td>
        <td>FT</td>
        <td>18</td>
    </tr>
    <tr>
        <td>Legal</td>
        <td>L</td>
        <td>15</td>
    </tr>
    <tr>
        <td>Aparência</td>
        <td>LF</td>
        <td>49</td>
    </tr>
    <tr>
        <td>Manutenibilidade</td>
        <td>MN</td>
        <td>24</td>
    </tr>
    <tr>
        <td>Operacional</td>
        <td>O</td>
        <td>77</td>
    </tr>
    <tr>
        <td>Performance</td>
        <td>PE</td>
        <td>67</td>
    </tr>
    <tr>
        <td>Portabilidade</td>
        <td>PO</td>
        <td>12</td>
    </tr>
    <tr>
        <td>Escalabilidade</td>
        <td>SC</td>
        <td>22</td>
    </tr>
    <tr>
        <td>Segurança</td>
        <td>SE</td>
        <td>125</td>
    </tr>
    <tr>
        <td>Usabilidade</td>
        <td>US</td>
        <td>85</td>
    </tr>
    <tr>
        <td>Total:</td>
        <td>11</td>
        <td>525</td>
    </tr>
</table>

<h4>Bibliotecas:</h4>

In [1]:
import csv
import warnings
import joblib
import numpy as np
import pandas as pd
from skopt import BayesSearchCV
from sklearn.svm import SVC as SVM
from scipy.sparse.csr import csr_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from skopt.space import Real, Categorical, Integer
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.ensemble import ExtraTreesClassifier as ET
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import PassiveAggressiveClassifier as PA

<h3>Parâmetros e classificadores</h3>

In [2]:
# Registry filled by the cells below: estimator instance -> its search space.
Clf_Prt = dict()
# Registry filled by the cells below: estimator class name -> short label
# used when naming and tagging the result files.
NomeClf = dict()

<h4>Extra Trees (ET)</h4>

In [3]:
# Search space handed to BayesSearchCV for the Extra Trees classifier.
# Plain lists are fixed sets of candidate values; Integer/Real/Categorical
# are scikit-optimize dimensions.
# NOTE(review): for scikit-learn forest classifiers 'auto' and 'sqrt' are
# documented as the same setting, and `max_samples` is documented as applying
# only when bootstrap=True (bootstrap is not searched here) -- confirm that
# both dimensions are intended.
parametros = {
    'n_estimators': Integer(150, 1100),
    'criterion': Categorical(['gini', 'entropy']),
    'max_depth': Integer(20, 120),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': [1],
    'max_features': Categorical(['auto', 'sqrt', 'log2']),
    'max_leaf_nodes': Integer(50, 150),
    'warm_start': [True, False],
    'max_samples': Real(0.01, 0.9),
}
# Register a default-constructed estimator together with its space, and the
# short label under the estimator's class name.
Clf_Prt[ET()] = parametros
NomeClf[ET.__name__] = 'ET'

<h4>Logistic regression (LR)</h4>

In [4]:
# Search space handed to BayesSearchCV for logistic regression.
# NOTE(review): the penalty is fixed to 'none'; scikit-learn documents C and
# l1_ratio as ignored in that case, and intercept_scaling as used only by the
# 'liblinear' solver (not among the solvers listed) -- confirm that these
# dimensions are meant to be searched.
parametros = {
    'penalty': Categorical(['none']),
    'tol': Real(1e-5, 1e-3),
    'C': Real(1e-1, 1.2),
    'fit_intercept': [True, False],
    'intercept_scaling': Real(1e-3, 1e3),
    'solver': Categorical(['newton-cg', 'lbfgs', 'sag']),
    'max_iter': Integer(20, 1000),
    'multi_class': Categorical(['auto', 'ovr']),
    'warm_start': [True, False],
    'n_jobs': [-1],
    'l1_ratio': Real(0, 1),
}
# Register a default-constructed estimator together with its space, and the
# short label under the estimator's class name.
Clf_Prt[LR()] = parametros
NomeClf[LR.__name__] = 'LR'

<h4>Multilayer Perceptron (MLP)</h4>

In [5]:
# Search space handed to BayesSearchCV for the multilayer perceptron.
# The solver is pinned to 'adam' and early stopping is always on; the
# validation fraction and patience are each picked from two fixed values.
# NOTE(review): hidden_layer_sizes is sampled as a single integer --
# presumably one hidden layer of that width; confirm.
parametros = {
    'hidden_layer_sizes': Integer(20, 250),
    'activation': Categorical(['tanh', 'relu']),
    'solver': Categorical(['adam']),
    'batch_size': Integer(32, 480),
    'learning_rate_init': Real(1e-3, 0.1),
    'validation_fraction': [0.1, 0.2],
    'n_iter_no_change': [5, 10],
    'early_stopping': [True],
    'max_iter': Integer(20, 500),
    'tol': Real(1e-5, 1e-3),
    'warm_start': [True, False],
}
# Register a default-constructed estimator together with its space, and the
# short label under the estimator's class name.
Clf_Prt[MLP()] = parametros
NomeClf[MLP.__name__] = 'MLP'

<h4>Multinomial Naive Bayes (MNB)</h4>

In [6]:
# Search space handed to BayesSearchCV for multinomial naive Bayes:
# a real-valued `alpha` and both settings of `fit_prior`.
parametros = {
    'alpha': Real(1e-3, 1e3),
    'fit_prior': [True, False],
}
# Register a default-constructed estimator together with its space, and the
# short label under the estimator's class name.
Clf_Prt[MNB()] = parametros
NomeClf[MNB.__name__] = 'MNB'

<h4>Passive Aggressive (PA)</h4>

In [7]:
# Search space handed to BayesSearchCV for the passive-aggressive classifier.
# Early stopping is always on; the validation fraction and patience are each
# picked from two fixed values.
parametros = {
    'tol': Real(1e-5, 1e-3),
    'C': Real(1e-1, 1.2),
    'fit_intercept': [True, False],
    'max_iter': Integer(20, 1000),
    'early_stopping': [True],
    'validation_fraction': [0.1, 0.2],
    'n_iter_no_change': [5, 10],
    'loss': ['hinge', 'squared_hinge'],
    'warm_start': [True, False],
    'n_jobs': [-1],
}
# Register a default-constructed estimator together with its space, and the
# short label under the estimator's class name.
Clf_Prt[PA()] = parametros
NomeClf[PA.__name__] = 'PA'

<h4>Stochastic Gradient Descent (SGD)</h4>

In [8]:
# Search space handed to BayesSearchCV for the SGD linear classifier.
# Early stopping is always on; the validation fraction and patience are each
# picked from two fixed values.
parametros = {
    'loss': Categorical(['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']),
    'penalty': Categorical(['l1', 'l2', 'elasticnet']),
    'alpha': Real(1e-4, 1e-2),
    'l1_ratio': Real(0, 1),
    'fit_intercept': [True, False],
    'max_iter': Integer(20, 1000),
    'tol': Real(1e-5, 1e-3),
    'shuffle': [True, False],
    'epsilon': Real(1e-2, 1),
    'n_jobs': [-1],
    'learning_rate': Categorical(['optimal', 'invscaling', 'adaptive']),
    'eta0': Real(1e-2, 1e1),
    'power_t': Real(0, 0.1),
    'early_stopping': [True],
    'validation_fraction': [0.1, 0.2],
    'n_iter_no_change': [5, 10],
    'warm_start': [True, False],
    'average': [True, False],
}
# Register a default-constructed estimator together with its space, and the
# short label under the estimator's class name.
Clf_Prt[SGD()] = parametros
NomeClf[SGD.__name__] = 'SGD'

<h4>Support Vector Machine (SVM)</h4>

In [9]:
# Search space handed to BayesSearchCV for the support vector classifier.
# `cache_size` is pinned to a single value; every other entry is searched.
parametros = {
    'C': Real(1e-1, 1e1),
    'kernel': Categorical(['linear', 'rbf']),
    'gamma': Categorical(['scale', 'auto']),
    'shrinking': [True, False],
    'probability': [True, False],
    'tol': Real(1e-5, 1e-3),
    'cache_size': [500],
    'decision_function_shape': Categorical(['ovo', 'ovr']),
}
# Register a default-constructed estimator together with its space, and the
# short label under the estimator's class name.
Clf_Prt[SVM()] = parametros
NomeClf[SVM.__name__] = 'SVM'

<h4>Time Monitoring</h4>

In [10]:
class timeTool():
    """Stopwatch that records start/end timestamps and the elapsed time.

    Call ``init()`` to start and ``end()`` to stop; the getters then return
    the start timestamp, the end timestamp and the elapsed time formatted as
    ``'<h>h:<m>m:<s>s'``.
    """

    # Imported in the class body, so the module is reached through
    # ``self.time`` inside the methods.
    import time

    # Placeholder values reported until init()/end() have been called.
    iniTime = 0.0
    finalTime = 0.0
    totalTime = '00h00m00s'
    endDataTime = 'Date Hour year'
    initDataTime = 'Date Hour year'

    def init(self):
        """Store the start instant, both numeric and human-readable."""
        clock = self.time
        self.iniTime = clock.time()
        self.initDataTime = clock.ctime()

    def end(self):
        """Store the end instant and compute the formatted elapsed time."""
        clock = self.time
        self.finalTime = clock.time()
        self.endDataTime = clock.ctime()

        elapsed = self.finalTime - self.iniTime

        # Peel off whole hours first; what is left is below one hour.
        hours = 0
        remainder = elapsed
        if elapsed >= 3600:
            hours = int(elapsed / 3600)
            remainder = elapsed % 3600

        # Split the remainder into minutes and seconds; below one minute the
        # value is simply truncated to whole seconds.
        if remainder >= 60:
            minutes = int(remainder / 60)
            seconds = int(remainder % 60)
        else:
            minutes = 0
            seconds = int(remainder)

        self.totalTime = '{0}h:{1}m:{2}s'.format(hours, minutes, seconds)

    def getExecuTime(self):
        """Return the elapsed time string computed by the last ``end()``."""
        return self.totalTime

    def getInDateTime(self):
        """Return the timestamp string captured by ``init()``."""
        return self.initDataTime

    def getEnDataTime(self):
        """Return the timestamp string captured by ``end()``."""
        return self.endDataTime

<h3>Hiperparametrização:</h3>

In [11]:
# Labels of the training-set variants to tune on. Each label is used as the
# file-name prefix of the dataset to load and as the sub-directory of the
# saved models and result files.
resamplings = [
    'origin',
    'tomek',
    'adasyn',
    'smote',
    'bdsmote',
    'smotetomek',
]

In [12]:
def load_sparse_csr(filename):
    """Load a CSR sparse matrix from a NumPy ``.npz`` archive.

    Parameters
    ----------
    filename : str or path-like
        Archive containing the arrays ``data``, ``indices``, ``indptr`` and
        ``shape`` that describe the matrix.

    Returns
    -------
    csr_matrix
        The matrix rebuilt from the stored arrays.
    """
    # np.load on an .npz returns an archive object that keeps the file open;
    # the context manager closes it once the arrays have been read, so
    # repeated calls do not accumulate open file handles.
    with np.load(filename) as loader:
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'],
        )


def get_data(resampling, n_intr, n_data):
    """Load one training split: its sparse feature matrix and its label table.

    Parameters
    ----------
    resampling : str
        File-name prefix of the dataset variant to load.
    n_intr : int
        Number selecting the ``data_<n_intr>`` directory.
    n_data : int
        Number placed between the parentheses of the file name.

    Returns
    -------
    tuple
        ``(train_tfidf, train_class)``: the matrix read from the ``.npz`` file
        and the DataFrame read from the ``.csv`` file of the same name.
    """
    # Both files share the same location and base name; only the extension
    # differs.
    stem = '../../results/datasets/data_{0}/train/{1}_train({2})'.format(
        n_intr, resampling, n_data)

    train_tfidf = load_sparse_csr(stem + '.npz')
    train_class = pd.read_csv(stem + '.csv')

    return train_tfidf, train_class

In [None]:
# Bayesian hyperparameter search: for every selected dataset split,
# resampling variant and registered classifier, run BayesSearchCV, save the
# best estimator and append a one-line summary to that classifier's CSV file.
import os  # used only for the output-file bookkeeping in this cell

iterations = 10  # NOTE(review): not used by the loops below
folds = 6
n_splits = 6     # number of ShuffleSplit resamples per candidate
n_iter = 150     # number of parameter settings BayesSearchCV evaluates

# Installed once for the whole run instead of once per dataset.
warnings.filterwarnings('ignore')

# NOTE(review): as written only i == 1 (files under data_2) and j in 2..5
# (splits 3..6) are processed -- confirm this partial range is intended.
for i in range(1, 2):
    for j in range(2, folds):
        print("\nInteration "+str(i+1), "Dobra "+str(j+1))
        for resample in resamplings:

            X_Tokens, train_class = get_data(resample, i+1, j+1)
            y_Class = np.array(train_class['Class'])

            # No random_state is given, so the resamples differ between runs.
            cv = ShuffleSplit(n_splits=n_splits, test_size=0.2)

            for classificador in Clf_Prt:
                modelName = classificador.__class__.__name__
                timeT = timeTool()
                timeT.init()
                print('\nStart of the ' + modelName + ' algorithm with ' + resample + ' at ' + timeT.getInDateTime())
                modelo = BayesSearchCV(estimator=classificador, search_spaces=Clf_Prt[classificador],
                                       n_iter=n_iter, scoring='f1_macro', cv=cv, refit=True, return_train_score=False, n_jobs=3, n_points=3, pre_dispatch=3)

                modelo.fit(X_Tokens, y_Class)
                timeT.end()

                # Save the refitted best estimator. The target directory is
                # created first so a finished search is not lost to a missing
                # folder.
                clf = modelo.best_estimator_
                filename = '../../results/hyperparametrization/models/data_'+str(i+1)+'/'+resample+'/'+clf.__class__.__name__+'('+str(j+1)+').joblib.pkl'
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                _ = joblib.dump(clf, filename, compress=9)

                # Summary row for this search.
                # NOTE(review): `dt` is not used below -- presumably kept for
                # interactive inspection of the full CV table.
                dt = pd.DataFrame(modelo.cv_results_)
                linhas = {'Algorithm': NomeClf[modelName],
                          'DataSample': resample+'_'+str(j+1),
                          'n_inter': modelo.n_iter, 'n_div': modelo.n_splits_, 'Initial Date/Hour': timeT.getInDateTime(),
                          'Final Date/Hour': timeT.getEnDataTime(), 'Execution time': timeT.getExecuTime(),
                          'f1-score Macro': '{:.0%}'.format(modelo.best_score_), 'Params': modelo.best_params_}
                path = '../../results/hyperparametrization/data_'+str(i+1)+'/'+resample+'/hypeResults'+modelo.__class__.__name__+'('+NomeClf[modelName] +').csv'

                # Append the row, writing the header only when the file is new
                # or empty. One writer is used for header and rows so the file
                # has a single line-terminator style, and a failure while
                # appending is raised instead of replacing the existing file.
                os.makedirs(os.path.dirname(path), exist_ok=True)
                write_header = not os.path.isfile(path) or os.path.getsize(path) == 0
                with open(path, 'a', newline='') as arq:
                    writer = csv.DictWriter(arq, fieldnames=list(linhas))
                    if write_header:
                        writer.writeheader()
                    writer.writerow(linhas)

                # Report the END timestamp here (the start one was printed above).
                print('End of the ' + modelName + ' algorithm with ' + resample + ' at ' + timeT.getEnDataTime() + '\nTotal run time: ' + timeT.getExecuTime())

                