In [1]:
# Heterogeneous pooling
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline
from tqdm import tqdm

# supress warnings
import warnings
warnings.filterwarnings("ignore")

warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# all the data will be stored in this dataframe, with the method name, mean accuracy, standard deviation, lower and upper bound
df = pd.DataFrame(columns=['method', 'mean', 'std', 'lower', 'upper'])

df_per_fold = pd.DataFrame(columns=['method', 'fold', 'mean','std', 'lower', 'upper'])



In [3]:
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')


# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [4]:
# Zero Rule Baseline
from sklearn.dummy import DummyClassifier

clf = DummyClassifier(strategy='most_frequent')

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

df = df.append({'method': 'ZR', 'mean': scores.mean(), 'std': scores.std(), 'lower': scores.mean() - scores.std(), 'upper': scores.mean() + scores.std()}, ignore_index=True)


In [5]:
def train_model(model,params_grid,name, df): 
    cv_inner = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=36851234)
    cv_outer = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

    pipe = Pipeline(steps=[('s', StandardScaler()), ('m', model)])
    df_per_fold = pd.DataFrame(columns=['method', 'fold', 'mean','std', 'lower', 'upper'])
    params = params_grid
    counter = 0
    scores = []
    best_acc = 0
    for train_ix, test_ix in tqdm(cv_outer.split(X_train, y_train)):
        # split data
        X_train_inner, X_test_inner = X_train.iloc[train_ix, :], X_train.iloc[test_ix, :]
        y_train_inner, y_test_inner = y_train.iloc[train_ix], y_train.iloc[test_ix]

        # define search
        search = GridSearchCV(pipe, param_grid=params, scoring='accuracy', cv=cv_inner, n_jobs=-1)

        # execute search
        result = cross_val_score(search, X_train_inner, y_train_inner.values.ravel(), cv=cv_inner, n_jobs=-1)

        scores.extend([result.mean()])
        # df_per_fold = df_per_fold.append({'method': name, 'fold': counter, 'mean': result.mean(), 'std': result.std(), 'lower': result.mean() - result.std(), 'upper': result.mean() + result.std()}, ignore_index=True)
        counter += 1
        # check the best model
        if result.mean() > best_acc:
            best_acc = result.mean()
            best_model = search

    df_awnser = pd.concat([df, pd.DataFrame({'method': [name], 'mean': [np.mean(scores)], 'std': [np.std(scores)], 'lower': [np.mean(scores) - np.std(scores)], 'upper': [np.mean(scores) + np.std(scores)]})], ignore_index=True)
    return df_awnser, df_per_fold

# def train_model(model,params_grid,name, df): 
#     scalar = StandardScaler()
#     pipe = Pipeline(steps=[('s',scalar), ('m', model)])

#     gs = GridSearchCV(pipe, param_grid=params_grid, scoring='accuracy', cv=4, n_jobs=-1)

#     rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)
   

#     scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=rkf, n_jobs=-1)

#     df_awnser = pd.concat([df, pd.DataFrame({'method': [name], 'mean': [np.mean(scores)], 'std': [np.std(scores)], 'lower': [np.mean(scores) - np.std(scores)], 'upper': [np.mean(scores) + np.std(scores)]})], ignore_index=True)
    
#     return df_awnser, " "

In [6]:
from sklearn.ensemble import BaggingClassifier


bg = BaggingClassifier(n_estimators=3)

name = 'BA'

params_grid = {
    'm__n_estimators': [3,9,15,21]
    } 

df, df_per_fold = train_model(bg,params_grid,name,df)


30it [01:43,  3.44s/it]


In [7]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=3)

name = 'AB'

params_grid = {
    'm__n_estimators': [3,9,15,21]
    }

df,df_per_fold = train_model(ada,params_grid,name,df)

30it [01:10,  2.36s/it]


In [8]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier()

name = 'RF'

params_grid = {
    'm__n_estimators': [3,9,15,21]
} 


df,df_per_fold = train_model(rf,params_grid,name,df)

30it [01:19,  2.64s/it]


In [9]:
df

Unnamed: 0,method,mean,std,lower,upper
0,ZR,0.177174,0.015893,0.161281,0.193066
1,BA,0.459815,0.016853,0.442962,0.476669
2,AB,0.262023,0.012471,0.249553,0.274494
3,RF,0.459703,0.019993,0.43971,0.479695


### HP

In [10]:
# save df 
df.to_csv('df.csv', index=False)

In [11]:
# import base estimators
from sklearn.base import BaseEstimator
# import classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


class HeterogeneousEnsemble(BaseEstimator):
    # define o construtor para o classificador
    def __init__(self,n_samples=3):
        
        self.classifiers  =  [DecisionTreeClassifier(), KNeighborsClassifier(), GaussianNB()]
        self.n_samples = n_samples
        self.trained_classifiers = []


    def train_classifiers(self, X_train, y_train):

        # converter para numpy array
        X_train = X_train.to_numpy()
        y_train = y_train.to_numpy()

        # faz o loop sobre os classificadores individuais
        for clf in self.classifiers:
            # treina o classificador no conjunto de treinamento atual
            clf.fit(X_train, y_train)
            # adiciona o classificador treinado à lista
            self.trained_classifiers.append(clf)
        # retorna a lista de classificadores treinados
        return self.trained_classifiers

    def sample_data(self,X_train, y_train, random_state):
        # amostra as características com reposição e obtém os rótulos correspondentes
        X_train_sampled = X_train.sample(frac=1, replace=True, random_state=random_state)
        y_train_sampled = y_train.loc[X_train_sampled.index]
        # retorna o conjunto de dados amostrado
        return X_train_sampled, y_train_sampled


    def predict_hp(self, X_test, class_order):
        # cria um dicionário para armazenar as predições de cada classificador
        votes = {}
        X_test = X_test.to_numpy().reshape(1, -1)
        # faz o loop sobre os classificadores individuais
        for clf in self.trained_classifiers:
            # prediz a classe do exemplo de teste usando o classificador atual
            pred = clf.predict(X_test)
            # armazena a predição no dicionário
            if pred[0] in votes:
                votes[pred[0]] += 1
            else:
                votes[pred[0]] = 1
                

        # obtém a(s) classe(s) mais votada(s) e as armazena em uma lista
        max_votes = max(votes.values())
        most_voted_classes = [k for k,v in votes.items() if v == max_votes]
        
        hp_pred = None

        # se houver mais de uma classe mais votada, quebra o empate usando a ordem das classes do conjunto de treinamento
        if len(most_voted_classes) > 1:
            for c in class_order:
                if c in most_voted_classes:
                    hp_pred = c
                    break
            if hp_pred is None:
                hp_pred = most_voted_classes[0]
        # caso contrário, retorna a classe mais votada como a predição do conjunto HP
        else:
            hp_pred = most_voted_classes[0]

        # retorna a predição
        return hp_pred



    def fit(self,X_train,y_train): 

        # # if data is numpy array, convert to pandas dataframe
        if isinstance(X_train, np.ndarray):
            X_train = pd.DataFrame(X_train)
        if isinstance(y_train, np.ndarray):
            y_train = pd.Series(y_train)

        # reset index
        X_train.reset_index(drop=True, inplace=True)
        y_train.reset_index(drop=True, inplace=True)

        classifiers = [] # ciclo para treinar os classificadores individuais
        for i in range(self.n_samples):
            # se for a primeira iteração, use os dados de treinamento originais
            if i == 0:
                X_train_current = X_train.copy()
                y_train_current = y_train.copy()
            # caso contrário, crie um novo conjunto de treinamento amostrando com reposição os dados originais usando a função sample_data
            else:
                X_train_current, y_train_current = self.sample_data(X_train, y_train, i)
            
            # treina os classificadores individuais nos dados de treinamento atuais usando a função train_classifiers e os estende à lista
            classifiers.extend(self.train_classifiers(X_train_current, y_train_current))

    def predict(self,X_test): 
        if(isinstance(X_test, np.ndarray)):
            X_test = pd.DataFrame(X_test)
        
        class_order = y_train.value_counts().index.tolist()
        # cria uma lista vazia para armazenar as predições do conjunto HP
        hp_predictions = []
        # faz o loop sobre os exemplos de teste
        for index, row in X_test.iterrows():
            # predict the class of the test example using the predict_hp function and append it to the list # prediz a classe do exemplo de teste usando a função predict_hp e a adiciona à lista
            hp_pred = self.predict_hp(row, class_order)
            hp_predictions.append(hp_pred)
        return hp_predictions

    

In [12]:
hp_Test = HeterogeneousEnsemble(n_samples=9)

hp_Test.fit(X_train,y_train)

hp_predictions = hp_Test.predict(X_test)

# avalia a acurácia do conjunto HP no conjunto de teste
hp_accuracy = accuracy_score(y_test, hp_predictions)
print(f'The accuracy of HP ensemble is {hp_accuracy:.4f}')

The accuracy of HP ensemble is 0.4167


In [13]:


hp = HeterogeneousEnsemble()

name = 'HP'

params_grid = {
    'm__n_samples': [3,9,15,21]
    }

df,df_per_fold = train_model(hp,params_grid,name,df)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

In [None]:
df.to_csv('df.csv', index=False)