# Configurações

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import datasets
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [2]:
n_splits = 10
n_repeats = 3
n_estimators = [3,9,15,21]

### Load dataset

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/VitorBonella/PL-Dataset/main/dataset.csv',sep=";") #Leitura dos dados para o pandas

Como minha matrícula é terminada em 9, utilizarei 7 descritores de Hu e 6 descritores de Haralick.

In [4]:
# Lista de Indices
FOURIER = ['df01', 'df02', 'df03', 'df04','df05', 'df06', 'df07', 'df08', 'df09', 'df10']
HU = ['i1', 'i2', 'i3', 'i4','i5', 'i6', 'i7']
HARALICK = ['probmax', 'energia', 'entropia', 'contraste','homogeneidade', 'correlacao']
ALL = FOURIER + HU + HARALICK

df[ALL].head(2)

Unnamed: 0,df01,df02,df03,df04,df05,df06,df07,df08,df09,df10,...,i4,i5,i6,i7,probmax,energia,entropia,contraste,homogeneidade,correlacao
0,879606602603602,540590780493764,252096142058536,274713661921584,128341271697431,837372424071006,459165042660484,516286747775679,474694756923231,307480942393677,...,5717751047.37315,380815.942436415,-141733745.231029,315336.969940052,890374128851521,317034178175013,65716176171224,-307129899022437,376304934586401,30352446061056
1,183325142757933,641922380636373,213351158490625,28934652413586,12263060875406,112299899510848,356611493895844,586153710536664,374648636964241,377850078446776,...,1011576.08366481,-0.539593107944031,-6859.66112547544,0.390722879150443,874335002692948,305605231787486,756143396285353,-371051952158663,372262223245045,268692571829909


In [5]:
df.set_index('id',inplace=True) #Transformando a coluna id no indice da tabela

df[ALL] = df[ALL].apply(lambda x: x.str.replace(',', '.').astype(float), axis=1)

#Observem que a classe esta separada em duas colunas então devemos concatenadas para formar uma coluna só chamada classe
df['classe'] = df['tipo_lampada'].str.replace(" ", "") + df['potencia'].astype(str) 

In [6]:
df['classe']

id
355    metalica400
356    metalica400
357    metalica400
358    metalica400
359    metalica400
          ...     
656    metalica250
657    metalica250
658    metalica250
659    metalica250
660    metalica250
Name: classe, Length: 297, dtype: object

In [7]:
df[ALL].head(2)

Unnamed: 0_level_0,df01,df02,df03,df04,df05,df06,df07,df08,df09,df10,...,i4,i5,i6,i7,probmax,energia,entropia,contraste,homogeneidade,correlacao
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
355,0.087961,0.054059,0.02521,0.027471,0.012834,0.008374,0.004592,0.005163,0.004747,0.003075,...,5.717751e-05,3.808159e-09,-1.417337e-06,3.15337e-09,0.890374,3.170342,6.571618,-30.71299,3.763049,0.000304
356,0.018333,0.064192,0.021335,0.028935,0.012263,0.01123,0.003566,0.005862,0.003746,0.003779,...,1.011576e-08,-5.395931e-15,-6.859661e-11,3.907229e-15,0.874335,3.056052,7.561434,-37.105195,3.722622,0.000269


In [8]:
from sklearn.preprocessing import StandardScaler


X = df[ALL]
y = df['classe']


In [9]:
def classification_report(scores):
    print(f'Media: {scores.mean():.8f}, Desvio Padrao: {scores.std():.8f}')
    inf, sup = stats.norm.interval(0.95, loc=scores.mean(), 
                               scale=scores.std()/np.sqrt(len(scores)))
    print(f'Intervalo de confiança (95%): [{inf:.8f},{sup:.8f}]')

## ZeroR

In [10]:
from sklearn.dummy import DummyClassifier

zR = DummyClassifier()

rkf = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats = n_repeats, random_state = 36851234)

scoresZR = cross_val_score(zR, X, y, cv = rkf)

classification_report(scoresZR)

Media: 0.16505747, Desvio Padrao: 0.01088258
Intervalo de confiança (95%): [0.16116326,0.16895168]


### Bagging

In [11]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

parameters = {'bagging__n_estimators':n_estimators}

bg = BaggingClassifier(random_state=11)

pipe = Pipeline([('scaler', StandardScaler()), ('bagging', bg)])

clf = GridSearchCV(pipe, parameters, cv=4)

rkf = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats = n_repeats, random_state = 36851234)

scoresBA = cross_val_score(clf, X, y, cv = rkf)

classification_report(scoresBA)

Media: 0.63563218, Desvio Padrao: 0.07451964
Intervalo de confiança (95%): [0.60896616,0.66229821]


### AdaBoost

In [12]:
from sklearn.ensemble import AdaBoostClassifier

parameters = {'boosting__n_estimators':n_estimators}

adb = AdaBoostClassifier(random_state=11)

pipe = Pipeline([('scaler', StandardScaler()), ('boosting',adb)])

clf = GridSearchCV(pipe, parameters,cv=4)

rkf = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats = n_repeats, random_state = 36851234)

scoresAB = cross_val_score(clf, X, y, cv = rkf)

classification_report(scoresAB)

Media: 0.33904215, Desvio Padrao: 0.02916987
Intervalo de confiança (95%): [0.32860403,0.34948026]


### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

parameters = {'randomforest__n_estimators':n_estimators}

rF = RandomForestClassifier(random_state=11)

pipeline = Pipeline([('scaler', StandardScaler()), ('randomforest', rF)])

gs = GridSearchCV(pipeline, parameters, cv=4)

rkf = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats = n_repeats, random_state = 36851234)

scoresRF = cross_val_score(gs, X, y, cv=rkf)

classification_report(scoresRF)

Media: 0.62314176, Desvio Padrao: 0.07457384
Intervalo de confiança (95%): [0.59645635,0.64982718]


### Heterogeneus Pooling

In [25]:
#Heterogeneous Pooling
from collections import Counter
from sklearn.utils import resample
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, ClassifierMixin


class HeterogeneousPooling(BaseEstimator, ClassifierMixin):
    def __init__(self, n_samples=1):
        self.n_samples = n_samples
        self.classifiers = []
        self.base_classifiers = [DecisionTreeClassifier(),
                                 GaussianNB(),
                                 KNeighborsClassifier()]

    def fit(self, X, y):
        self.classes_ = sorted(set(y), key=lambda x: y.tolist().count(x), reverse=True)
        
        random_state = 0
        for i in range(self.n_samples):
            if i == 0:
                # Treina a partir da original
                X_train, y_train = X, y
            else:
                # Cria uma base de treino resample a partir da original
                random_state += 1
                X_train, y_train = resample(X, y, replace=True, random_state=random_state)

             # Treina os classificadores base e os adiciona ao HeterogeneousPooling
            for classifier in self.base_classifiers:
                clf = classifier.fit(X_train, y_train)
                self.classifiers.append(clf)

    def predict(self, X):
        #As previsões de cada classificador são adicionadas à lista predictions
        predictions = []
        #iteramos sobre cada classificador armazenado em self.classifiers e realizamos
        #a previsão para o conjunto de dados de teste X
        for classifier in self.classifiers:
            predictions.append(classifier.predict(X))

        #final_predictions para armazenar as previsões finais
        final_predictions = []
        for i in range(len(X)):
            votes = [predictions[j][i] for j in range(len(self.classifiers))]
            #contamos o número de votos para cada classe usando o Counter
            vote_count = Counter(votes)
            #Encontramos o valor máximo de votos com max_vote
            max_vote = max(vote_count.values())

            #buscamos a classe mais frequente entre as que receberam o número máximo de votos.
            most_frequent_class = None
            for cls, count in vote_count.items():
                if count == max_vote:
                    if most_frequent_class is None:
                        most_frequent_class = cls
                    elif self.classes_.index(cls) < self.classes_.index(most_frequent_class):
                        most_frequent_class = cls
            #adicionamos a classe mais votada most_frequent_class à lista final_predictions
            #e continuamos iterando até processar todos os exemplos de teste.
            final_predictions.append(most_frequent_class)

        #Por fim, retornamos a lista final_predictions, que contém as previsões finais
        #para o conjunto de dados de teste.
        return final_predictions

In [26]:
# Definição dos parâmetros para a busca em grade
parameters = {'heterogeneous__n_samples': [1, 3, 5, 7]}

# Instanciação do HeterogeneousPooling
hp = HeterogeneousPooling()

# Criação do pipeline com o HeterogeneousPooling e padronização
pipe = Pipeline([('scaler', StandardScaler()), ('heterogeneous', hp)])

# Instanciação do GridSearchCV com o pipeline e os parâmetros
clf = GridSearchCV(pipe, parameters, cv=4)

# Definição do esquema de validação cruzada estratificada
rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

# Avaliação do modelo usando o cross_val_score
HP_scores = cross_val_score(clf, X, y, cv=rkf)

classification_report(HP_scores)
HP_scores

Media: 0.58482759, Desvio Padrao: 0.10235024
Intervalo de confiança (95%): [0.54820270,0.62145248]


array([0.63333333, 0.3       , 0.5       , 0.6       , 0.56666667,
       0.53333333, 0.6       , 0.62068966, 0.51724138, 0.75862069,
       0.53333333, 0.43333333, 0.56666667, 0.56666667, 0.53333333,
       0.8       , 0.63333333, 0.5862069 , 0.62068966, 0.55172414,
       0.66666667, 0.73333333, 0.56666667, 0.76666667, 0.63333333,
       0.43333333, 0.6       , 0.48275862, 0.65517241, 0.55172414])

In [44]:
from sklearn.base import BaseEstimator
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.utils import check_random_state
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import Counter


class HeterogeneousPooling(BaseEstimator, ClassifierMixin):
    def __init__(self, n_samples=1):
        self.n_samples = n_samples
        self.classifiers = []
        
    def fit(self, X, y):
        self.classes_ = sorted(set(y), key=lambda x: y.tolist().count(x), reverse=True)
        random_state = 0
        
        dt = DecisionTreeClassifier()
        nb = GaussianNB()
        knn = KNeighborsClassifier()
        
        for i in range(self.n_samples):
            if i == 0:
                X_train = X
                y_train = y
            else:
                random_state += 1
                X_train, y_train = resample(X, y, replace=True, random_state=random_state)

            clf_dt = dt.fit(X_train, y_train)
            clf_nb = nb.fit(X_train, y_train)
            clf_knn = knn.fit(X_train, y_train)
            
            self.classifiers.append(clf_dt)
            self.classifiers.append(clf_nb)
            self.classifiers.append(clf_knn)

    def predict(self, X):
        predictions = [classifier.predict(X) for classifier in self.classifiers]

        final_predictions = []
        for i in range(len(X)):
            votes = [prediction[i] for prediction in predictions]

            vote_count = Counter(votes)
            max_vote = max(vote_count.values())

            most_frequent_class = None
            for cls, count in vote_count.items():
                if count == max_vote:
                    if most_frequent_class is None:
                        most_frequent_class = cls
                    elif self.classes_.index(cls) < self.classes_.index(most_frequent_class):
                        most_frequent_class = cls

            final_predictions.append(most_frequent_class)

        return final_predictions

In [None]:
parameters = {'heterogeneous__n_samples': [1, 3, 5, 7]}

hp = HeterogeneousPooling()

pipe = Pipeline([('scaler', StandardScaler()), ('heterogeneous', hp)])

clf = GridSearchCV(pipe, parameters, cv=4)

rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

HP_scores = cross_val_score(clf, X, y, cv=rkf)
classification_report(HP_scores)
HP_scores