# Configurações

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import datasets
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [3]:
# Cross-validation protocol shared by the experiments in this notebook:
# repeated stratified k-fold with a fixed seed so the folds are reproducible.
n_splits, n_repeats = 10, 3

# Ensemble sizes explored by the grid searches.
n_estimators = [3, 9, 15, 21]

rkf = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=36851234,
)

### Load dataset

In [4]:
# Read the raw data into pandas; the file uses ';' as the field separator.
DATASET_URL = 'https://raw.githubusercontent.com/VitorBonella/PL-Dataset/main/dataset.csv'
df = pd.read_csv(DATASET_URL, sep=';')

Como minha matrícula é terminada em 9, utilizarei 7 descritores de Hu e 6 descritores de Haralick.

In [5]:
# Column names of each descriptor family.
FOURIER = [f'df{k:02d}' for k in range(1, 11)]  # df01 .. df10
HU = [f'i{k}' for k in range(1, 8)]             # i1 .. i7
HARALICK = [
    'probmax',
    'energia',
    'entropia',
    'contraste',
    'homogeneidade',
    'correlacao',
]

# Every descriptor column, in Fourier -> Hu -> Haralick order.
ALL = [*FOURIER, *HU, *HARALICK]

# Preview the first two rows of the descriptor columns.
df[ALL].head(2)

Unnamed: 0,df01,df02,df03,df04,df05,df06,df07,df08,df09,df10,...,i4,i5,i6,i7,probmax,energia,entropia,contraste,homogeneidade,correlacao
0,879606602603602,540590780493764,252096142058536,274713661921584,128341271697431,837372424071006,459165042660484,516286747775679,474694756923231,307480942393677,...,5717751047.37315,380815.942436415,-141733745.231029,315336.969940052,890374128851521,317034178175013,65716176171224,-307129899022437,376304934586401,30352446061056
1,183325142757933,641922380636373,213351158490625,28934652413586,12263060875406,112299899510848,356611493895844,586153710536664,374648636964241,377850078446776,...,1011576.08366481,-0.539593107944031,-6859.66112547544,0.390722879150443,874335002692948,305605231787486,756143396285353,-371051952158663,372262223245045,268692571829909


In [6]:
# Promote the `id` column to the table index. The guard keeps the cell
# re-runnable: once `id` is the index it is no longer a column, and an
# unconditional set_index('id') would raise KeyError on a second execution.
if 'id' in df.columns:
    df = df.set_index('id')

# The class label is split across two columns, so concatenate them into a
# single `classe` column: lamp type (spaces removed) followed by the power.
df['classe'] = df['tipo_lampada'].str.replace(" ", "") + df['potencia'].astype(str)

In [7]:
# Display the combined class label built from `tipo_lampada` and `potencia`.
df['classe']

id
355    metalica400
356    metalica400
357    metalica400
358    metalica400
359    metalica400
          ...     
656    metalica250
657    metalica250
658    metalica250
659    metalica250
660    metalica250
Name: classe, Length: 297, dtype: object

In [8]:
# The descriptor values use ',' as the decimal separator, whereas Python floats
# use '.', so pandas loaded those columns as strings (object dtype) instead of
# floats. Convert them to float here.
#
# The conversion runs column by column and only touches columns that are not
# already numeric. A row-wise `.str.replace` would silently turn every
# non-string cell of a row into NaN, and would raise if this cell were executed
# a second time (the `.str` accessor rejects all-float data).
text_cols = df[ALL].select_dtypes(exclude='number').columns
if len(text_cols) > 0:
    df[text_cols] = df[text_cols].apply(
        lambda col: col.str.replace(',', '.', regex=False).astype(float)
    )
print(list(df[ALL].dtypes))
df[ALL].head(2)

[dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64')]


Unnamed: 0_level_0,df01,df02,df03,df04,df05,df06,df07,df08,df09,df10,...,i4,i5,i6,i7,probmax,energia,entropia,contraste,homogeneidade,correlacao
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
355,0.087961,0.054059,0.02521,0.027471,0.012834,0.008374,0.004592,0.005163,0.004747,0.003075,...,5.717751e-05,3.808159e-09,-1.417337e-06,3.15337e-09,0.890374,3.170342,6.571618,-30.71299,3.763049,0.000304
356,0.018333,0.064192,0.021335,0.028935,0.012263,0.01123,0.003566,0.005862,0.003746,0.003779,...,1.011576e-08,-5.395931e-15,-6.859661e-11,3.907229e-15,0.874335,3.056052,7.561434,-37.105195,3.722622,0.000269


In [9]:
from sklearn.preprocessing import StandardScaler

# Target vector: the combined class label.
dts_y = df['classe']

# Feature matrix: every descriptor column, standardized over the whole dataset.
# NOTE(review): the scaler is fitted on all samples before cross-validation,
# so held-out folds contribute to the scaling statistics; the pipelines used in
# the ensemble cells already standardize inside each fold - confirm whether
# this global step is still needed.
# NOTE(review): the notebook text mentions only the Hu and Haralick
# descriptors, but ALL also contains the Fourier ones - confirm the intended
# feature set.
dts_x = StandardScaler().fit(df[ALL]).transform(df[ALL])

### ZERO R

In [10]:
def classification_report(scores):
    """Print the mean, standard deviation and 95% confidence interval of scores.

    Parameters
    ----------
    scores : array-like of float
        Scores to summarize, e.g. the array returned by ``cross_val_score``.
        Anything accepted by ``numpy.asarray`` works (list, tuple, ndarray).

    Returns
    -------
    None
        The summary is printed to stdout.

    Notes
    -----
    The interval is a normal approximation centred on the mean, using the
    standard error ``std / sqrt(n)`` as scale. ``std`` is NumPy's default
    population standard deviation (``ddof=0``).

    This function shares its name with ``sklearn.metrics.classification_report``
    but is unrelated to it.
    """
    scores = np.asarray(scores, dtype=float)
    mean, std = scores.mean(), scores.std()
    print(f'Media: {mean:.2f}, Desvio Padrao: {std:.2f}')

    std_err = std / np.sqrt(scores.size)
    if std_err > 0:
        inf, sup = stats.norm.interval(0.95, loc=mean, scale=std_err)
    else:
        # All scores are equal: norm.interval returns nan for a zero scale,
        # so report the degenerate interval [mean, mean] instead.
        inf = sup = mean
    print(f'Intervalo de confiança (95%): [{inf:.2f},{sup:.2f}]')

In [11]:
from sklearn.dummy import DummyClassifier

# Baseline: a DummyClassifier with its default settings, scored with the
# shared repeated stratified k-fold splitter.
zR = DummyClassifier()
scores1 = cross_val_score(estimator=zR, X=dts_x, y=dts_y, cv=rkf)

classification_report(scores1)

Media: 0.17, Desvio Padrao: 0.01
Intervalo de confiança (95%): [0.16,0.17]


### Bagging

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB  # imported but not used in this cell
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Standardization followed by a bagging ensemble (default base estimator).
bg = BaggingClassifier(random_state=11)
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('bagging', bg)])

# Inner loop: 4-fold grid search over the number of estimators.
parameters = {'bagging__n_estimators': n_estimators}
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=4)

# Outer loop: repeated stratified k-fold evaluation of the tuned model.
scores2 = cross_val_score(estimator=clf, X=dts_x, y=dts_y, cv=rkf)

classification_report(scores2)

Media: 0.64, Desvio Padrao: 0.07
Intervalo de confiança (95%): [0.61,0.66]


### AdaBoost

In [14]:
from sklearn.ensemble import AdaBoostClassifier

# Standardization followed by an AdaBoost ensemble.
adb = AdaBoostClassifier(random_state=11)
pipe = Pipeline([('scaler', StandardScaler()), ('boosting', adb)])

# Inner loop: 4-fold grid search over the number of estimators.
parameters = {'boosting__n_estimators': n_estimators}
clf = GridSearchCV(pipe, parameters, cv=4)

# Outer loop: repeated stratified k-fold evaluation of the tuned model.
# The RandomForest cell further down also assigns `scores3`, which would
# discard these values, so the AdaBoost scores are stored under their own name.
# `scores3` is still assigned here so code that reads it keeps working.
scores_adaboost = cross_val_score(clf, dts_x, dts_y, cv=rkf)
scores3 = scores_adaboost

classification_report(scores_adaboost)

Media: 0.34, Desvio Padrao: 0.03
Intervalo de confiança (95%): [0.33,0.35]


### RandomForest

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Standardization followed by a random forest.
rF = RandomForestClassifier(random_state=11)
pipeline = Pipeline([('scaler', StandardScaler()), ('randomforest', rF)])

# Inner loop: 4-fold grid search over the number of trees.
parameters = {'randomforest__n_estimators': n_estimators}
gs = GridSearchCV(pipeline, parameters, cv=4)

# Outer loop: repeated stratified k-fold evaluation of the tuned model.
# The scores get a dedicated name so they cannot be confused with another
# classifier's results.
scores_rf = cross_val_score(gs, dts_x, dts_y, cv=rkf)

# NOTE(review): `scores3` is also assigned by the AdaBoost cell, so this
# assignment overwrites the AdaBoost scores. It is kept only so that code
# reading `scores3` after this cell sees the same values as before; prefer
# `scores_rf`, and confirm which classifier later cells expect in `scores3`.
scores3 = scores_rf

classification_report(scores_rf)

Media: 0.62, Desvio Padrao: 0.07
Intervalo de confiança (95%): [0.60,0.65]
