# Configurações

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import datasets
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [15]:
# Outer evaluation protocol: 10 stratified folds repeated 3 times,
# i.e. 30 scores per evaluated model.
n_splits, n_repeats = 10, 3

# Ensemble sizes tried by the grid searches.
n_estimators = [4, 9, 15, 21]

# Seeded so the outer splits are the same for every model.
rkf = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=36851234,
)

# 4-fold stratified splitter.
skf = StratifiedKFold(n_splits=4)

### Load dataset

In [3]:
# Read the dataset into a pandas DataFrame (fields are separated by ';').
df = pd.read_csv(
    'https://raw.githubusercontent.com/VitorBonella/PL-Dataset/main/dataset.csv',
    sep=";",
)

Como minha matrícula é terminada em 9, utilizarei 7 descritores de Hu e 6 descritores de Haralick.

In [4]:
# Column names of each descriptor family.
# FOURIER = ['df01', 'df02', 'df03', 'df04','df05', 'df06', 'df07', 'df08', 'df09', 'df10']
HU = [f'i{k}' for k in range(1, 8)]  # 'i1' .. 'i7'
HARALICK = [
    'probmax',
    'energia',
    'entropia',
    'contraste',
    'homogeneidade',
    'correlacao',
]
# Every descriptor used as a feature: Hu first, then Haralick.
ALL = [*HU, *HARALICK]

df[ALL].head(2)

Unnamed: 0,i1,i2,i3,i4,i5,i6,i7,probmax,energia,entropia,contraste,homogeneidade,correlacao
0,168083620456198,614508479637171,13077624387800.0,5717751047.37315,380815.942436415,-141733745.231029,315336.969940052,890374128851521,317034178175013,65716176171224,-307129899022437,376304934586401,30352446061056
1,164506924542429,10869024475826,428761904.63528,1011576.08366481,-0.539593107944031,-6859.66112547544,0.390722879150443,874335002692948,305605231787486,756143396285353,-371051952158663,372262223245045,268692571829909


In [5]:
# Use the 'id' column as the table index. The guard keeps this cell re-runnable:
# once 'id' has become the index it is no longer a column, so an unguarded
# set_index('id') raises KeyError on a second execution.
if 'id' in df.columns:
    df = df.set_index('id')

# The class label is split across two columns: join the lamp type (with its
# spaces removed) and the power rating into a single 'classe' column.
df['classe'] = df['tipo_lampada'].str.replace(" ", "") + df['potencia'].astype(str)

In [6]:
# Display the combined class labels built from 'tipo_lampada' and 'potencia'.
df['classe']

id
355    metalica400
356    metalica400
357    metalica400
358    metalica400
359    metalica400
          ...     
656    metalica250
657    metalica250
658    metalica250
659    metalica250
660    metalica250
Name: classe, Length: 297, dtype: object

In [7]:
# The descriptor columns use "," as the decimal separator, while Python floats
# use ".". pandas therefore loaded them as strings (object dtype), so they must
# be converted to float before being used as features.
def _comma_decimal_to_float(col):
    """Return `col` as float, replacing ',' by '.' when `col` holds strings."""
    if pd.api.types.is_numeric_dtype(col):
        # Already numeric (e.g. this cell was re-run): there is no text to fix,
        # and the .str accessor would raise on a numeric column.
        return col.astype(float)
    return col.str.replace(',', '.', regex=False).astype(float)


# Apply per column (the default axis) rather than per row: one vectorized
# string replacement per descriptor instead of one per sample.
df[ALL] = df[ALL].apply(_comma_decimal_to_float)
print(list(df[ALL].dtypes))
df[ALL].head(2)

[dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64')]


Unnamed: 0_level_0,i1,i2,i3,i4,i5,i6,i7,probmax,energia,entropia,contraste,homogeneidade,correlacao
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
355,0.168084,0.000615,0.000131,5.717751e-05,3.808159e-09,-1.417337e-06,3.15337e-09,0.890374,3.170342,6.571618,-30.71299,3.763049,0.000304
356,0.164507,0.001087,4.3e-05,1.011576e-08,-5.395931e-15,-6.859661e-11,3.907229e-15,0.874335,3.056052,7.561434,-37.105195,3.722622,0.000269


In [8]:
# Feature matrix (every column listed in ALL) and target labels ('classe').
dts_x, dts_y = df[ALL], df['classe']

### ZERO R

In [20]:
def classification_report(scores):
    """Print the mean, standard deviation and 95% confidence interval of `scores`.

    The interval is the normal-approximation interval centred on the mean,
    using the standard error ``std / sqrt(len(scores))``.

    Parameters
    ----------
    scores : array-like of float
        One score per evaluation fold. Objects that already provide ``mean``
        and ``std`` (NumPy arrays, pandas Series) are used as they are; any
        other sequence (e.g. a list) is converted to a NumPy array first.

    Returns
    -------
    None
        The summary is written to standard output.

    Notes
    -----
    This helper has the same name as ``sklearn.metrics.classification_report``;
    importing that function later would replace this one.
    """
    from scipy import stats
    import numpy as np

    # Accept plain sequences as well as arrays/Series.
    if not (hasattr(scores, 'mean') and hasattr(scores, 'std')):
        scores = np.asarray(scores, dtype=float)

    mean = scores.mean()
    std = scores.std()
    print(f'Media: {mean:.2f}, Desvio Padrao: {std:.2f}')

    std_error = std / np.sqrt(len(scores))
    if std_error > 0:
        inf, sup = stats.norm.interval(0.95, loc=mean, scale=std_error)
    else:
        # scipy returns (nan, nan) for a non-positive scale; when every score
        # is identical the interval collapses to the mean itself.
        inf = sup = mean
    print(f'Intervalo de confiança (95%): [{inf:.2f},{sup:.2f}]')

In [21]:
from sklearn.dummy import DummyClassifier

# Baseline: a dummy classifier with its default strategy, scored with the
# repeated stratified k-fold splitter.
zR = DummyClassifier()
scores1 = cross_val_score(estimator=zR, X=dts_x, y=dts_y, cv=rkf)
classification_report(scores1)

Media: 0.17, Desvio Padrao: 0.01
Intervalo de confiança (95%): [0.16,0.17]


### Bagging

In [22]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Bagging of Gaussian Naive Bayes learners, applied after standardization.
bg = BaggingClassifier(estimator=GaussianNB(), random_state=11)
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('bagging', bg)])

# Inner loop: 4-fold grid search over the number of estimators.
parameters = {'bagging__n_estimators': n_estimators}
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=4)

# Outer loop: score the tuned model with the repeated stratified k-fold splitter.
scores2 = cross_val_score(estimator=clf, X=dts_x, y=dts_y, cv=rkf)
classification_report(scores2)

Media: 0.47, Desvio Padrao: 0.08
Intervalo de confiança (95%): [0.44,0.50]


### AdaBoost

In [23]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over Gaussian Naive Bayes learners, applied after standardization.
adb = AdaBoostClassifier(estimator=GaussianNB(), random_state=11)
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('boosting', adb)])

# Inner loop: 4-fold grid search over the number of estimators.
parameters = {'boosting__n_estimators': n_estimators}
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=4)

# Outer loop: score the tuned model with the repeated stratified k-fold splitter.
scores3 = cross_val_score(estimator=clf, X=dts_x, y=dts_y, cv=rkf)
classification_report(scores3)

Media: 0.39, Desvio Padrao: 0.08
Intervalo de confiança (95%): [0.36,0.42]


### Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()

# Seeded like the other ensembles in this notebook so that the reported
# scores can be reproduced from a fresh run.
rF = RandomForestClassifier(random_state=11)

pipeline = Pipeline([('transformer', scalar), ('estimator', rF)])

# Inner loop: 4-fold grid search over the number of trees.
grade = {'estimator__n_estimators': n_estimators}

gs = GridSearchCV(estimator=pipeline, param_grid=grade, cv=4)

# Stored under its own name: `scores3` already holds the AdaBoost scores and
# must not be overwritten by the Random Forest results.
scores4 = cross_val_score(gs, dts_x, dts_y, cv=rkf)

classification_report(scores4)

Media: 0.65, Desvio Padrao: 0.07
Intervalo de confiança (95%): [0.63,0.68]
