# Atividade 4 - Atributos Categóricos e Valores Faltantes


### Bibliotecas

In [61]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from scipy.stats import ttest_ind_from_stats
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from joblib import Parallel, delayed
from tqdm.notebook import tqdm


### Dataset

Nesta atividade vamos trabalhar com um subconjunto da base de dados “Mushrooms”. Esta base de dados
é famosa por ter apenas atributos categóricos. Cada instância descreve uma amostra de cogumelo. Cada
amostra possui 22 atributos. Seu objetivo é construir um classificador que seja capaz de reconhecer dado
cogumelo como comestível (edible) ou venenoso (poisonous)

Importar csv retirando a coluna que possui dados faltantes.

In [62]:
df = pd.read_csv('agaricus_lepiota_small_c.csv')
y = df['class'].replace('e', 0).replace('p', 1).values.ravel()
X = df.drop(['class', 'stalk-root'], axis=1)

y = pd.DataFrame(data=y, columns=['class'])

### Ordinal Encoder na coluna "class"

In [63]:
# declaro o transformer
transformers = [('oe_class', OrdinalEncoder(), ['class'])]
column_transformer = ColumnTransformer(transformers, remainder='passthrough')

y_oe = column_transformer.fit_transform(y)
y_oe = pd.DataFrame(data=y_oe, columns=['class'])

### Aplicando One Hot Enconding

Com base nas descrições das colunas do dataframe, você decidiu que o One Hot Encoding seria o mais adequado, pois nenhum dos valores das colunas possuía uma noção de ordem ou importância. Ao aplicar esse encoder, foi observado que a dimensionalidade do conjunto de dados aumentou consideravelmente, resultando em 113 colunas.

In [64]:
X.isnull().sum()
allColumns = X.columns.tolist()

# Utilizar one hot encoding para todas as columns
transformer_hot = [('oh_cat', OneHotEncoder(), allColumns)]

ct_oh = ColumnTransformer(transformer_hot, remainder='passthrough')
X_oh = ct_oh.fit_transform(X).todense()

columns = ct_oh.get_feature_names()
X_oh = pd.DataFrame(data=X_oh, columns=columns)

X_oh



Unnamed: 0,oh_cat__x0_b,oh_cat__x0_f,oh_cat__x0_k,oh_cat__x0_s,oh_cat__x0_x,oh_cat__x1_f,oh_cat__x1_s,oh_cat__x1_y,oh_cat__x2_b,oh_cat__x2_c,...,oh_cat__x19_s,oh_cat__x19_v,oh_cat__x19_y,oh_cat__x20_d,oh_cat__x20_g,oh_cat__x20_l,oh_cat__x20_m,oh_cat__x20_p,oh_cat__x20_u,oh_cat__x20_w
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
998,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Validação cruzada

In [65]:
X_oh = X_oh.values
y_oe = y_oe.values.ravel()

### KNN em 2 folders

In [66]:
def treinando_knn():
    # Serão utilizadas 10 vias no primeiro nível da validação cruzada
    folders = 10
    twoFolders = 5
    n_neighbors = {'n_neighbors' : range(1,50, 2)}
    acuracyKNN = []
    skf = StratifiedKFold(n_splits=folders, shuffle=True, random_state=1)

    for treino_idx, teste_idx in skf.split(X_oh, y_oe):
        
        X_treino = X_oh[treino_idx]
        y_treino = y_oe[treino_idx]

        X_teste = X_oh[teste_idx]
        y_teste = y_oe[teste_idx]
        
        ss = StandardScaler()
        ss.fit(X_treino)
        X_treino = ss.transform(X_treino)
        X_teste = ss.transform(X_teste)
        
        knn = KNeighborsClassifier()
        knn = GridSearchCV(knn, n_neighbors, cv=StratifiedKFold(n_splits=twoFolders))
        knn.fit(X_treino, y_treino)
        predict = knn.predict(X_teste)
    
        acuracyKNN.append(accuracy_score(y_teste, predict))
        return acuracyKNN

### SVM em 2 folders

In [67]:
# Realiza o treinamento para cada hiperparâmetro
def treina_svm(C, gamma, X_treino, X_val, y_treino, y_val):
    svm = SVC(C= C, gamma= gamma)
    svm.fit(X_treino, y_treino)
    pred = svm.predict(X_val)
    return accuracy_score(y_val, pred)

def selecionar_melhor_svm(Cs, gamma,X_treino:np.ndarray, X_val:np.ndarray, y_treino:np.ndarray, y_val:np.ndarray, n_jobs=4):
    acuracyValueSVM = []
    hiperparams = list(itertools.product(Cs, gamma))
    acuracyValueSVM = Parallel(n_jobs= n_jobs)(delayed(treina_svm)
            (c, g, X_treino, X_val, y_treino, y_val) for c, g in hiperparams)

    melhorAcuracy = max(acuracyValueSVM)
    melhorCombinacao = hiperparams[np.argmax(acuracyValueSVM)]
    
    svm = SVC(C= melhorCombinacao[0], gamma=melhorCombinacao[1])
    svm.fit(np.vstack((X_treino, X_val)), [*y_treino, *y_val])

    return svm, melhorCombinacao[0], melhorCombinacao[1], melhorAcuracy

In [68]:
def treinando_svm():
    folders = 10
    twoFolders = 5
    skf = StratifiedKFold(n_splits=folders, shuffle=True, random_state=1)
    param_grid = {'C': [1, 10, 100, 1000], 'gamma': ['scale', 'auto', 2e-2, 2e-3, 2e-4],'kernel': ['rbf']}

    AcuraciaSVM = []

    for idx_treino, idx_teste in skf.split(X_oh, y_oe):
        X_treino = X_oh[idx_treino]
        y_treino = y_oe[idx_treino]

        X_teste = X_oh[idx_teste]
        y_teste = y_oe[idx_teste]

        ss.fit(X_treino)
        X_treino = ss.transform(X_treino)
        X_teste = ss.transform(X_teste)

        svm = SVC()
        svm = GridSearchCV(svm, param_grid, cv=StratifiedKFold(n_splits=twoFolders), refit=True)
        svm.fit(X_treino, y_treino)

        predictionSVM = svm.predict(X_teste)
        acuracy_svm = accuracy_score(y_teste, predictionSVM)

        AcuraciaSVM.append(acuracy_svm)

    return AcuraciaSVM

In [69]:
knnAcuracy = treinando_knn()
svmAcuracy = treinando_svm()

print("\n-------- KNN --------\n- min: %.2f\n- max: %.2f\n- avg +- std: %.2f +- %.2f\n" % (min(knnAcuracy), max(knnAcuracy), np.mean(knnAcuracy), np.std(knnAcuracy)))
print("\n-------- SVM --------\n- min: %.2f\n- max: %.2f\n- avg +- std: %.2f +- %.2f\n" % (min(svmAcuracy), max(svmAcuracy), np.mean(svmAcuracy), np.std(svmAcuracy)))


-------- KNN --------
- min: 0.99
- max: 0.99
- avg +- std: 0.99 +- 0.00


-------- SVM --------
- min: 0.89
- max: 0.99
- avg +- std: 0.94 +- 0.03



Após analisar os resultados das acurácias do KNN e SVM, observamos que há uma pequena sobreposição nas acurácias, mas a pergunta crucial é se essas diferenças são estatisticamente significativas. Para avaliar isso, utilizaremos um nível de significância de 0,05, que representa o valor mínimo necessário para rejeitar a hipótese nula. Com esse valor, teremos 95% de confiança para afirmar que a diferença entre as duas distribuições é estatisticamente significativa.

### Teste-T

In [46]:
statistic, pvalue = ttest_ind_from_stats(np.mean(svm_acuracias), np.std(svm_acuracias), len(svm_acuracias), np.mean(acuracias_folds), np.std(acuracias_folds), len(acuracias_folds))

if(pvalue <= 0.05):
    print("É possível rejeitar a hipótese nula")
else:
    print("Não é possível rejeitar a hipótese nula")

print("PValue tem valor igual à: %.4f" % pvalue)

Não é possível rejeitar a hipótese nula
PValue tem valor igual à: 0.0691


#### Você usaria algum classificador que criou para decidir se comeria ou não um cogumelo classificado por ele? Justifique usando o desempenho obtido e o resultado do teste de hipótese. Esta resposta deve estar no final do caderno jupyter, após a análise estatística


Sim, apesar do classificar não possuir 100% de acurácia, temos um resultado proximo a 90% para mais, logo, possui a probabilidade de não dar problemas, pois apenas comeria cogumelos desconhecidos em caso de sobrevivência e pura necessidade.