# Trabalho 1 - Nivelamento

Considere os datasets abaixo, estratégias de pré-processamento, medidas de avaliação, métodos de comparação estatística e os seguintes algoritmos de aprendizado de máquina (AM): árvore de decisão, random forest e k-nearest neighbors. A partir disso, responda às seguintes perguntas:

1. Qual o algoritmo de AM mais adequado para cada problema?
2. Qual o algoritmo de AM mais adequado para todos os problemas?

Para responder a essas questões, construa um notebook no Colab ou em um ambiente similar. Documente de forma clara cada passo e justifique suas decisões.

In [1]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn import tree
import matplotlib.patches as mpatches
import matplotlib as plt
from scipy.io import arff
import urllib.request
import pandas as pd
import numpy as np
import io

In [10]:
# Download locations of the datasets used in this notebook. All but AedesSex
# share the OpenML download prefix; AedesSex is a CSV hosted on GitHub.
_OPENML_DOWNLOAD = "https://www.openml.org/data/download"

PhishingWebsites_link = f"{_OPENML_DOWNLOAD}/1798106/phpV5QYya"
arrhythmia_link = f"{_OPENML_DOWNLOAD}/53551/arrhythmia.arff"
Satellite_link = f"{_OPENML_DOWNLOAD}/16787463/phpZrCzJR"
airlines_link = f"{_OPENML_DOWNLOAD}/66526/phpvcoG8S"
AedesSex_link = (
    "https://github.com/denismr/Classification-and-Counting-with-Recurrent-Contexts"
    "/raw/master/codeAndData/data/AedesSex.csv"
)
phoneme_link = f"{_OPENML_DOWNLOAD}/1592281/php8Mz7BG"
adult_link = f"{_OPENML_DOWNLOAD}/1595261/phpMawTba"

In [9]:
def loadDataset_Arff(url: str) -> pd.DataFrame:
    """Download an ARFF file and load its data records into a DataFrame.

    The response body is decoded as UTF-8 and parsed with
    ``scipy.io.arff.loadarff``. Only the data records are kept; the ARFF
    metadata returned alongside them is discarded.

    Args:
        url: Location of the ARFF file, opened with ``urllib.request.urlopen``.

    Returns:
        A DataFrame built from the parsed records.
    """
    # Open the response as a context manager so the connection is released
    # even when reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8')
    data, _meta = arff.loadarff(io.StringIO(text))
    return pd.DataFrame(data)


In [11]:
def get_dummies(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with column ``col`` one-hot encoded.

    The generated columns are named ``<col>.<value>``; all other columns are
    kept unchanged.
    """
    encoded = pd.get_dummies(df, columns=[col], prefix=col, prefix_sep='.')
    return encoded.copy()

In [12]:
def select_nominals(dataset: pd.DataFrame) -> list:
    """
    Return the names of the non-numeric columns of the dataset.

    A column is selected when its dtype compares equal to "object".
    """
    return [name for name, dtype in dataset.dtypes.items() if dtype == "object"]

def is_binary_nominal(nominals: list) -> "tuple[bool, list[str]]":
    """
    Tell whether a sequence of nominal values holds exactly two distinct values.

    Args:
        nominals: The values of one nominal attribute (any iterable, e.g. a
            list or a DataFrame column). The values must be hashable.

    Returns:
        A pair ``(is_binary, distinct)`` where ``distinct`` lists the distinct
        values in order of first appearance and ``is_binary`` is True when
        there are exactly two of them.
    """
    # dict.fromkeys de-duplicates in one pass while keeping first-seen order,
    # instead of scanning a growing list for every value.
    distinct = list(dict.fromkeys(nominals))
    return len(distinct) == 2, distinct

In [6]:
def convert_binary_nominal_to_numeric(dataset: pd.DataFrame, attribute: str) -> pd.DataFrame:
    """
    Replace the nominal values of one column with integer codes.

    Each distinct value receives a code (0, 1, ...) in order of first
    appearance. The input dataset is not modified.

    Args:
        dataset: Source data.
        attribute: Name of the column to convert.

    Returns:
        A copy of ``dataset`` in which ``attribute`` holds the numeric codes.
        Missing values (NaN/None) are left as missing.
    """
    new_dt = dataset.copy()
    column = new_dt[attribute]

    # Assign a code to every distinct value, in order of first appearance.
    codes: dict = {}
    for value in column:
        if value not in codes:
            codes[value] = len(codes)

    # Translate every value in a single pass. Replacing one value at a time in
    # place is unsafe: a code that was just written (0 or 1) can compare equal
    # to a value that is still pending (e.g. False == 0) and be replaced again.
    new_dt[attribute] = column.map(codes.get, na_action="ignore")
    return new_dt

In [13]:
def convert_nominal_to_numeric(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the dataset with every nominal column made numeric.

    Nominal columns with exactly two distinct values are converted with
    convert_binary_nominal_to_numeric; all other nominal columns are expanded
    with get_dummies.
    """
    converted = dataset.copy()
    for column in select_nominals(dataset):
        has_two_values, _ = is_binary_nominal(converted[column])
        # Two-valued attributes go through the binary converter; the rest are
        # expanded into dummy columns.
        convert = convert_binary_nominal_to_numeric if has_two_values else get_dummies
        converted = convert(converted, column)
    return converted

In [14]:
def proportion(Dataset, target="Target"):
    """Print the size of each class and their ratio, then plot the counts.

    Args:
        Dataset: DataFrame holding the data.
        target: Name of the class column; its values are compared against
            0 and 1 (so False/True labels work as well).

    Prints the number of rows in class 0, the number in class 1 and the ratio
    class 1 : class 0, and draws a bar chart of the value counts.
    """
    labels = Dataset[target]
    target_count = labels.value_counts()

    # Count by label value rather than indexing value_counts() with 0/1:
    # value_counts() is ordered by frequency, so an integer key on a boolean
    # index is ambiguous between "label" and "position" and fails outright
    # when one of the classes is absent.
    negatives = int((labels == 0).sum())
    positives = int((labels == 1).sum())

    print('Class 0:', negatives)
    print('Class 1:', positives)
    # Guard the division so a dataset without class 0 still gets reported.
    ratio = round(positives / negatives, 2) if negatives else float('inf')
    print('Proporção:', ratio, ': 1')
    target_count.plot(kind='bar', title='Count (target)', color=['#1F77B4', '#FF7F0E'])

In [15]:
def normalize(dataset, target: str = "Target") -> pd.DataFrame:
    """Scale every predictive column with ``MinMaxScaler``.

    Args:
        dataset: DataFrame whose columns, except ``target``, can all be
            scaled by ``MinMaxScaler``.
        target: Name of the class column, which is carried over unscaled.

    Returns:
        A new DataFrame with the scaled predictive columns followed by the
        ``target`` column, using the same row index as ``dataset``.
    """
    # Split the predictive attributes from the class labels.
    features = dataset.drop(columns=[target])
    labels = dataset[target]

    scaler = MinMaxScaler()
    # Keep the original row index: the labels are re-attached by index
    # alignment, so a fresh 0..n-1 index would misplace them (or yield NaN)
    # whenever the input index is not the default one, e.g. after dropna().
    normalized = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    normalized[target] = labels
    return normalized

In [16]:
def knn(dataset, n_neighbors=3):
    """Train a k-nearest-neighbours classifier and print its test accuracy.

    The rows are split 70% / 30% into train and test sets with
    ``train_test_split`` (no fixed seed, so the result varies between runs).

    Args:
        dataset: DataFrame with the predictive columns and a 'Target' column.
        n_neighbors: Number of neighbours used by the classifier.

    Prints the accuracy on the test split as a percentage.
    """
    features = dataset.drop(['Target'], axis=1)
    labels = dataset['Target']
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)
    # Honour the caller's n_neighbors instead of a hard-coded 3.
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(accuracy_score(y_test, y_pred)*100)

# Lendo todos os datasets

In [25]:
# Load the datasets. Only the arrhythmia download is currently enabled; the
# other loads are commented out.
# NOTE(review): later cells use Dataset_PhishingWebsites, Dataset_Satellite,
# Dataset_airlines, Dataset_AedesSex, Dataset_phoneme and Dataset_adult, which
# stay undefined until the matching line below is re-enabled.
# Dataset_PhishingWebsites = loadDataset_Arff(PhishingWebsites_link)
Dataset_arrhythmia = loadDataset_Arff(arrhythmia_link)
# Dataset_Satellite = loadDataset_Arff(Satellite_link)
# Dataset_airlines = loadDataset_Arff(airlines_link)
# Dataset_AedesSex = pd.read_csv(AedesSex_link)
# Dataset_phoneme = loadDataset_Arff(phoneme_link)
# Dataset_adult = loadDataset_Arff(adult_link)

Abaixo será feita a conversão das classes-alvo para True e False, e a coluna de classe será renomeada para Target nos datasets em que ela tem outro nome. Também será feita uma análise da distribuição das classes e uma busca por atributos nominais.

In [90]:
# Build the boolean Target column: rows whose 'Result' is b'-1' become True.
# The original 'Result' column is then removed.
Dataset_PhishingWebsites['Target'] = [
    label == b'-1' for label in Dataset_PhishingWebsites['Result']
]
Dataset_PhishingWebsites = Dataset_PhishingWebsites.drop(columns=['Result'])
# Dataset_PhishingWebsites.info()
# proportion(Dataset_PhishingWebsites)

In [26]:

# Build the boolean Target column: rows whose 'binaryClass' is b'P' become True.
Dataset_arrhythmia['Target'] = [
    label == b'P' for label in Dataset_arrhythmia['binaryClass']
]
# Column 'J' is removed together with the original class column because
# (per the original author's note) 98% of its values are NaN.
Dataset_arrhythmia = Dataset_arrhythmia.drop(columns=['binaryClass', 'J'])
# Dataset_arrhythmia
# proportion(Dataset_arrhythmia)

In [92]:
## Author's note: select_nominals finds no nominal attributes in this dataset.
# The class column is already named 'Target'; it is overwritten in place with
# booleans (True where the label is b'Anomaly').
Dataset_Satellite['Target'] = [
    label == b'Anomaly' for label in Dataset_Satellite['Target']
]
# proportion(Dataset_Satellite)


In [None]:
## Dataset with many nominal attributes; to apply knn, get_dummies has to be
## used on AirportFrom and AirportTo.
# Build the boolean Target column: rows whose 'Delay' is b'1' become True.
Dataset_airlines['Target'] = [True if x == '1'.encode() else False for x in Dataset_airlines['Delay']]
# Drop the same column that was just read ('Delay'); the previous lowercase
# 'delay' does not match it and made drop() raise KeyError.
Dataset_airlines = Dataset_airlines.drop(['Delay'],axis=1)
# Dataset_airlines = get_dummies(Dataset_airlines,'AirportFrom')
# Dataset_airlines = get_dummies(Dataset_airlines,'AirportTo')
# proportion(Dataset_airlines)
# Dataset_airlines

In [95]:
# Build the boolean Target column: rows whose 'sex' is 'F' become True.
# The comparison uses a plain string, not bytes as in the other dataset cells.
Dataset_AedesSex['Target'] = [label == 'F' for label in Dataset_AedesSex['sex']]
Dataset_AedesSex = Dataset_AedesSex.drop(columns=['sex'])
# proportion(Dataset_AedesSex)
# Dataset_AedesSex


In [96]:
## Imbalanced dataset: check how the learning algorithms behave on it to decide
## whether balancing is needed.
# Build the boolean Target column: rows whose 'Class' is b'1' become True.
Dataset_phoneme['Target'] = [label == b'1' for label in Dataset_phoneme['Class']]
Dataset_phoneme = Dataset_phoneme.drop(columns=['Class'])
# proportion(Dataset_phoneme)
# Dataset_phoneme

In [97]:
## Dataset with many nominal columns; dummy variables have to be generated for
## them.
# Build the boolean Target column: rows whose 'class' is b'<=50K' become True.
Dataset_adult['Target'] = [label == b'<=50K' for label in Dataset_adult['class']]
Dataset_adult = Dataset_adult.drop(columns=['class'])
# Dataset_adult

Normalização de todos os datasets

In [35]:
# Normalization of each dataset. Every call below is currently commented out,
# so this cell defines none of the NOR_Dataset_* variables.
## NOR_Dataset_PhishingWebsites = normalize(Dataset_PhishingWebsites)
## NOR_Dataset_arrhythmia = normalize(Dataset_arrhythmia)
## NOR_Dataset_Satellite = normalize(Dataset_Satellite)
## NOR_Dataset_AedesSex = normalize(Dataset_AedesSex)
## NOR_Dataset_phoneme = normalize(Dataset_phoneme)

## These datasets need their dummy variables generated before they can be normalized
## #NOR_Dataset_airlines = normalize(Dataset_airlines)
## # NOR_Dataset_adult = normalize(Dataset_adult)
# NOTE(review): the expression below only displays Dataset_adult; the recorded
# output is a NameError because the line that loads Dataset_adult is commented
# out in the loading cell.
Dataset_adult



NameError: name 'Dataset_adult' is not defined

Aplicando o knn em todos os datasets para saber a acurácia.

In [34]:
# Run knn on the normalized datasets. Only the arrhythmia run is enabled; the
# trailing numbers on the commented calls are presumably accuracies (%) from
# earlier runs — TODO confirm.
# NOTE(review): NOR_Dataset_arrhythmia is not assigned by any enabled line in
# the cells above (its normalize call is commented out).
# knn(NOR_Dataset_PhishingWebsites) #93.8498643352427
knn(NOR_Dataset_arrhythmia)
# NOR_Dataset_arrhythmia=NOR_Dataset_arrhythmia.dropna()

# Display the normalized arrhythmia frame.
NOR_Dataset_arrhythmia
# knn(NOR_Dataset_Satellite) #99.2156862745098
# knn(NOR_Dataset_AedesSex) #98.73611111111111
# knn(NOR_Dataset_phoneme) #88.03945745992601

72.22222222222221


Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,...,chV6_QwaveAmp,chV6_RwaveAmp,chV6_SwaveAmp,chV6_RPwaveAmp,chV6_SPwaveAmp,chV6_PwaveAmp,chV6_TwaveAmp,chV6_QRSA,chV6_QRSTA,Target
0,0.903614,0.0,0.125926,0.435294,0.270677,0.368321,0.501805,0.241758,0.590244,0.457478,...,1.000000,0.381356,0.968531,0.0,0.0,0.53125,0.741667,0.507519,0.569579,False
1,0.674699,1.0,0.088889,0.341176,0.195489,0.332061,0.610108,0.150183,0.190244,0.577713,...,1.000000,0.360169,1.000000,0.0,0.0,0.31250,0.675000,0.485714,0.500971,False
2,0.650602,0.0,0.099259,0.523529,0.624060,0.311069,0.555957,0.282051,0.497561,0.785924,...,1.000000,0.402542,0.916084,0.0,0.0,0.34375,0.783333,0.424812,0.566990,False
3,0.662651,0.0,0.103704,0.517647,0.338346,0.385496,0.534296,0.260073,0.697561,0.586510,...,1.000000,0.516949,0.923077,0.0,0.0,0.37500,0.716667,0.592481,0.648544,True
5,0.156627,0.0,0.094815,0.264706,0.338346,0.318702,0.321300,0.241758,0.443902,0.818182,...,0.853659,0.516949,0.902098,0.0,0.0,0.53125,0.683333,0.433835,0.451133,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,0.638554,1.0,0.081481,0.376471,0.187970,0.379771,0.541516,0.168498,0.570732,0.395894,...,1.000000,0.182203,0.825175,0.0,0.0,0.46875,0.550000,0.299248,0.246602,True
448,0.445783,0.0,0.125926,0.464706,0.338346,0.261450,0.465704,0.340659,0.356098,0.756598,...,1.000000,0.661017,0.944056,0.0,0.0,0.37500,0.700000,0.618045,0.653722,False
449,0.433735,0.0,0.090370,0.364706,0.398496,0.335878,0.480144,0.315018,0.565854,0.255132,...,1.000000,0.690678,0.000000,0.0,0.0,0.71875,0.583333,0.000000,0.034951,False
450,0.385542,1.0,0.074074,0.288235,0.285714,0.202290,0.555957,0.402930,0.307317,0.662757,...,0.902439,0.508475,0.975524,0.0,0.0,0.40625,0.700000,0.520301,0.551456,True


In [125]:
# One-hot encode the two airport columns: AirportFrom first, then AirportTo.
Dataset_airlines = get_dummies(
    get_dummies(Dataset_airlines, 'AirportFrom'),
    'AirportTo',
)