# Trabalho 1 - Nivelamento

Considere os datasets abaixo, estratégias de pré-processamento, medidas de avaliação, métodos de comparação estatística e os seguintes algoritmos de aprendizado de máquina: árvore de decisão, random forest e k-nearest neighbor. A partir disso, responda as seguintes perguntas:

1. Qual o algoritmo de AM mais adequado para cada problema?
2. Qual o algoritmo de AM mais adequado para todos os problemas?

Para responder a essas questões, construa um notebook no Colab ou em um ambiente similar. Documente de forma clara cada passo e justifique suas decisões.

In [11]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.patches as mpatches
import matplotlib as plt
from scipy.io import arff
import urllib.request
import pandas as pd
import numpy as np
import io

In [2]:
# Download URLs of the datasets used in this notebook.
# AedesSex_link points to a CSV file on GitHub (read later with pd.read_csv);
# the other links are OpenML downloads that are read later with loadDataset_Arff.
PhishingWebsites_link = "https://www.openml.org/data/download/1798106/phpV5QYya"
arrhythmia_link       = "https://www.openml.org/data/download/53551/arrhythmia.arff"
Satellite_link        = "https://www.openml.org/data/download/16787463/phpZrCzJR"
airlines_link         = "https://www.openml.org/data/download/66526/phpvcoG8S"
AedesSex_link         = "https://github.com/denismr/Classification-and-Counting-with-Recurrent-Contexts/raw/master/codeAndData/data/AedesSex.csv"
phoneme_link          = "https://www.openml.org/data/download/1592281/php8Mz7BG"
adult_link            = "https://www.openml.org/data/download/1595261/phpMawTba"

In [3]:
def loadDataset_Arff(url: str) -> pd.DataFrame:
    """Download an ARFF file and return its records as a DataFrame.

    Args:
        url: Location of the ARFF file; any URL accepted by
            ``urllib.request.urlopen``. The payload is decoded as UTF-8.

    Returns:
        A DataFrame with one row per ARFF record. Nominal attributes are
        returned as ``bytes`` values, which is how ``scipy.io.arff.loadarff``
        represents them.
    """
    # The context manager closes the response even when reading or
    # decoding fails; the original left the connection open.
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8')
    # loadarff returns (records, metadata); only the records are needed.
    records, _meta = arff.loadarff(io.StringIO(text))
    return pd.DataFrame(records)
  
def get_dummies(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """One-hot encode column *col* of *df* and return the result as a new frame.

    The generated columns are named ``<col>.<value>``.
    """
    encoded = pd.get_dummies(df, columns=[col], prefix=col, prefix_sep='.')
    return encoded.copy()

In [4]:
def select_nominals(dataset: pd.DataFrame) -> list:
    """
    Return the names of the columns whose dtype is ``object``.

    These are the columns the notebook treats as nominal (non-numeric).
    """
    return [name for name, dtype in dataset.dtypes.items() if dtype == "object"]

def is_binary_nominal(nominals: list) -> "tuple[bool, list[str]]":
    """
    Tell whether a sequence of nominal values has exactly two distinct values.

    Args:
        nominals: Iterable of values (for example a DataFrame column).

    Returns:
        A pair ``(is_binary, unique_values)`` where ``unique_values`` lists
        the distinct values in order of first appearance.
    """
    try:
        # dict.fromkeys de-duplicates in a single pass and keeps first-seen
        # order, avoiding a linear list scan for every element.
        unique_att = list(dict.fromkeys(nominals))
    except TypeError:
        # Unhashable values cannot be dict keys: fall back to the
        # equality-based scan.
        unique_att = []
        for n in nominals:
            if n not in unique_att:
                unique_att.append(n)
    return len(unique_att) == 2, unique_att

In [5]:
def convert_binary_nominal_to_numeric(dataset: pd.DataFrame, attribute: str) -> pd.DataFrame:
    """
    Replace the values of a nominal column with integer codes.

    Each distinct value receives a code (0, 1, ...) in order of first
    appearance. The function is meant for binary columns, but it does not
    itself check that only two distinct values exist.

    Args:
        dataset: Input frame; it is not modified.
        attribute: Name of the column to convert.

    Returns:
        A copy of ``dataset`` in which ``attribute`` holds the integer codes.
        Missing values (NaN/None) are left untouched.
    """
    new_dt = dataset.copy()
    column = new_dt[attribute]

    # Collect the distinct values, numbering them by first appearance.
    codes: dict = {}
    for value in column:
        codes.setdefault(value, len(codes))

    # Translate every cell in one pass. Replacing one value at a time over
    # the live column would re-match cells already rewritten when a raw value
    # equals an assigned code (e.g. [1, 0, 1] would end up all 1s), and would
    # leave the column with object dtype.
    new_dt[attribute] = [
        value if pd.isna(value) else codes[value] for value in column
    ]
    return new_dt

In [6]:
def convert_nominal_to_numeric(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Convert every nominal column of a dataset into numeric columns.

    Columns with exactly two distinct values are replaced by integer codes;
    all other nominal columns are one-hot encoded. The input is not modified.
    """
    converted = dataset.copy()
    for column in select_nominals(dataset):
        is_binary, _ = is_binary_nominal(converted[column])
        # Both converters take (frame, column name) and return a new frame.
        converter = convert_binary_nominal_to_numeric if is_binary else get_dummies
        converted = converter(converted, column)
    return converted

In [7]:
def proportion(Dataset, target="Target"):
    """Print the counts of classes 0 and 1 of *target* and plot all counts as bars."""
    class_counts = Dataset[target].value_counts()
    print('Class 0:', class_counts[0])
    print('Class 1:', class_counts[1])
    class_counts.plot(kind='bar', title='Count (target)')
  
def normalize(dataset, target='Target') -> pd.DataFrame:
    """Min-max scale every predictive column, leaving the target as is.

    Args:
        dataset: Frame holding the predictive columns plus the target column.
        target: Name of the target column.

    Returns:
        A new frame with the scaled predictive columns followed by the
        unchanged target column, on the same index as ``dataset``.
    """
    # Split predictive columns from the target values.
    features = dataset.drop([target], axis=1)
    labels = dataset[target]

    scaler = MinMaxScaler()
    # Keep the original index: the column assignment below aligns on index
    # labels, so a fresh RangeIndex would pair rows with the wrong labels (or
    # NaN) whenever the input index has gaps, e.g. after dropna().
    scaled = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    scaled[target] = labels
    return scaled

In [8]:
def knn(dataset, n_neighbors=3, target='Target'):
    """Train a k-nearest-neighbours classifier and print its hold-out accuracy.

    The data is split at random into 70% training and 30% test rows; no seed
    is fixed, so the printed value changes between runs.

    Args:
        dataset: Frame holding the predictive columns plus the target column.
        n_neighbors: Number of neighbours used by the classifier.
        target: Name of the target column.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.drop([target], axis=1), dataset[target], test_size=0.3
    )
    # Honour the caller's n_neighbors; it used to be hard-coded to 3.
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(accuracy_score(y_test, y_pred) * 100)

def treeDecision(dataset, target='Target'):
    """Train a decision tree on a random 70/30 split and print the test accuracy (%)."""
    predictors = dataset.drop([target], axis=1)
    classes = dataset[target]
    train_x, test_x, train_y, test_y = train_test_split(predictors, classes, test_size=0.3)

    classifier = DecisionTreeClassifier()
    classifier.fit(train_x, train_y)
    accuracy = accuracy_score(test_y, classifier.predict(test_x))
    print(accuracy*100)

def RForestDecision(dataset, target='Target'):
    """Train a random forest on a random 70/30 split and print the test accuracy (%)."""
    predictors = dataset.drop([target], axis=1)
    classes = dataset[target]
    train_x, test_x, train_y, test_y = train_test_split(predictors, classes, test_size=0.3)

    forest = RandomForestClassifier()
    forest.fit(train_x, train_y)
    accuracy = accuracy_score(test_y, forest.predict(test_x))
    print(accuracy*100)
  

In [None]:
# from sklearn.model_selection import cross_val_score

# y = df_balanced['Target']
# x = df_balanced.drop(['Target'],axis=1)
# classificador = KNeighborsClassifier(n_neighbors=5,metric='euclidean')
# cv_result_enhanced = cross_val_score(classificador, x, y, cv=20, scoring="accuracy")
# print("Acurácia com cross validation:", cv_result_enhanced.mean()*100)

In [12]:
def TTS_knn(dataset, x, y, n_neighbors=3):
    """Return the 20-fold cross-validation accuracies of a k-NN classifier.

    Args:
        dataset: Unused; kept so existing calls keep working.
        x: Predictive columns.
        y: Target values.
        n_neighbors: Number of neighbours used by the classifier.

    Returns:
        The array of per-fold accuracy scores from ``cross_val_score``.
    """
    # Honour the caller's n_neighbors; it used to be hard-coded to 3.
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    return cross_val_score(model, x, y, cv=20, scoring="accuracy")

def TTS_treeDecision(dataset, x, y):
    """Return the 20-fold cross-validation accuracies of a decision tree.

    ``dataset`` is accepted but not used; only ``x`` and ``y`` are evaluated.
    """
    return cross_val_score(DecisionTreeClassifier(), x, y, cv=20, scoring="accuracy")

def TTS_RForestDecision(dataset, x, y):
    """Return the 20-fold cross-validation accuracies of a random forest.

    ``dataset`` is accepted but not used; only ``x`` and ``y`` are evaluated.
    """
    return cross_val_score(RandomForestClassifier(), x, y, cv=20, scoring="accuracy")

# Lendo todos os datasets

In [10]:
# Download every dataset into its own DataFrame. All of them are fetched as
# ARFF through loadDataset_Arff except AedesSex, which is a CSV file.
Dataset_PhishingWebsites = loadDataset_Arff(PhishingWebsites_link)
Dataset_arrhythmia = loadDataset_Arff(arrhythmia_link)
Dataset_Satellite = loadDataset_Arff(Satellite_link)
# The airlines download is disabled, so Dataset_airlines is never defined here.
# Dataset_airlines = loadDataset_Arff(airlines_link)
Dataset_AedesSex = pd.read_csv(AedesSex_link)
Dataset_phoneme = loadDataset_Arff(phoneme_link)
Dataset_adult = loadDataset_Arff(adult_link)

Abaixo será feita a conversão das classes-alvo para True e False, e a coluna de classe será renomeada para Target nos datasets em que ela tem outro nome. Também será feita uma análise de distribuição e a procura por dados nominais.

In [13]:
# Boolean target: True for the rows whose 'Result' value is b'-1'.
Dataset_PhishingWebsites['Target'] = [
    result == b'-1' for result in Dataset_PhishingWebsites['Result']
]
# The raw label column is no longer needed once Target exists.
Dataset_PhishingWebsites = Dataset_PhishingWebsites.drop(columns=['Result'])

In [14]:

# Boolean target: True for the rows whose 'binaryClass' value is b'P'.
Dataset_arrhythmia['Target'] = [
    label == b'P' for label in Dataset_arrhythmia['binaryClass']
]
# Column J is removed because 98% of its values are NaN (author's note);
# afterwards every row that still has a missing value is discarded.
Dataset_arrhythmia = Dataset_arrhythmia.drop(columns=['binaryClass', 'J']).dropna()

In [15]:
## No nominal columns were found with select_nominals.
# The existing Target column is overwritten in place: True where it was b'Anomaly'.
Dataset_Satellite['Target'] = [
    label == b'Anomaly' for label in Dataset_Satellite['Target']
]

In [None]:
## Dataset with many nominal values; to apply knn, get_dummies has to be used on AirportFrom and AirportTo.
# NOTE(review): this cell needs Dataset_airlines, but the line that loads it is
# commented out in the loading cell, so running it as-is raises NameError.
# Boolean target: True for the rows whose 'Delay' value is b'1'.
Dataset_airlines['Target'] = [True if x == '1'.encode() else False for x in Dataset_airlines['Delay']]
Dataset_airlines = Dataset_airlines.drop(['Delay'],axis=1)
# Decode the bytes columns to str (DayOfWeek additionally to int).
# NOTE(review): 'Airline' is decoded but not one-hot encoded below, so it
# stays a string column.
Dataset_airlines['Airline'] = [x.decode('UTF-8') for x in Dataset_airlines['Airline']]
Dataset_airlines['DayOfWeek'] = [int(x.decode('UTF-8')) for x in Dataset_airlines['DayOfWeek']]
Dataset_airlines['AirportFrom'] = [x.decode('UTF-8') for x in Dataset_airlines['AirportFrom']]
Dataset_airlines['AirportTo'] = [x.decode('UTF-8') for x in Dataset_airlines['AirportTo']]

# One-hot encode the two airport columns.
Dataset_airlines = get_dummies(Dataset_airlines,'AirportFrom')
Dataset_airlines = get_dummies(Dataset_airlines,'AirportTo')

In [16]:
# Boolean target: True for the rows whose 'sex' value is 'F'.
Dataset_AedesSex['Target'] = [sex == 'F' for sex in Dataset_AedesSex['sex']]
# The raw label column is no longer needed once Target exists.
Dataset_AedesSex = Dataset_AedesSex.drop(columns=['sex'])

In [17]:
## Imbalanced dataset: check how the learning algorithms behave on it to decide whether balancing is needed.
# Boolean target: True for the rows whose 'Class' value is b'1'.
Dataset_phoneme['Target'] = [
    label == b'1' for label in Dataset_phoneme['Class']
]
# The raw label column is no longer needed once Target exists.
Dataset_phoneme = Dataset_phoneme.drop(columns=['Class'])

In [23]:
## Dataset with many nominal columns; each of them is decoded and one-hot encoded below.
_adult_nominal_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Build the boolean Target (True where 'class' is b'<=50K') and drop the raw
# label. The guard makes the cell safe to re-run once 'class' has been
# removed; previously these lines had to be commented out by hand, which left
# a fresh run without any Target column.
if 'class' in Dataset_adult.columns:
    Dataset_adult['Target'] = [x == b'<=50K' for x in Dataset_adult['class']]
    Dataset_adult = Dataset_adult.drop(['class'], axis=1)

# Only columns that are still present need processing; after a previous run
# they have already been replaced by their dummy columns.
_pending = [c for c in _adult_nominal_columns if c in Dataset_adult.columns]

# Decode the bytes values to str so the dummy column names are readable.
for _column in _pending:
    Dataset_adult[_column] = [x.decode('UTF-8') for x in Dataset_adult[_column]]

Dataset_adult = Dataset_adult.dropna()  # Drop the rows with missing values

# One-hot encode each nominal column, in the same order as before.
for _column in _pending:
    Dataset_adult = get_dummies(Dataset_adult, _column)
Dataset_adult

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,Target,workclass.?,workclass.Federal-gov,workclass.Local-gov,...,native-country.Portugal,native-country.Puerto-Rico,native-country.Scotland,native-country.South,native-country.Taiwan,native-country.Thailand,native-country.Trinadad&Tobago,native-country.United-States,native-country.Vietnam,native-country.Yugoslavia
0,25.0,226802.0,7.0,0.0,0.0,40.0,True,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38.0,89814.0,9.0,0.0,0.0,50.0,True,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28.0,336951.0,12.0,0.0,0.0,40.0,False,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,44.0,160323.0,10.0,7688.0,0.0,40.0,False,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,18.0,103497.0,10.0,0.0,0.0,30.0,True,1,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27.0,257302.0,12.0,0.0,0.0,38.0,True,0,0,0,...,0,0,0,0,0,0,0,1,0,0
48838,40.0,154374.0,9.0,0.0,0.0,40.0,False,0,0,0,...,0,0,0,0,0,0,0,1,0,0
48839,58.0,151910.0,9.0,0.0,0.0,40.0,True,0,0,0,...,0,0,0,0,0,0,0,1,0,0
48840,22.0,201490.0,9.0,0.0,0.0,20.0,True,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Normalização de todos os datasets

In [24]:
# Min-max normalize the predictive columns of each prepared dataset.
NOR_Dataset_PhishingWebsites = normalize(Dataset_PhishingWebsites)
NOR_Dataset_arrhythmia = normalize(Dataset_arrhythmia)
NOR_Dataset_Satellite = normalize(Dataset_Satellite)
NOR_Dataset_AedesSex = normalize(Dataset_AedesSex)
NOR_Dataset_phoneme = normalize(Dataset_phoneme)
## #NOR_Dataset_airlines = normalize(Dataset_airlines) # Disabled: this dataset blows up the memory.

## The dummy columns must be generated before this dataset can be normalized.
## # NOR_Dataset_adult = normalize(Dataset_adult)



Aplicando o KNN, a árvore de decisão e a random forest nos datasets normalizados para obter a acurácia de cada um.

In [26]:
# Hold-out accuracy (%) of each algorithm on each normalized dataset.
# The number after each call is the value printed in an earlier run; the
# train/test split is random, so a new run prints slightly different values.
# The arrhythmia runs are commented out.

# k-nearest neighbours
knn(NOR_Dataset_PhishingWebsites) #93.8498643352427
# knn(NOR_Dataset_arrhythmia) 
knn(NOR_Dataset_Satellite) #99.2156862745098
knn(NOR_Dataset_AedesSex) #98.73611111111111
knn(NOR_Dataset_phoneme) #88.03945745992601


# Decision tree
treeDecision(NOR_Dataset_PhishingWebsites)#96.02050045221586
# treeDecision(NOR_Dataset_arrhythmia)
treeDecision(NOR_Dataset_Satellite)#99.08496732026144
treeDecision(NOR_Dataset_AedesSex)#98.19444444444444
treeDecision(NOR_Dataset_phoneme)#87.05302096177559


# Random forest
RForestDecision(NOR_Dataset_PhishingWebsites)#96.80434127223396
# RForestDecision(NOR_Dataset_arrhythmia)
RForestDecision(NOR_Dataset_Satellite)#99.281045751634
RForestDecision(NOR_Dataset_AedesSex)#98.38888888888889
RForestDecision(NOR_Dataset_phoneme)#90.9987669543773



94.30208019294544
98.95424836601308
98.54166666666667
87.6078914919852
95.38739825143202
98.82352941176471
98.08333333333333
86.68310727496917
97.01537533916189
99.34640522875817
98.58333333333333
91.49198520345253


In [30]:
# Split the normalized PhishingWebsites data into predictors (x) and target (y)
# for the cross-validation helpers.
NOR_Dataset_PhishingWebsites_x = NOR_Dataset_PhishingWebsites.drop(columns=['Target'])
NOR_Dataset_PhishingWebsites_y = NOR_Dataset_PhishingWebsites['Target']

In [35]:

# 20-fold cross-validation on PhishingWebsites. Only the random forest run is
# active; its per-fold accuracies are shown as the cell output.
# TTS_knn(NOR_Dataset_PhishingWebsites,NOR_Dataset_PhishingWebsites_x,NOR_Dataset_PhishingWebsites_y)
# TTS_treeDecision(NOR_Dataset_PhishingWebsites,NOR_Dataset_PhishingWebsites_x,NOR_Dataset_PhishingWebsites_y)
TTS_RForestDecision(NOR_Dataset_PhishingWebsites,NOR_Dataset_PhishingWebsites_x,NOR_Dataset_PhishingWebsites_y)

array([0.9801085 , 0.98915009, 0.9801085 , 0.98553345, 0.96383363,
       0.98372514, 0.98553345, 0.98191682, 0.98372514, 0.98191682,
       0.960217  , 0.97830018, 0.96202532, 0.97468354, 0.96925859,
       0.94202899, 0.9692029 , 0.94746377, 0.95289855, 0.97282609])