# IDH Classification

Using the classification below, we use the ID3 tree decision to classificate the presented IDH

    0,800 – 1,000 (muito alto)
    0,700 – 0,799 (alto)
    0,555 – 0,699 (médio)
    0,350 – 0,554 (baixo)
    0,0 – 0,349 (muito baixo)


In [1]:
from sklearn import tree
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import random

columns=['PopEstimada_2018','PopCenso_2010','IDHM','ReceitasRealizadas_2014','DespesasEmpenhadas_2014','Pib_2014','LocalUF_AC','LocalUF_AL','LocalUF_AM','LocalUF_AP','LocalUF_BA','LocalUF_CE','LocalUF_DF','LocalUF_ES','LocalUF_GO','LocalUF_MA','LocalUF_MG','LocalUF_MS','LocalUF_MT','LocalUF_PA','LocalUF_PB','LocalUF_PE','LocalUF_PI','LocalUF_PR','LocalUF_RJ','LocalUF_RN','LocalUF_RO','LocalUF_RR','LocalUF_RS','LocalUF_SC','LocalUF_SE','LocalUF_SP','LocalUF_TO']

In [2]:
# Loading the formatted data
data = pd.read_csv (r'data/DATA_F.csv')
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,PopEstimada_2018,PopCenso_2010,IDHM,ReceitasRealizadas_2014,DespesasEmpenhadas_2014,Pib_2014,LocalUF_AC,LocalUF_AL,LocalUF_AM,LocalUF_AP,...,LocalUF_PR,LocalUF_RJ,LocalUF_RN,LocalUF_RO,LocalUF_RR,LocalUF_RS,LocalUF_SC,LocalUF_SE,LocalUF_SP,LocalUF_TO
0,23167,24392,3,53490,47577,16461.11,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,5438,6313,3,22635,19354,18346.17,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,84813,78574,4,169636,155473,20725.23,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,16444,17029,3,53147,46774,22179.17,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,16227,18591,3,34612,34381,15300.88,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4613,13746,12548,4,37777,33220,23237.24,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4614,8611,7371,3,26783,23511,39480.36,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4615,6026,4735,3,17295,15646,18992.37,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4616,5758,5145,3,19148,16785,35691.39,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Embaralhando dados
df = df.sample(frac = 1).reset_index(drop = True)

target = df.pop('IDHM')

# Representando as strings como inteiros 
target = target.astype('int')


# Treino 90% e teste 10%
qtd_treino = int(len(df) * 0.9)
x_treinamento = df[:qtd_treino]
x_teste = df[qtd_treino:]
y_treinamento = target[:qtd_treino]
y_teste = target[qtd_treino:]
df

Unnamed: 0,PopEstimada_2018,PopCenso_2010,ReceitasRealizadas_2014,DespesasEmpenhadas_2014,Pib_2014,LocalUF_AC,LocalUF_AL,LocalUF_AM,LocalUF_AP,LocalUF_BA,...,LocalUF_PR,LocalUF_RJ,LocalUF_RN,LocalUF_RO,LocalUF_RR,LocalUF_RS,LocalUF_SC,LocalUF_SE,LocalUF_SP,LocalUF_TO
0,40905,39255,93318,78665,21478.26,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4086,3878,11372,9902,6187.37,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3870,3522,12266,10935,8295.32,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3454,3510,13291,10778,12583.67,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,30451,26577,50810,49626,11737.29,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4613,14130,13226,34959,33140,12360.86,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4614,8365,7659,17847,16981,7190.09,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4615,16984,15776,43244,37890,21479.11,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4616,17775,14662,58995,45559,82101.14,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
# Árvore de decisão com o critério de entropia
clf = tree.DecisionTreeClassifier(criterion="entropy", class_weight='balanced')

In [5]:
# Treinamento
irisTree = clf.fit(x_treinamento, y_treinamento)

In [6]:
# Predição
y_pred = irisTree.predict(x_teste)

In [7]:
print(y_pred)
print(y_teste)

[3 3 3 4 3 4 4 3 4 4 2 4 3 2 3 3 3 3 3 3 2 3 3 4 3 4 4 3 3 3 3 3 4 4 4 2 4
 3 3 3 4 4 3 3 4 3 3 4 3 3 3 4 3 4 4 4 2 3 3 3 3 4 2 3 2 3 4 4 3 3 4 3 3 3
 3 4 4 3 4 4 3 3 3 4 4 4 3 4 4 3 3 3 3 4 4 2 3 3 3 3 4 3 4 4 3 2 3 3 3 4 4
 3 5 3 2 3 2 3 4 3 3 3 4 3 4 4 3 3 4 3 4 3 3 3 3 3 3 4 4 3 4 4 4 3 3 4 2 4
 3 3 3 4 3 4 3 3 4 4 2 3 2 3 3 3 4 3 4 3 3 4 4 3 2 3 3 3 4 3 4 3 4 3 4 4 4
 3 2 3 4 4 3 2 3 2 3 4 3 4 3 3 4 3 3 4 2 4 3 3 3 4 4 3 4 4 4 4 4 3 3 3 4 3
 4 3 2 3 4 4 3 4 3 3 4 4 2 3 4 3 3 4 3 4 4 4 3 3 4 4 4 4 3 4 4 5 3 3 4 4 4
 4 3 3 3 4 2 3 3 3 3 2 4 4 4 3 3 3 2 3 3 4 4 3 3 4 2 4 4 3 3 4 4 4 4 3 3 4
 3 3 3 3 3 3 3 4 4 4 3 3 3 3 4 3 3 4 3 3 3 4 3 4 4 2 4 4 4 4 4 4 3 3 3 3 4
 4 3 3 3 2 4 3 3 4 3 3 4 4 3 3 3 3 4 3 4 4 3 3 4 4 4 3 4 4 4 3 3 3 3 4 4 3
 4 3 3 4 3 4 4 2 3 3 3 2 3 3 3 3 2 2 3 3 3 3 3 3 3 3 4 3 3 4 3 4 3 3 3 4 3
 4 3 4 3 4 4 3 3 4 3 4 3 2 3 3 3 4 3 2 3 4 3 4 3 3 4 3 3 4 4 3 3 3 2 3 3 4
 4 4 3 4 3 4 3 4 3 4 2 3 3 4 2 4 3 2]
4156    3
4157    3
4158    3
4159    3
4160    3
       ..
46

In [8]:
MC = confusion_matrix(y_teste, y_pred)
print(MC)

[[ 16  23   0   0]
 [ 20 182  46   0]
 [  1  40 130   2]
 [  0   0   2   0]]


In [9]:
acuracia = (MC[0][0] + MC[1][1]) / np.sum(MC)
precisao = MC[0][0] / (np.sum(MC[0]))
recall = MC[0][0] / (np.sum(MC[:, 0]))

print('Acuracia:', acuracia)
print('Precisão: ', precisao)
print('Recall: ', recall)

Acuracia: 0.42857142857142855
Precisão:  0.41025641025641024
Recall:  0.43243243243243246


In [None]:
tree.plot_tree(clf, filled=True)