# Capítulo 4 - Análise de grupos

## Bibliotecas básicas e outras inicializações

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Carregamento dos dados

In [2]:
# Column labels handed to read_csv through `names`, in file order.
nomes_colunas = [
    'animal', 'pelo', 'penas', 'oviparo', 'mamifero', 'alado',
    'aquatico', 'predador', 'dentes', 'colunas', 'respira_ar', 'venenoso',
    'barbatana', 'pernas', 'cauda', 'domestico', 'tamanho', 'tipo',
]

# Read the zoo data file; any '?' field is parsed as a missing value.
dados = pd.read_csv('./datasets/zoo.data', names=nomes_colunas, na_values='?')

# Report the frame's dimensions, then display its first rows.
print(f'Linhas: {dados.shape[0]} | Colunas: {dados.shape[1]}')
dados.head()

Linhas: 101 | Colunas: 18


Unnamed: 0,animal,pelo,penas,oviparo,mamifero,alado,aquatico,predador,dentes,colunas,respira_ar,venenoso,barbatana,pernas,cauda,domestico,tamanho,tipo
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [3]:
# Show the dtype pandas assigned to each column of `dados`.
dados.dtypes

animal        object
pelo           int64
penas          int64
oviparo        int64
mamifero       int64
alado          int64
aquatico       int64
predador       int64
dentes         int64
colunas        int64
respira_ar     int64
venenoso       int64
barbatana      int64
pernas         int64
cauda          int64
domestico      int64
tamanho        int64
tipo           int64
dtype: object

## Medidas de similaridade

### Distância Euclidiana

In [10]:
# Pairwise Euclidean distances between all rows of `dados`, computed over
# every column except 'animal'. `pdist` yields the condensed distance vector
# and `squareform` expands it into the full square matrix that is printed.
# NOTE(review): the 'tipo' column takes part in the distance -- presumably it
# is the class label; confirm it is meant to be included.
atributos = dados.drop(columns=['animal']).values
distancia = squareform(pdist(atributos, metric='euclidean'))
print(distancia)

[[0.         1.41421356 5.74456265 ... 1.         7.68114575 3.74165739]
 [1.41421356 0.         5.74456265 ... 1.         7.68114575 3.46410162]
 [5.74456265 5.74456265 0.         ... 5.65685425 4.         3.87298335]
 ...
 [1.         1.         5.65685425 ... 0.         7.74596669 3.60555128]
 [7.68114575 7.68114575 4.         ... 7.74596669 0.         5.74456265]
 [3.74165739 3.46410162 3.87298335 ... 3.60555128 5.74456265 0.        ]]


### Distância de Jaccard

In [12]:
# Pairwise Jaccard distances between all rows of `dados`, over every column
# except 'animal'; the condensed result is expanded to a square matrix.
# NOTE(review): 'pernas' and 'tipo' hold values other than 0/1 (see the
# dados.head() output). SciPy's handling of non-boolean input for 'jaccard'
# appears to differ between releases -- confirm the intended treatment.
atributos = dados.drop(columns=['animal']).values
condensada = pdist(atributos, metric='jaccard')
distancia = squareform(condensada)
print(distancia)

[[0.         0.2        0.76923077 ... 0.1        0.9        0.84615385]
 [0.2        0.         0.76923077 ... 0.1        0.9        0.75      ]
 [0.76923077 0.76923077 0.         ... 0.69230769 0.88888889 0.75      ]
 ...
 [0.1        0.1        0.69230769 ... 0.         0.90909091 0.76923077]
 [0.9        0.9        0.88888889 ... 0.90909091 0.         0.75      ]
 [0.84615385 0.75       0.75       ... 0.76923077 0.75       0.        ]]


### Distância de Hamming

In [13]:
# Pairwise Hamming distances between all rows of `dados`, over every column
# except 'animal'; the condensed result is expanded to a square matrix.
atributos = dados.drop(columns=['animal']).values
condensada = pdist(atributos, metric='hamming')
distancia = squareform(condensada)
print(distancia)

[[0.         0.11764706 0.58823529 ... 0.05882353 0.52941176 0.64705882]
 [0.11764706 0.         0.58823529 ... 0.05882353 0.52941176 0.52941176]
 [0.58823529 0.58823529 0.         ... 0.52941176 0.47058824 0.52941176]
 ...
 [0.05882353 0.05882353 0.52941176 ... 0.         0.58823529 0.58823529]
 [0.52941176 0.52941176 0.47058824 ... 0.58823529 0.         0.35294118]
 [0.64705882 0.52941176 0.52941176 ... 0.58823529 0.35294118 0.        ]]


### Distância de Matching

In [26]:
# Pairwise distances with the 'matching' metric between all rows of `dados`,
# over every column except 'animal'; expanded to a square matrix.
# NOTE(review): the recorded output is identical to the Hamming output, so
# 'matching' is presumably an alias of 'hamming' in SciPy -- confirm.
atributos = dados.drop(columns=['animal']).values
condensada = pdist(atributos, metric='matching')
distancia = squareform(condensada)
print(distancia)

[[0.         0.11764706 0.58823529 ... 0.05882353 0.52941176 0.64705882]
 [0.11764706 0.         0.58823529 ... 0.05882353 0.52941176 0.52941176]
 [0.58823529 0.58823529 0.         ... 0.52941176 0.47058824 0.52941176]
 ...
 [0.05882353 0.05882353 0.52941176 ... 0.         0.58823529 0.58823529]
 [0.52941176 0.52941176 0.47058824 ... 0.58823529 0.         0.35294118]
 [0.64705882 0.52941176 0.52941176 ... 0.58823529 0.35294118 0.        ]]
