# Algoritmo de selección de atributos basado en entropía
---
## Algoritmo para atributos de tipo categórico

In [1]:
import pandas as pd
import numpy as np
import math as mt

In [2]:
frame_test = pd.DataFrame({'x1':[1,0,0,1,0],
                          'x2':['A', 'B', 'C', 'C', 'A'],
                          'x3':['Alto', 'Bajo', 'Alto', 'Bajo', 'Bajo'],
                          'x4':['A', 'A', 'B', 'B', 'A']})
frame_test

Unnamed: 0,x1,x2,x3,x4
0,1,A,Alto,A
1,0,B,Bajo,A
2,0,C,Alto,B
3,1,C,Bajo,B
4,0,A,Bajo,A


Se realiza la matriz de similitudes

In [3]:
similarity_matrix = np.zeros((len(frame_test),len(frame_test)))
similarity_matrix

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

Esta es la instrucción para comparar una fila con otra

In [4]:
((frame_test.iloc[0,:] == frame_test.iloc[1,:]).sum())/len(frame_test.iloc[0,:])

0.25

In [5]:
# Fill the strict upper triangle with the fraction of attributes on which
# rows i and j of frame_test hold the same value.
n_rows = len(similarity_matrix)
for i in range(n_rows):
    row_i = frame_test.iloc[i, :]
    for j in range(i + 1, n_rows):
        matches = (row_i == frame_test.iloc[j, :]).sum()
        similarity_matrix[i][j] = matches / len(row_i)

In [6]:
similarity_matrix

array([[0.  , 0.25, 0.25, 0.25, 0.5 ],
       [0.  , 0.  , 0.25, 0.25, 0.75],
       [0.  , 0.  , 0.  , 0.5 , 0.25],
       [0.  , 0.  , 0.  , 0.  , 0.25],
       [0.  , 0.  , 0.  , 0.  , 0.  ]])

Se calcula la entropía de cada elemento

In [7]:
n = similarity_matrix[0][1]
result = n * np.log2(n) + (1 - n) * np.log2(1 - n)
result

-0.8112781244591328

In [8]:
entropy_matrix = np.zeros((len(frame_test),len(frame_test)))

In [9]:
# Pairwise entropy s*log2(s) + (1-s)*log2(1-s) for every pair in the strict
# upper triangle of the similarity matrix.
n_rows = len(similarity_matrix)
for i in range(n_rows):
    for j in range(i + 1, n_rows):
        n = similarity_matrix[i][j]
        # A similarity of exactly 0 or 1 is left at zero entropy: evaluating the
        # formula there gives 0 * log2(0) = NaN, which would poison the total.
        if n != 0 and n != 1:
            entropy_matrix[i][j] = n * np.log2(n) + (1 - n) * np.log2(1 - n)

In [10]:
entropy_matrix

array([[ 0.        , -0.81127812, -0.81127812, -0.81127812, -1.        ],
       [ 0.        ,  0.        , -0.81127812, -0.81127812, -0.81127812],
       [ 0.        ,  0.        ,  0.        , -1.        , -0.81127812],
       [ 0.        ,  0.        ,  0.        ,  0.        , -0.81127812],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [11]:
entropy_matrix.sum()

-8.490224995673064

La entropía total es

In [12]:
total_entropy = entropy_matrix.sum() * -1
total_entropy

8.490224995673064

Cálculo de la entropía sin uno de los atributos

In [13]:
frame_test_a = frame_test.drop(frame_test.columns[1], axis = 'columns').copy()

In [14]:
frame_test_a

Unnamed: 0,x1,x3,x4
0,1,Alto,A
1,0,Bajo,A
2,0,Alto,B
3,1,Bajo,B
4,0,Bajo,A


In [15]:
similarity_matrix = np.zeros((len(frame_test_a),len(frame_test_a)))
similarity_matrix

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [16]:
# Same pairwise matching ratio as before, now over frame_test_a
# (the data set with one attribute removed).
n_rows = len(similarity_matrix)
for i in range(n_rows):
    current_row = frame_test_a.iloc[i, :]
    for j in range(i + 1, n_rows):
        equal_count = (current_row == frame_test_a.iloc[j, :]).sum()
        similarity_matrix[i][j] = equal_count / len(current_row)

In [17]:
similarity_matrix

array([[0.        , 0.33333333, 0.33333333, 0.33333333, 0.33333333],
       [0.        , 0.        , 0.33333333, 0.33333333, 1.        ],
       [0.        , 0.        , 0.        , 0.33333333, 0.33333333],
       [0.        , 0.        , 0.        , 0.        , 0.33333333],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [18]:
entropy_matrix = np.zeros((len(frame_test_a),len(frame_test_a)))

In [19]:
# Pairwise entropy over the strict upper triangle; similarities of exactly
# 0 or 1 are skipped so their cells stay at zero.
n_rows = len(similarity_matrix)
for i in range(n_rows):
    for j in range(i + 1, n_rows):
        n = similarity_matrix[i][j]
        if n == 0 or n == 1:
            continue
        entropy_matrix[i][j] = n * np.log2(n) + (1 - n) * np.log2(1 - n)

In [20]:
entropy_matrix

array([[ 0.        , -0.91829583, -0.91829583, -0.91829583, -0.91829583],
       [ 0.        ,  0.        , -0.91829583, -0.91829583,  0.        ],
       [ 0.        ,  0.        ,  0.        , -0.91829583, -0.91829583],
       [ 0.        ,  0.        ,  0.        ,  0.        , -0.91829583],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [21]:
total_entropy = entropy_matrix.sum() * -1
total_entropy

8.264662506490406

In [22]:
def select_attribute(df):
    """Return the total pairwise entropy of a data set with categorical attributes.

    Despite its name, this function does not pick an attribute: it only computes
    the entropy measure that the caller compares across candidate attribute
    subsets.

    For every pair of rows (i, j) with j > i the similarity is the fraction of
    columns on which both rows hold the same value. Each similarity ``s`` that
    is neither 0 nor 1 contributes ``s*log2(s) + (1-s)*log2(1-s)``; the result
    is the negated sum of those contributions.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per instance, one column per attribute.

    Returns
    -------
    float
        Total entropy. ``0.0`` for a frame without rows (the previous
        implementation raised ``IndexError`` in that case).
    """
    n_rows = len(df)
    if n_rows == 0:
        return 0.0

    entropy_matrix = np.zeros((n_rows, n_rows))
    for i in range(n_rows):
        row_i = df.iloc[i, :]
        for j in range(i + 1, n_rows):
            n = (row_i == df.iloc[j, :]).sum() / len(row_i)
            # 0 and 1 are skipped: the formula would evaluate 0 * log2(0) = NaN.
            if n != 0 and n != 1:
                entropy_matrix[i][j] = n * np.log2(n) + (1 - n) * np.log2(1 - n)

    total_entropy = entropy_matrix.sum() * -1
    return total_entropy

In [23]:
# Entropy of the full data set: computed once, kept unrounded in the summary
# and rounded for the later comparison against the per-attribute values.
entropia_general = select_attribute(frame_test)
entropia_total = round(entropia_general, 3)

# For every attribute: entropy of the data set without it, and the number of
# distinct values the attribute takes.
resumen = {'Entropía general': [entropia_general, 0]}
entropias = []
numerosidades = []
for i in frame_test.columns:
    # drop() already returns a new frame, so no extra copy is needed.
    entropia = round(select_attribute(frame_test.drop(i, axis=1)), 3)
    numerosidad = frame_test[i].value_counts().count()
    entropias.append(entropia)
    numerosidades.append(numerosidad)
    resumen['Entropía sin ' + i] = [entropia, numerosidad]

In [24]:
resumen

{'Entropía general': [8.490224995673064, 0],
 'Entropía sin x1': [6.428, 2],
 'Entropía sin x2': [8.265, 3],
 'Entropía sin x3': [6.428, 2],
 'Entropía sin x4': [8.265, 2]}

In [25]:
entropias = np.array(entropias)
entropias

array([6.428, 8.265, 6.428, 8.265])

In [26]:
a_restar = np.ones(len(entropias))
a_restar

array([1., 1., 1., 1.])

In [27]:
a_restar *= entropia_total

In [28]:
restas = np.absolute(a_restar - entropias)
restas

array([2.062, 0.225, 2.062, 0.225])

In [29]:
minima_entropia = restas.min()
minima_entropia

0.22499999999999964

In [30]:
indexes = np.where(restas == minima_entropia)
indexes

(array([1, 3], dtype=int64),)

In [31]:
attrs = frame_test.columns[indexes]

In [32]:
attrs = list(attrs)
attrs

['x2', 'x4']

In [33]:
numerosidades = np.array(numerosidades)

In [34]:
numerosidades_a_comparar = numerosidades[indexes]
numerosidades_a_comparar

array([3, 2])

In [35]:
max_num = numerosidades_a_comparar.max()
max_num

3

In [36]:
index_to_erase = np.where(numerosidades_a_comparar == max_num)
index_to_erase

(array([0], dtype=int64),)

In [37]:
attr_to_erase = attrs[index_to_erase[0][0]]
attr_to_erase

'x2'

In [38]:
frame_test.drop(attr_to_erase, axis=1)

Unnamed: 0,x1,x3,x4
0,1,Alto,A
1,0,Bajo,A
2,0,Alto,B
3,1,Bajo,B
4,0,Bajo,A


In [39]:
min_numerosidad = min(numerosidades)

In [40]:
# NOTE(review): this drops every attribute tied for the smallest entropy change
# (`attrs`), not only `attr_to_erase` chosen by the distinct-value tie-break in
# the previous cells -- confirm which one is intended.
# Rebinding `frame_test` also makes this cell non-idempotent: running it a second
# time raises KeyError because the columns are already gone.
frame_test = frame_test.drop(attrs, axis=1).copy()

In [41]:
frame_test

Unnamed: 0,x1,x3
0,1,Alto
1,0,Bajo
2,0,Alto
3,1,Bajo
4,0,Bajo


## Algoritmo para atributos de tipo numérico

In [42]:
# NOTE(review): needs 'ejercicio2.csv' in the working directory. The next cell
# rebinds `df_test` to an inline DataFrame literal, so this load appears to be
# unused -- confirm before removing it.
df_test = pd.read_csv('ejercicio2.csv', sep=',')

In [83]:
df_test = pd.DataFrame({'x1':[25, 1, 3, 7, 8],
          'x2': [18, 8, 3, 5, 12],
          'x3': [12, 0, 4, 12, 25],
          'x4': [1738, 858, 93, 275, 132],
          'x5': [393, 278, 305, 401, 168],
          'x6': [215, 115, 101, 128, 23],
          'x7': [0, 13, 0, 0, 0],
          'x8': [67, 28, 74, 45, 122],
          'x9': [44, 13, 48, 17, 21],
          'x10': [4.6, 5, 1, 1, 3],
          'x11': [7.6, 33.8, 6.5, 0, 0]})
df_test

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11
0,25,18,12,1738,393,215,0,67,44,4.6,7.6
1,1,8,0,858,278,115,13,28,13,5.0,33.8
2,3,3,4,93,305,101,0,74,48,1.0,6.5
3,7,5,12,275,401,128,0,45,17,1.0,0.0
4,8,12,25,132,168,23,0,122,21,3.0,0.0


In [45]:
df_test

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11
0,25,18,12,1738,393,215,0,67,44,4.6,7.6
1,1,8,0,858,278,115,13,28,13,5.0,33.8
2,3,3,4,93,305,101,0,74,48,1.0,6.5
3,7,5,12,275,401,128,0,45,17,1.0,0.0
4,8,12,25,132,168,23,0,122,21,3.0,0.0


Tomado como numpy array...

In [46]:
np_array = df_test.values

In [47]:
np_array

array([[2.500e+01, 1.800e+01, 1.200e+01, 1.738e+03, 3.930e+02, 2.150e+02,
        0.000e+00, 6.700e+01, 4.400e+01, 4.600e+00, 7.600e+00],
       [1.000e+00, 8.000e+00, 0.000e+00, 8.580e+02, 2.780e+02, 1.150e+02,
        1.300e+01, 2.800e+01, 1.300e+01, 5.000e+00, 3.380e+01],
       [3.000e+00, 3.000e+00, 4.000e+00, 9.300e+01, 3.050e+02, 1.010e+02,
        0.000e+00, 7.400e+01, 4.800e+01, 1.000e+00, 6.500e+00],
       [7.000e+00, 5.000e+00, 1.200e+01, 2.750e+02, 4.010e+02, 1.280e+02,
        0.000e+00, 4.500e+01, 1.700e+01, 1.000e+00, 0.000e+00],
       [8.000e+00, 1.200e+01, 2.500e+01, 1.320e+02, 1.680e+02, 2.300e+01,
        0.000e+00, 1.220e+02, 2.100e+01, 3.000e+00, 0.000e+00]])

Cálculo del mínimo y el máximo de cada atributo

In [48]:
np_array[:,0].max()

25.0

In [49]:
# Per-attribute maximum and minimum, taken column by column from the array.
column_ids = range(len(np_array[0, :]))
max_values = [np_array[:, k].max() for k in column_ids]
min_values = [np_array[:, k].min() for k in column_ids]

In [50]:
max_values

[25.0, 18.0, 25.0, 1738.0, 401.0, 215.0, 13.0, 122.0, 48.0, 5.0, 33.8]

In [51]:
min_values

[1.0, 3.0, 0.0, 93.0, 168.0, 23.0, 0.0, 28.0, 13.0, 1.0, 0.0]

In [52]:
max_values = df_test.max()
min_values = df_test.min()

In [53]:
max_values

x1       25.0
x2       18.0
x3       25.0
x4     1738.0
x5      401.0
x6      215.0
x7       13.0
x8      122.0
x9       48.0
x10       5.0
x11      33.8
dtype: float64

In [54]:
min_values

x1       1.0
x2       3.0
x3       0.0
x4      93.0
x5     168.0
x6      23.0
x7       0.0
x8      28.0
x9      13.0
x10      1.0
x11      0.0
dtype: float64

Cálculo de la distancia Euclídea

In [55]:
distance_matrix = np.zeros((df_test.shape[0],df_test.shape[0]))
distance_matrix

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [56]:
df_test.iloc[0,:]

x1       25.0
x2       18.0
x3       12.0
x4     1738.0
x5      393.0
x6      215.0
x7        0.0
x8       67.0
x9       44.0
x10       4.6
x11       7.6
Name: 0, dtype: float64

In [57]:
((((np_array[0,:] - np_array[1,:]) / max_values) ** 2).sum()) ** (1/2)

2.0352233516439666

Cálculo de la distancia para un solo elemento

In [58]:
result = mt.sqrt((((df_test.iloc[0,:] - df_test.iloc[1, :])/ (max_values - min_values)) ** 2).sum())
result

2.245745426633766

In [59]:
delta_values = max_values.values - min_values.values

In [60]:
delta_values

array([  24. ,   15. ,   25. , 1645. ,  233. ,  192. ,   13. ,   94. ,
         35. ,    4. ,   33.8])

In [61]:
df_test.iloc[0,:].values - df_test.iloc[1,:].values

array([ 2.40e+01,  1.00e+01,  1.20e+01,  8.80e+02,  1.15e+02,  1.00e+02,
       -1.30e+01,  3.90e+01,  3.10e+01, -4.00e-01, -2.62e+01])

In [62]:
# Euclidean distance between every pair of rows (strict upper triangle); each
# attribute difference is divided by that attribute's maximum before squaring.
n_rows = len(distance_matrix[0])
for i in range(n_rows):
    for j in range(i + 1, n_rows):
        scaled_diff = (np_array[i,:] - np_array[j,:]) / max_values
        distance_matrix[i][j] = ((scaled_diff ** 2).sum()) ** (1/2)
distance_matrix

array([[0.        , 2.03522335, 1.82375693, 1.68244129, 1.84279648],
       [0.        , 0.        , 1.81049522, 1.78672951, 2.08376151],
       [0.        , 0.        , 0.        , 0.85731085, 1.38297799],
       [0.        , 0.        , 0.        , 0.        , 1.25356266],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

Cálculo de la similitud

In [63]:
similarity_matrix = np.zeros((df_test.shape[0],df_test.shape[0]))

In [64]:
# Turn each pairwise distance d into a similarity exp(-0.5 * d).
n_rows = len(similarity_matrix)
for i in range(n_rows):
    for j in range(i + 1, n_rows):
        pair_distance = distance_matrix[i][j]
        similarity_matrix[i][j] = np.exp(-0.5 * pair_distance)
similarity_matrix

array([[0.        , 0.36145719, 0.40176881, 0.43118388, 0.3979622 ],
       [0.        , 0.        , 0.40444173, 0.40927632, 0.35279054],
       [0.        , 0.        , 0.        , 0.65138434, 0.50082978],
       [0.        , 0.        , 0.        , 0.        , 0.5343088 ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

Cálculo de la entropía

In [65]:
entropy_matrix = np.zeros((df_test.shape[0],df_test.shape[0]))

In [66]:
# Pairwise entropy of the similarities; values of exactly 0 or 1 are skipped
# and keep a zero entry.
n_rows = len(similarity_matrix)
for i in range(n_rows):
    for j in range(i + 1, n_rows):
        n = similarity_matrix[i][j]
        if n == 0 or n == 1:
            continue
        entropy_matrix[i][j] = n * np.log2(n) + (1 - n) * np.log2(1 - n)
entropy_matrix

array([[ 0.        , -0.94388612, -0.97197588, -0.98629231, -0.96974607],
       [ 0.        ,  0.        , -0.97348961, -0.97611891, -0.93653559],
       [ 0.        ,  0.        ,  0.        , -0.93282564, -0.99999801],
       [ 0.        ,  0.        ,  0.        ,  0.        , -0.99660096],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [67]:
total_entropy = entropy_matrix.sum() * -1
total_entropy

9.687469097402092

## Ahora se calcula la entropía para un subconjunto de atributos del dataset original

In [68]:
df_test_a = df_test.drop(df_test.columns[10], axis = 'columns').copy()
df_test_a

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,25,18,12,1738,393,215,0,67,44,4.6
1,1,8,0,858,278,115,13,28,13,5.0
2,3,3,4,93,305,101,0,74,48,1.0
3,7,5,12,275,401,128,0,45,17,1.0
4,8,12,25,132,168,23,0,122,21,3.0


In [69]:
max_values = df_test_a.max()
min_values = df_test_a.min()

In [70]:
distance_matrix = np.zeros((df_test_a.shape[0],df_test_a.shape[0]))
distance_matrix

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [71]:
# Euclidean distance between every pair of rows of df_test_a (strict upper triangle).
# NOTE(review): differences are scaled here by the attribute range (max - min),
# whereas the earlier cell that fills distance_matrix for the full df_test divides
# by max_values only. `total_entropy` and `parcial_entropy` are therefore not
# computed on the same scale -- confirm which normalisation is intended.
for i in range(0, len(distance_matrix[0])):
    for j in range(0, len(distance_matrix)):
        if(j > i):
            distance_matrix[i][j] = mt.sqrt((((df_test_a.iloc[i,:] - df_test_a.iloc[j, :])/ (max_values - min_values)) ** 2).sum())
distance_matrix

array([[0.        , 2.10772821, 2.06554292, 1.94189455, 2.17990306],
       [0.        , 0.        , 1.90214164, 1.66959052, 2.02624926],
       [0.        , 0.        , 0.        , 1.10878246, 1.65111515],
       [0.        , 0.        , 0.        , 0.        , 1.65245551],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [72]:
similarity_matrix = np.zeros((df_test_a.shape[0],df_test_a.shape[0]))

In [73]:
# Similarity exp(-0.5 * d) for each pairwise distance d of the reduced data set.
size = len(similarity_matrix)
for i in range(size):
    for j in range(i + 1, size):
        similarity_matrix[i][j] = np.exp(distance_matrix[i][j] * -0.5)
similarity_matrix

array([[0.        , 0.34858816, 0.3560189 , 0.37872411, 0.33623279],
       [0.        , 0.        , 0.38632712, 0.43396332, 0.36308271],
       [0.        , 0.        , 0.        , 0.57442185, 0.43799071],
       [0.        , 0.        , 0.        , 0.        , 0.43769728],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [74]:
entropy_matrix = np.zeros((df_test_a.shape[0],df_test_a.shape[0]))

In [75]:
# Pairwise entropy for the reduced data set; similarities equal to 0 or 1
# contribute nothing.
size = len(similarity_matrix)
for i in range(size):
    for j in range(i + 1, size):
        n = similarity_matrix[i][j]
        if n in (0, 1):
            continue
        entropy_matrix[i][j] = n * np.log2(n) + (1 - n) * np.log2(1 - n)
entropy_matrix

array([[ 0.        , -0.93280084, -0.93932888, -0.95713592, -0.92116804],
       [ 0.        ,  0.        , -0.96238836, -0.98738043, -0.94521235],
       [ 0.        ,  0.        ,  0.        , -0.98395941, -0.98887662],
       [ 0.        ,  0.        ,  0.        ,  0.        , -0.98877082],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [76]:
parcial_entropy = entropy_matrix.sum() * -1
parcial_entropy

9.60702165773433

In [89]:
def select_numeric_attribute(df):
    """Return the total pairwise entropy of a data set with numeric attributes.

    NOTE: a later cell defines a function with this same name; once that cell
    runs, the later definition is the one in effect.

    Steps, for every pair of rows (i, j) with j > i:
      1. distance: Euclidean norm of the attribute differences, each divided
         by that attribute's maximum value;
      2. similarity: ``exp(-0.5 * distance)``;
      3. entropy: ``s*log2(s) + (1-s)*log2(1-s)`` unless ``s`` is 0 or 1.
    The result is the negated sum of the pairwise entropies.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per instance, one numeric column per attribute.

    Returns
    -------
    float
        Total entropy; ``0.0`` for a frame without rows.
    """
    np_array = df.values
    n_rows = np_array.shape[0]
    if n_rows == 0:
        # Nothing to compare; the previous implementation raised IndexError here.
        return 0.0

    max_values = np_array.max(axis=0)
    # A column whose maximum is 0 would make the division below produce NaN/inf
    # and turn the whole result into NaN; such columns are left unscaled.
    scale = np.where(max_values == 0, 1.0, max_values)

    distance_matrix = np.zeros((n_rows, n_rows))
    similarity_matrix = np.zeros_like(distance_matrix)
    entropy_matrix = np.zeros_like(distance_matrix)

    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            scaled_diff = (np_array[i, :] - np_array[j, :]) / scale
            distance_matrix[i][j] = ((scaled_diff ** 2).sum()) ** (1/2)
            similarity_matrix[i][j] = np.exp(-0.5 * distance_matrix[i][j])
            n = similarity_matrix[i][j]
            if n != 0 and n != 1:
                entropy_matrix[i][j] = n * np.log2(n) + (1 - n) * np.log2(1 - n)

    parcial_entropy = entropy_matrix.sum() * -1
    return parcial_entropy


In [92]:
def select_numeric_attribute(df, alpha=0.5):
    """Return the total pairwise entropy of a data set with numeric attributes.

    For every pair of rows (i, j) with j > i:
      * the distance is the Euclidean norm of the attribute differences, each
        divided by that attribute's maximum value;
      * the similarity is ``exp(-alpha * distance)``;
      * the pair contributes ``s*log2(s) + (1-s)*log2(1-s)`` unless the
        similarity ``s`` is exactly 0 or 1.
    The result is the negated sum of those contributions.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per instance, one numeric column per attribute.
    alpha : float, optional
        Decay factor applied to the distance when it is converted into a
        similarity. Defaults to 0.5, the value that was previously hard-coded.

    Returns
    -------
    float
        Total entropy; ``0.0`` for a frame without rows.
    """
    np_array = df.values
    n_rows = np_array.shape[0]
    if n_rows == 0:
        # Nothing to compare; the previous implementation raised IndexError here.
        return 0.0

    max_values = np_array.max(axis=0)
    # A column whose maximum is 0 would make the division below produce NaN/inf
    # and turn the whole result into NaN; such columns are left unscaled.
    scale = np.where(max_values == 0, 1.0, max_values)

    entropy_matrix = np.zeros((n_rows, n_rows))
    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            scaled_diff = (np_array[i, :] - np_array[j, :]) / scale
            distance_result = ((scaled_diff ** 2).sum()) ** (1/2)
            n = np.exp(-alpha * distance_result)
            if n != 0 and n != 1:
                entropy_matrix[i][j] = n * np.log2(n) + (1 - n) * np.log2(1 - n)

    n_entropy = entropy_matrix.sum() * -1
    return n_entropy

In [93]:
# Entropy of the full data set: computed once, stored unrounded in the summary
# and rounded for the comparison against the per-attribute values.
entropia_general = select_numeric_attribute(df_test)
entropia_total = round(entropia_general, 3)

# Entropy of the data set after removing each attribute in turn.
resumen = {'Entropía general': entropia_general}
entropias = []
for i in df_test.columns:
    # drop() already returns a new frame, so no extra copy is needed.
    entropia = round(select_numeric_attribute(df_test.drop(i, axis=1)), 3)
    entropias.append(entropia)
    resumen['Sin ' + i] = entropia

In [94]:
resumen

{'Entropía general': 9.687469097402092,
 'Sin x1': 9.764,
 'Sin x2': 9.729,
 'Sin x3': 9.704,
 'Sin x4': 9.769,
 'Sin x5': 9.684,
 'Sin x6': 9.726,
 'Sin x7': 9.802,
 'Sin x8': 9.698,
 'Sin x9': 9.601,
 'Sin x10': 9.751,
 'Sin x11': 9.78}

In [101]:
# Absolute difference between the full-data entropy and the entropy obtained
# without each attribute.
entropias = np.array(entropias)
a_restar = entropia_total * np.ones(len(entropias))
restas = np.absolute(a_restar - entropias)
restas

array([0.077, 0.042, 0.017, 0.082, 0.003, 0.039, 0.115, 0.011, 0.086,
       0.064, 0.093])

In [102]:
minima_resta = restas.min()

In [105]:
indexes = np.where(restas == minima_resta)
indexes

(array([4], dtype=int64),)

In [106]:
attrs = df_test.columns[indexes]

In [107]:
attrs

Index(['x5'], dtype='object')

# Identificación de atributos de tipo categórico y numérico
---

In [None]:
frame_test

In [None]:
frame_test.info()

In [None]:
df_test

In [None]:
def kind_of_attribute(df):
    """Print, for each column of ``df``, whether it is treated as categorical or numeric.

    A column is reported as categorical when its dtype is object, string or
    ``category``, or when it has fewer distinct values than one third of its
    length. Every other column is reported as numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set whose columns are classified.

    Returns
    -------
    None
        The classification is only printed, one line per column.
    """
    for i in df.columns:
        column = df[i]
        dtype = column.dtype
        # Checking only for the 'object' dtype lets `category` and string-dtype
        # columns fall through to the distinct-value heuristic.
        is_text_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        has_few_values = column.value_counts().count() < len(column) / 3
        kind = 'categórico' if (is_text_like or has_few_values) else 'numérico'
        # The f-string keeps non-string column labels (e.g. integers) from
        # raising TypeError, as the previous string concatenation did.
        print(f'El atributo {i} es {kind}')

In [None]:
kind_of_attribute(frame_test)