## Seleção de atributos utilizando variância

In [1]:
# Bibliotecas
import math
import pandas as pd
import numpy as np

In [2]:
# Criando um grupo de dados
np.random.rand(50)

array([0.82787145, 0.8424722 , 0.73006625, 0.2470767 , 0.14059202,
       0.86926498, 0.35104466, 0.3792922 , 0.75376552, 0.6895043 ,
       0.44520021, 0.00989566, 0.71681079, 0.32481213, 0.56705459,
       0.57594503, 0.79777476, 0.02931123, 0.99641512, 0.99827822,
       0.40644159, 0.07487425, 0.95168228, 0.10765832, 0.78883261,
       0.50088134, 0.724192  , 0.92333686, 0.92406957, 0.48047649,
       0.74504871, 0.41466333, 0.71181903, 0.0387578 , 0.12349541,
       0.59828122, 0.86166199, 0.63501442, 0.66553625, 0.58119929,
       0.67152971, 0.25591516, 0.54433632, 0.82831536, 0.11385971,
       0.88022228, 0.54091267, 0.93259872, 0.80930703, 0.40672127])

In [3]:
# Gerando um número entre 0 e 1
np.random.randint(0, 2)

0

In [4]:
# Criando uma base de dados
bs = {
    'a': np.random.rand(20),
    'b': np.array([0.5] * 20),
    'classe': np.random.randint(0, 2, size=20)
}
bs

{'a': array([0.57511305, 0.73856752, 0.19347051, 0.95383634, 0.80620465,
        0.33028575, 0.90205323, 0.07210399, 0.78184501, 0.7816085 ,
        0.21183064, 0.90055596, 0.93293625, 0.58120267, 0.29684161,
        0.27275874, 0.61427804, 0.25975987, 0.12289049, 0.02256181]),
 'b': array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]),
 'classe': array([0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0])}

In [5]:
# Transformando a base de dados em um DataSet
ds = pd.DataFrame(bs)
ds.head()

Unnamed: 0,a,b,classe
0,0.575113,0.5,0
1,0.738568,0.5,0
2,0.193471,0.5,1
3,0.953836,0.5,0
4,0.806205,0.5,0


In [6]:
# Visualizando a descrição do ds
ds.describe()

Unnamed: 0,a,b,classe
count,20.0,20.0,20.0
mean,0.517535,0.5,0.35
std,0.320669,0.0,0.48936
min,0.022562,0.5,0.0
25%,0.247778,0.5,0.0
50%,0.578158,0.5,0.0
75%,0.787935,0.5,1.0
max,0.953836,0.5,1.0


In [7]:
# A Variância das colunas A e B
np.var(ds['a']), np.var(ds['b'])

(0.09768701583283776, 0.0)

In [8]:
# A raiz quadrada da variância da coluna A
math.sqrt(np.var(ds['a']))

0.3125492214561376

In [9]:
# Selecionado as colunas A e B 
X = ds.iloc[:, 0:2].values
X

array([[0.57511305, 0.5       ],
       [0.73856752, 0.5       ],
       [0.19347051, 0.5       ],
       [0.95383634, 0.5       ],
       [0.80620465, 0.5       ],
       [0.33028575, 0.5       ],
       [0.90205323, 0.5       ],
       [0.07210399, 0.5       ],
       [0.78184501, 0.5       ],
       [0.7816085 , 0.5       ],
       [0.21183064, 0.5       ],
       [0.90055596, 0.5       ],
       [0.93293625, 0.5       ],
       [0.58120267, 0.5       ],
       [0.29684161, 0.5       ],
       [0.27275874, 0.5       ],
       [0.61427804, 0.5       ],
       [0.25975987, 0.5       ],
       [0.12289049, 0.5       ],
       [0.02256181, 0.5       ]])

In [10]:
# Lib
from sklearn.feature_selection import VarianceThreshold

In [11]:
# Criando a seleção
selecao = VarianceThreshold(threshold=0.08)
x_novo = selecao.fit_transform(X)

In [12]:
# Visualizando o X_novo e o tamanho
x_novo, x_novo.shape

(array([[0.57511305],
        [0.73856752],
        [0.19347051],
        [0.95383634],
        [0.80620465],
        [0.33028575],
        [0.90205323],
        [0.07210399],
        [0.78184501],
        [0.7816085 ],
        [0.21183064],
        [0.90055596],
        [0.93293625],
        [0.58120267],
        [0.29684161],
        [0.27275874],
        [0.61427804],
        [0.25975987],
        [0.12289049],
        [0.02256181]]),
 (20, 1))

In [13]:
# A Variância
selecao.variances_

array([0.09768702, 0.        ])

In [14]:
# Onde a variância é maior
indices = np.where(selecao.variances_ > 0.08)
indices

(array([0], dtype=int64),)