## Transformando dados categóricos em numéricos

In [1]:
from sklearn import preprocessing
import pandas as pd

In [2]:
# Toy dataset: one row per (patient, observation) pair, with a categorical
# `score` column holding string labels.
raw_data = {
    'patient': [1, 1, 1, 2, 2],
    'obs': [1, 2, 3, 1, 2],
    'treatment': [0, 1, 0, 1, 0],
    'score': ['strong', 'weak', 'normal', 'weak', 'strong'],
}
# Column order follows the insertion order of the dict keys.
df = pd.DataFrame(raw_data, columns=list(raw_data))

In [3]:
# Encoder that maps categorical labels to integer codes.
le = preprocessing.LabelEncoder()

In [4]:
# Learn the distinct labels present in the `score` column.
le.fit(df['score'])

LabelEncoder()

In [5]:
# Show the learned classes; a label's position in this list is its integer code.
list(le.classes_)

['normal', 'strong', 'weak']

In [8]:
# Encode from the untouched raw labels instead of from df['score'] itself, so
# this cell can be re-run safely: once df['score'] holds integer codes,
# transforming it again would fail because the encoder was fitted on the
# string labels.
df['score'] = le.transform(raw_data['score'])
df

Unnamed: 0,patient,obs,treatment,score
0,1,1,0,1
1,1,2,1,2
2,1,3,0,0
3,2,1,1,2
4,2,2,0,1


In [7]:
# Map integer codes back to their original string labels.
list(le.inverse_transform([2, 2, 1]))

['weak', 'weak', 'strong']

## Trabalhando com dados vazios

In [9]:
import pandas as pd
import numpy as np

In [10]:
# Small two-column feature matrix; the last row has a missing (NaN) value.
X = np.array(
    [
        [1.1, 11.1],
        [2.2, 22.2],
        [3.3, 33.3],
        [4.4, 44.4],
        [np.nan, 55],
    ]
)

In [11]:
# Keep only the rows in which no column is NaN.
X[~np.any(np.isnan(X), axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [13]:
# Same row-wise filtering via pandas: drop every row that contains any NaN.
pd.DataFrame(X).dropna(axis=0, how='any')

Unnamed: 0,0,1
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


## Detectando um outlier

In [14]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [17]:
# Generate the data: 10 two-feature samples around a single centre, with a
# fixed seed. The second return value (the labels) is discarded.
X, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

In [18]:
# Display the generated samples.
X

array([[-1.83198811,  3.52863145],
       [-2.76017908,  5.55121358],
       [-1.61734616,  4.98930508],
       [-0.52579046,  3.3065986 ],
       [ 0.08525186,  3.64528297],
       [-0.79415228,  2.10495117],
       [-1.34052081,  4.15711949],
       [-1.98197711,  4.02243551],
       [-2.18773166,  3.33352125],
       [-0.19745197,  2.34634916]])

In [19]:
# Create an outlier: overwrite both features of the first sample with an
# extreme value.
X[0, :2] = 10000

In [20]:
# Display the samples again; the first row now holds the injected extreme values.
X

array([[ 1.00000000e+04,  1.00000000e+04],
       [-2.76017908e+00,  5.55121358e+00],
       [-1.61734616e+00,  4.98930508e+00],
       [-5.25790464e-01,  3.30659860e+00],
       [ 8.52518583e-02,  3.64528297e+00],
       [-7.94152277e-01,  2.10495117e+00],
       [-1.34052081e+00,  4.15711949e+00],
       [-1.98197711e+00,  4.02243551e+00],
       [-2.18773166e+00,  3.33352125e+00],
       [-1.97451969e-01,  2.34634916e+00]])

In [21]:
# Fit an elliptic envelope to the data and label each sample.
# contamination=0.1 is the expected proportion of outliers in the data;
# random_state pins the estimator's internal randomness so that re-running the
# notebook yields the same labels.
# predict() returns -1 for outliers and 1 for inliers.
outlier_detector = EllipticEnvelope(contamination=0.1, random_state=1)
outlier_detector.fit(X)
outlier_detector.predict(X)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [30]:
# Put the samples in a DataFrame and attach the detector's label per row
# (-1 = outlier, 1 = inlier).
df = pd.DataFrame(X, columns=['feature 1', 'feature 2'])
# Predict on the same ndarray the detector was fitted on: passing the
# DataFrame (which carries column names) to an estimator fitted without
# feature names triggers scikit-learn's feature-name mismatch warning.
df['outlier'] = outlier_detector.predict(X)

In [32]:
# Keep only the rows that were not flagged as outliers (-1), then display them.
df = df.loc[df['outlier'].ne(-1)]
df

Unnamed: 0,feature 1,feature 2,outlier
1,-2.760179,5.551214,1
2,-1.617346,4.989305,1
3,-0.52579,3.306599,1
4,0.085252,3.645283,1
5,-0.794152,2.104951,1
6,-1.340521,4.157119,1
7,-1.981977,4.022436,1
8,-2.187732,3.333521,1
9,-0.197452,2.346349,1


In [33]:
from sklearn.preprocessing import Binarizer
import numpy as np

In [34]:
# Ages as a single-feature column vector (one row per sample).
age = np.array([6, 12, 20, 36, 65]).reshape(-1, 1)

In [35]:
# Criando um binarizer
binarizer = Binarizer(18)

# Transformando pelo bins
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [36]:
# Bucket the ages into multiple bins: each value gets the index i such that
# bins[i-1] <= value < bins[i] (0 below the first edge, len(bins) at or above
# the last edge).
np.digitize(age, bins=[20, 30, 64], right=False)

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)