# 4. Trabajando con Datos Numéricos

## Librerías

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import preprocessing

## 4.1 Reescalar features

In [3]:
# Single-column feature matrix: five samples laid out as a (5, 1) array.
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

In [4]:
feature

array([[-500.5],
       [-100.1],
       [   0. ],
       [ 100.1],
       [ 900.9]])

In [5]:
# Create the scaler; feature_range=(0, 1) is the target interval for the rescaled values.

minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [6]:
# Scale the data: fit the scaler on the feature and rescale it in a single call
# (the smallest value maps to 0 and the largest to 1).

scaled_feature = minmax_scale.fit_transform(feature)
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

## 4.2 Estandarizar features

In [7]:
# Single-column feature matrix whose last sample is far larger than the others.
feature = np.array([-1000.1, -200.2, 500.5, 600.6, 9000.9]).reshape(-1, 1)

In [8]:
# Create the standardizing scaler with its default settings.

scaler = preprocessing.StandardScaler()

In [9]:
# Fit the scaler on the feature and standardize it in a single call.
standarized = scaler.fit_transform(feature)
standarized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [10]:
# Sanity check: the standardized feature should have mean ~0 and standard deviation ~1.
mean_rounded = round(standarized.mean())
std_rounded = round(standarized.std())
print(f'Mean: {mean_rounded}')
print(f'D. Est: {std_rounded}')

Mean: 0
D. Est: 1


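Si nuestros datos tienen outliers significativos, podemos usar un escalador robusto (`RobustScaler`), que escala los datos usando la mediana y el rango intercuartílico en lugar de la media y la desviación estándar.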

In [11]:
robust_scaler = preprocessing.RobustScaler()

In [12]:
robust_scaler.fit_transform(feature)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

## 4.3 Normalizar observaciones

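**Normalizar:** Consiste en ajustar los valores medidos en diferentes escalas respecto a una escala común. En scikit-learn, `Normalizer` reescala cada observación (fila) de forma individual para que tenga norma unitaria.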

In [13]:
# Two-feature matrix assembled column by column; each row is one observation.
feature = np.column_stack((
    [0.5, 1.1, 1.5, 1.63, 10.9],
    [0.5, 3.4, 20.2, 34.4, 3.3],
))

In [14]:
feature

array([[ 0.5 ,  0.5 ],
       [ 1.1 ,  3.4 ],
       [ 1.5 , 20.2 ],
       [ 1.63, 34.4 ],
       [10.9 ,  3.3 ]])

In [15]:
# Create the normalizer; norm='l2' selects the Euclidean (L2) norm.

normalizer = preprocessing.Normalizer(norm='l2')

In [16]:
# Rescale every observation (row) to unit L2 norm. Normalizer is stateless, but
# fit_transform is used so its parameters get validated and the call matches the
# other transformers in this notebook.
normalizer.fit_transform(feature)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

## 4.4 Generación de características polinomiales y de interacción

In [17]:
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# Three observations with two integer features each.
features = np.array([2, 3, 2, 4, 2, 3]).reshape(3, 2)
features

array([[2, 3],
       [2, 4],
       [2, 3]])

In [19]:
# degree=2 yields each feature, its square and the pairwise product;
# include_bias=False leaves out the constant column of ones.
polynomial_interaction=PolynomialFeatures(degree=2, include_bias=False)

In [20]:
polynomial_interaction.fit_transform(features)

array([[ 2.,  3.,  4.,  6.,  9.],
       [ 2.,  4.,  4.,  8., 16.],
       [ 2.,  3.,  4.,  6.,  9.]])

## 4.5 Transformando características

In [21]:
from sklearn.preprocessing import FunctionTransformer

In [22]:
def add_ten(x):
    """Return ``x`` shifted up by ten (element-wise when ``x`` is an array)."""
    offset = 10
    return x + offset

In [23]:
ten_transformer = FunctionTransformer(add_ten)

In [24]:
ten_transformer.transform(features)

array([[12, 13],
       [12, 14],
       [12, 13]])

## 4.6 Detectando Outliers

**Método 1**

In [25]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [26]:
# Generate 10 two-dimensional points around a single center; the second return
# value of make_blobs is discarded. random_state fixes the generated sample.
features, _ = make_blobs(
    n_samples=10,
    n_features=2, 
    centers=1,
    random_state=1
)

features

array([[-1.83198811,  3.52863145],
       [-2.76017908,  5.55121358],
       [-1.61734616,  4.98930508],
       [-0.52579046,  3.3065986 ],
       [ 0.08525186,  3.64528297],
       [-0.79415228,  2.10495117],
       [-1.34052081,  4.15711949],
       [-1.98197711,  4.02243551],
       [-2.18773166,  3.33352125],
       [-0.19745197,  2.34634916]])

In [27]:
# Overwrite both values of the first observation with an extreme number to plant an outlier.
features[0, :] = 10000
features

array([[ 1.00000000e+04,  1.00000000e+04],
       [-2.76017908e+00,  5.55121358e+00],
       [-1.61734616e+00,  4.98930508e+00],
       [-5.25790464e-01,  3.30659860e+00],
       [ 8.52518583e-02,  3.64528297e+00],
       [-7.94152277e-01,  2.10495117e+00],
       [-1.34052081e+00,  4.15711949e+00],
       [-1.98197711e+00,  4.02243551e+00],
       [-2.18773166e+00,  3.33352125e+00],
       [-1.97451969e-01,  2.34634916e+00]])

In [28]:
# contamination is the expected proportion of outliers in the data (10% here).
# random_state pins the randomized covariance fit so that re-running the
# notebook produces the same detector.
outlier_detector = EllipticEnvelope(contamination=0.1, random_state=1)

In [29]:
# Fit the detector on the observations
outlier_detector.fit(features)

EllipticEnvelope()

In [30]:
# Predict the outliers: -1 marks an outlier and 1 an inlier
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

**Método 2**

In [31]:
feature = features[:,0]

In [41]:
def indices_outliers(x, factor=1.5):
    """Locate outliers in ``x`` using the interquartile-range (IQR) rule.

    A value is flagged when it lies more than ``factor`` IQRs below the first
    quartile or above the third quartile.

    Parameters
    ----------
    x : array-like
        Numeric values to inspect.
    factor : float, default 1.5
        Width of the fences in IQR units. The default reproduces the
        previously hard-coded 1.5 multiplier.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where`` on the outlier mask: one index array per
        dimension of ``x``.
    """
    # Accept plain sequences as well as arrays.
    x = np.asarray(x)
    q1, q3 = np.percentile(x, [25, 75])
    fence = (q3 - q1) * factor
    lower_bound = q1 - fence
    upper_bound = q3 + fence

    return np.where((x > upper_bound) | (x < lower_bound))

In [44]:
outliers = indices_outliers(feature)

In [45]:
outliers

(array([0], dtype=int64),)

## 4.7 Manejando los Outliers

In [53]:
# Small housing dataset; the last row has much larger values than the rest.
houses = pd.DataFrame({
    'Price': [534433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_feet': [1500, 2500, 1500, 48000],
})

In [56]:
# Filter the observations: keep only the houses with fewer than 10 bathrooms.

houses[houses['Bathrooms'] < 10]

Unnamed: 0,Price,Bathrooms,Square_feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [57]:
# Alternatively, flag the outliers with a new indicator feature. The condition is
# >= 10 so that it is the exact complement of the `< 10` filter used above: a house
# with exactly 10 bathrooms is flagged instead of falling between the two rules.

houses['Outliers'] = np.where(houses['Bathrooms'] >= 10, 1, 0)

In [63]:
# We can also transform the data to dampen the effect of the outliers,
# here by taking the natural logarithm of the square footage.

houses['Log_Of_Square_feet'] = np.log(houses['Square_feet'])
houses

Unnamed: 0,Price,Bathrooms,Square_feet,Outliers,Log_Of_Square_feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


## 4.8 Discretizando características

In [64]:
from sklearn.preprocessing import Binarizer

In [76]:
# Ages as a single-column feature matrix.
age = np.array([6, 12, 18, 20, 36, 65]).reshape(-1, 1)

In [77]:
# We can split our numeric data into ranges. With threshold=18, values strictly
# greater than 18 become 1 and the rest (including 18 itself) become 0.

binarizer = Binarizer(threshold=18)
binarizer.fit_transform(age)

array([[0],
       [0],
       [0],
       [1],
       [1],
       [1]])

In [78]:
# Bucket the ages using several cut points: each value gets the index of its bin
# (0 for values below 20, 1 for [20, 30), 2 for [30, 40), 3 for 40 and above).
np.digitize(age, bins=[20, 30, 40])

array([[0],
       [0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

## 4.9 Agrupar observaciones utilizando Clustering