[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mitchell-Mirano/Allison/blob/develop/examples/preprocessing/encoders-scalers-transformers.ipynb)

In [None]:
!pip install "allison[cpu] @ git+https://github.com/Mitchell-Mirano/Allison.git@develop"

zsh:1: command not found: pip


In [2]:
import numpy as np
import pandas as pd
from allison.preprocessing import OneHotEncoder
from allison.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from allison.preprocessing import ColumnTransformer

In [3]:
# Small toy dataset: two numeric columns, two categorical columns and a
# binary target ('Compra').
data = {
    'Edad': [25, 30, 45, 50, 35, 60, 20, 40],
    'Ingresos': [30000, 50000, 100000, 120000, 70000, 150000, 20000, 80000],
    'Pais': [
        'EE. UU.', 'Canadá', 'México', 'EE. UU.',
        'Canadá', 'México', 'EE. UU.', 'Canadá',
    ],
    'Ciudades': [
        'New York', 'Toronto', 'Mexico City', 'New York',
        'Toronto', 'Mexico City', 'New York', 'Toronto',
    ],
    'Compra': [0, 1, 1, 1, 0, 1, 0, 1],
}
df = pd.DataFrame(data)

# Separate the target from the features; `df` itself is left unmodified.
y = df['Compra']
X = df.drop(columns='Compra')
X

Unnamed: 0,Edad,Ingresos,Pais,Ciudades
0,25,30000,EE. UU.,New York
1,30,50000,Canadá,Toronto
2,45,100000,México,Mexico City
3,50,120000,EE. UU.,New York
4,35,70000,Canadá,Toronto
5,60,150000,México,Mexico City
6,20,20000,EE. UU.,New York
7,40,80000,Canadá,Toronto


In [4]:
# Quick sanity check: (number of rows, number of feature columns).
X.shape

(8, 4)

In [5]:
# Column groups used to route each feature to the right preprocessing step.
numeric_features = [
    'Edad',
    'Ingresos',
]
categorical_features = [
    'Pais',
    'Ciudades',
]

# One Hot Encoder

In [6]:
# One-hot encode the categorical columns: each distinct category becomes its
# own 0/1 column, named "<column>_<category>" by the encoder.
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[categorical_features])

# Wrap the result in a DataFrame only for display, labelled with the
# encoder's generated feature names.
pd.DataFrame(
    X_encoded,
    columns=encoder.get_features_names(),
)

Unnamed: 0,Pais_Canadá,Pais_EE. UU.,Pais_México,Ciudades_Mexico City,Ciudades_New York,Ciudades_Toronto
0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,1.0,1.0,0.0,0.0
6,0.0,1.0,0.0,0.0,1.0,0.0
7,1.0,0.0,0.0,0.0,0.0,1.0


# Standard Scaler

In [7]:
# Standardize the numeric columns; in the displayed result each column is
# centered on its mean and expressed in standard-deviation units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numeric_features])

# Display with the scaler's feature names (the original column names).
pd.DataFrame(
    X_scaled,
    columns=scaler.get_features_names(),
)

Unnamed: 0,Edad,Ingresos
0,-1.051315,-1.1375
1,-0.650814,-0.658553
2,0.550689,0.538816
3,0.95119,1.017763
4,-0.250313,-0.179605
5,1.752192,1.736185
6,-1.451816,-1.376974
7,0.150188,0.059868


# One Hot Encoder + Standard Scaler

In [8]:
# Categorical branch: one-hot encode 'Pais' and 'Ciudades'.
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[categorical_features])

# Numeric branch: standardize 'Edad' and 'Ingresos'.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numeric_features])

# Place the encoded columns first and the scaled columns after them, side by side.
X_train = np.concatenate((X_encoded, X_scaled), axis=1)

# Column labels follow the same order as the stacked blocks.
pd.DataFrame(
    X_train,
    columns=[*encoder.get_features_names(), *scaler.get_features_names()],
)

Unnamed: 0,Pais_Canadá,Pais_EE. UU.,Pais_México,Ciudades_Mexico City,Ciudades_New York,Ciudades_Toronto,Edad,Ingresos
0,0.0,1.0,0.0,0.0,1.0,0.0,-1.051315,-1.1375
1,1.0,0.0,0.0,0.0,0.0,1.0,-0.650814,-0.658553
2,0.0,0.0,1.0,1.0,0.0,0.0,0.550689,0.538816
3,0.0,1.0,0.0,0.0,1.0,0.0,0.95119,1.017763
4,1.0,0.0,0.0,0.0,0.0,1.0,-0.250313,-0.179605
5,0.0,0.0,1.0,1.0,0.0,0.0,1.752192,1.736185
6,0.0,1.0,0.0,0.0,1.0,0.0,-1.451816,-1.376974
7,1.0,0.0,0.0,0.0,0.0,1.0,0.150188,0.059868


# One Hot Encoder + MinMax Scaler

In [9]:
# Categorical branch: one-hot encode 'Pais' and 'Ciudades'.
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[categorical_features])

# Numeric branch: min-max scaling; in the displayed result each numeric column
# spans 0.0 (column minimum) to 1.0 (column maximum).
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X[numeric_features])

# Place the encoded columns first and the scaled columns after them, side by side.
X_train = np.concatenate((X_encoded, X_scaled), axis=1)

# Column labels follow the same order as the stacked blocks.
pd.DataFrame(
    X_train,
    columns=[*encoder.get_features_names(), *scaler.get_features_names()],
)

Unnamed: 0,Pais_Canadá,Pais_EE. UU.,Pais_México,Ciudades_Mexico City,Ciudades_New York,Ciudades_Toronto,Edad,Ingresos
0,0.0,1.0,0.0,0.0,1.0,0.0,0.125,0.076923
1,1.0,0.0,0.0,0.0,0.0,1.0,0.25,0.230769
2,0.0,0.0,1.0,1.0,0.0,0.0,0.625,0.615385
3,0.0,1.0,0.0,0.0,1.0,0.0,0.75,0.769231
4,1.0,0.0,0.0,0.0,0.0,1.0,0.375,0.384615
5,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,1.0,0.5,0.461538


# One Hot Encoder + Robust Scaler

In [10]:
# Categorical branch: one-hot encode 'Pais' and 'Ciudades'.
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[categorical_features])

# Numeric branch: robust scaling; the displayed values are consistent with
# centering on the column median and dividing by the interquartile range.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X[numeric_features])

# Place the encoded columns first and the scaled columns after them, side by side.
X_train = np.concatenate((X_encoded, X_scaled), axis=1)

# Column labels follow the same order as the stacked blocks.
pd.DataFrame(
    X_train,
    columns=[*encoder.get_features_names(), *scaler.get_features_names()],
)

Unnamed: 0,Pais_Canadá,Pais_EE. UU.,Pais_México,Ciudades_Mexico City,Ciudades_New York,Ciudades_Toronto,Edad,Ingresos
0,0.0,1.0,0.0,0.0,1.0,0.0,-0.714286,-0.75
1,1.0,0.0,0.0,0.0,0.0,1.0,-0.428571,-0.416667
2,0.0,0.0,1.0,1.0,0.0,0.0,0.428571,0.416667
3,0.0,1.0,0.0,0.0,1.0,0.0,0.714286,0.75
4,1.0,0.0,0.0,0.0,0.0,1.0,-0.142857,-0.083333
5,0.0,0.0,1.0,1.0,0.0,0.0,1.285714,1.25
6,0.0,1.0,0.0,0.0,1.0,0.0,-1.0,-0.916667
7,1.0,0.0,0.0,0.0,0.0,1.0,0.142857,0.083333


# Column Transformer

In [11]:
# Declare both preprocessing branches in one object instead of encoding,
# scaling and stacking by hand. Each entry is (name, transformer, columns);
# the name shows up as a prefix in the generated feature names
# (e.g. "cat_Pais_Canadá", "num_Edad").
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), numeric_features),
    ],
)

# Fit on the full feature frame and transform it in a single call.
X_train = column_transformer.fit_transform(X)

# Display the transformed matrix labelled with the transformer's feature names.
pd.DataFrame(
    X_train,
    columns=column_transformer.get_features_names(),
)

Unnamed: 0,cat_Pais_Canadá,cat_Pais_EE. UU.,cat_Pais_México,cat_Ciudades_Mexico City,cat_Ciudades_New York,cat_Ciudades_Toronto,num_Edad,num_Ingresos
0,0.0,1.0,0.0,0.0,1.0,0.0,-1.051315,-1.1375
1,1.0,0.0,0.0,0.0,0.0,1.0,-0.650814,-0.658553
2,0.0,0.0,1.0,1.0,0.0,0.0,0.550689,0.538816
3,0.0,1.0,0.0,0.0,1.0,0.0,0.95119,1.017763
4,1.0,0.0,0.0,0.0,0.0,1.0,-0.250313,-0.179605
5,0.0,0.0,1.0,1.0,0.0,0.0,1.752192,1.736185
6,0.0,1.0,0.0,0.0,1.0,0.0,-1.451816,-1.376974
7,1.0,0.0,0.0,0.0,0.0,1.0,0.150188,0.059868
