# Transformación de los datos

## 1. Importar paquetes

In [48]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

%config IPCompleter.greedy = True

## 2. Carga de los datos

In [49]:
# Project root directory, kept as a plain string because later cells build
# paths with `ruta + '...'`.
# Set the CHURN_BANK_DIR environment variable to run the notebook on another
# machine; when it is unset, the original local path is used.
ruta = os.environ.get('CHURN_BANK_DIR', 'C:/Users/matia/OneDrive/Escritorio/Churn_Bank')

In [50]:
# File names of the pickled categorical and numeric tables loaded below.
nombre_cat, nombre_num = (
    'cat_resultado_calidad.pickle',
    'num_resultado_calidad.pickle',
)

In [51]:
# Load both working tables from the project's 02_Datos/03_Trabajo folder.
# NOTE(review): unpickling executes arbitrary code, so only load files
# produced by this project.
cat = pd.read_pickle(f'{ruta}/02_Datos/03_Trabajo/{nombre_cat}')
num = pd.read_pickle(f'{ruta}/02_Datos/03_Trabajo/{nombre_num}')

## 3. Variables categóricas

Es necesario explicar la siguiente decisión: ¿por qué, si las variables categóricas ya están ingresadas con valores numéricos, se aplica OneHotEncoder? La razón es que, al plantear el objetivo de crear un perfil de clientes utilizando modelos de ML, lo que buscamos no es solamente ver qué variables tienen más peso para el modelo, sino también qué valores impactan en el proceso, y así obtener más información.

### 3.1 Instanciar la transformación

In [52]:
# One-hot encoder configured to return a dense array and to encode categories
# not seen during fit as all-zero columns instead of raising an error.
ohe = OneHotEncoder(
    handle_unknown = 'ignore',
    sparse_output = False,
)

### 3.2 Aplicar

In [53]:
# Learn the categories of every column in cat, then encode that same table.
ohe.fit(cat)
cat_ohe = ohe.transform(cat)

### 3.3 Recuperar la información

In [54]:
# Wrap the encoded array in a DataFrame labelled with the encoder's feature names.
# Reuse cat's index instead of a fresh RangeIndex: the column-wise concat with
# `num` aligns rows by index label, so a mismatched index would silently
# produce NaN-filled rows (assumes cat and num share the same index).
cat_ohe = pd.DataFrame(cat_ohe, columns = ohe.get_feature_names_out(), index = cat.index)

## 4. Variables numéricas

Dadas las distribuciones de las variables numéricas, vamos a aplicar dos transformaciones diferentes. A las variables que tienen una distribución cercana a la normal se les aplica el StandardScaler, mientras que para las que poseen múltiples distribuciones se usa el MinMaxScaler.

### 4.1 Seleccionar las variables

In [55]:
# Columns that will be scaled with StandardScaler.
var_1 = [
    'CreditScore',
    'Age',
    'Tenure',
    'EstimatedSalary',
]

In [56]:
# Columns that will be scaled with MinMaxScaler.
var_2 = [
    'Balance',
    'NumOfProducts',
]

### 4.2 Instanciar y aplicar

In [57]:
# Standardize the var_1 columns, overwriting them in num.
# NOTE(review): the scaler is fit on every row of num; if a train/test split
# happens downstream, this leaks test-set statistics — confirm.
std_scaler = StandardScaler()
std_scaler.fit(num[var_1])
num[var_1] = std_scaler.transform(num[var_1])

In [58]:
# Rescale the var_2 columns with min-max scaling, overwriting them in num.
# NOTE(review): the scaler is fit on every row of num; if a train/test split
# happens downstream, this leaks test-set statistics — confirm.
mm_scaler = MinMaxScaler()
mm_scaler.fit(num[var_2])
num[var_2] = mm_scaler.transform(num[var_2])

## 5. Unir los datasets

In [62]:
# Place the one-hot columns and the numeric columns side by side; rows are
# matched by index label.
df_tablon = pd.concat(objs = [cat_ohe, num], axis = 'columns')

In [63]:
# Display the combined table; pandas truncates the rendering of long frames.
df_tablon

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,HasCrCard_0,HasCrCard_1,IsActiveMember_0,IsActiveMember_1,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Exited
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,-0.326221,0.293517,-1.041760,0.000000,0.000000,0.021886,1
1,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.440036,0.198164,-1.387538,0.334031,0.000000,0.216534,0
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1.536794,0.293517,1.032908,0.636357,0.666667,0.240687,1
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.501521,0.007457,-1.387538,0.000000,0.333333,-0.108918,0
4,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,2.063884,0.388871,-1.041760,0.500246,0.000000,-0.365276,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.246488,0.007457,-0.004426,0.000000,0.333333,-0.066419,0
9996,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,-1.391939,-0.373958,1.724464,0.228657,0.000000,0.027988,0
9997,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.604988,-0.278604,0.687130,0.000000,0.000000,-1.008643,1
9998,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.256835,0.293517,-0.695982,0.299226,0.333333,-0.125231,1


## 6. Guardar el nuevo dataset

In [64]:
# Destination file for the combined table, inside the project's working-data folder.
ruta_df_tablon = ''.join([ruta, '/02_Datos/03_Trabajo/', 'df_tablon.pickle'])

In [65]:
# Persist the combined table as a pickle file.
df_tablon.to_pickle(path = ruta_df_tablon)