## Preprocesamiento de datos

Usaremos scikit-learn para hacer preprocesamiento de datos, lo que requiiere el modelo es una tabla numérica y scikitlearn nos permitirá armarla.


In [55]:
### Importamos librerias
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler,  OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [56]:
#1 Carguemos los datos

Data=pd.read_csv("../Datos/data_adults.csv")

In [57]:
Data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [58]:
## Ahora definimos tres tipos de variables: categoricas, "normales", "Rango"
Data_cop=Data.drop("fnlwgt",axis=1)
Data_cop=Data_cop.drop("education-num",axis=1)

X=Data_cop.drop("income",axis=1)

y= Data_cop['income']
X.dtypes

age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [59]:
##pd.DataFrame(imputer_nulls_cat.fit_transform(Data[['workclass', 'occupation']])).value_counts()

In [60]:
X['education'].value_counts()

education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64

In [61]:
## Iniciamos usando SimpleImputer para rellenar valores nulos

imputer_nulls_cat = SimpleImputer(strategy="constant",fill_value="?") 
imputer_nulls_num = SimpleImputer(strategy="mean") 

In [62]:
## Definimos las transformaciones

standar_scaler=StandardScaler()
rango=MinMaxScaler((-1,1))

one_hot=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
# Define the desired order of categories
categories = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th',
    'HS-grad', 'Some-college', 'Assoc-acdm', 'Assoc-voc', 'Prof-school',
    'Bachelors', 'Masters', 'Doctorate']

# Create an OrdinalEncoder instance
ordinalencoder = OrdinalEncoder(categories=[categories])

In [63]:
### Primero creamos cuatro pipeline [IMPUTER NULL] -> [ENCODER o SCALER]

trans_num_standard =Pipeline([('imputer_null',imputer_nulls_num),
                               ('standard_scaler',standar_scaler)])
trans_num_rango=Pipeline([('imputer_null',imputer_nulls_num),
                               ('rango',rango)])

trans_cat_OHE=Pipeline([('imputer_null',imputer_nulls_cat),
                               ('OHE',one_hot)])
trans_cat_ordinal=Pipeline([('imputer_null',imputer_nulls_cat),
                               ('Ordinal',ordinalencoder)])


### Definimos las columnas a tratar

col_num_standard =['capital-gain','capital-loss']
col_num_rango = ['age','hours-per-week']

col_cat_OHE=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
col_cat_ordinal=['education']


## Finalmente construimos columns transformers

Preprocesamiento=ColumnTransformer(
transformers=[
    ('Standar',trans_num_standard,col_num_standard),
    ('Rango',trans_num_rango,col_num_rango),
    ('OHE',trans_cat_OHE,col_cat_OHE),
    ('Ordinal',trans_cat_ordinal,col_cat_ordinal),
]
)

In [64]:
Preprocesamiento

In [65]:
result=Preprocesamiento.fit_transform(X)

In [66]:
pd.DataFrame(result,columns=Preprocesamiento.get_feature_names_out())

Unnamed: 0,Standar__capital-gain,Standar__capital-loss,Rango__age,Rango__hours-per-week,OHE__workclass_?,OHE__workclass_Federal-gov,OHE__workclass_Local-gov,OHE__workclass_Never-worked,OHE__workclass_Private,OHE__workclass_Self-emp-inc,OHE__workclass_Self-emp-not-inc,OHE__workclass_State-gov,OHE__workclass_Without-pay,OHE__marital-status_Divorced,OHE__marital-status_Married-AF-spouse,OHE__marital-status_Married-civ-spouse,OHE__marital-status_Married-spouse-absent,OHE__marital-status_Never-married,OHE__marital-status_Separated,OHE__marital-status_Widowed,OHE__occupation_?,OHE__occupation_Adm-clerical,OHE__occupation_Armed-Forces,OHE__occupation_Craft-repair,OHE__occupation_Exec-managerial,OHE__occupation_Farming-fishing,OHE__occupation_Handlers-cleaners,OHE__occupation_Machine-op-inspct,OHE__occupation_Other-service,OHE__occupation_Priv-house-serv,OHE__occupation_Prof-specialty,OHE__occupation_Protective-serv,OHE__occupation_Sales,OHE__occupation_Tech-support,OHE__occupation_Transport-moving,OHE__relationship_Husband,OHE__relationship_Not-in-family,OHE__relationship_Other-relative,OHE__relationship_Own-child,OHE__relationship_Unmarried,OHE__relationship_Wife,OHE__race_Amer-Indian-Eskimo,OHE__race_Asian-Pac-Islander,OHE__race_Black,OHE__race_Other,OHE__race_White,OHE__sex_Female,OHE__sex_Male,OHE__native-country_?,OHE__native-country_Cambodia,OHE__native-country_Canada,OHE__native-country_China,OHE__native-country_Columbia,OHE__native-country_Cuba,OHE__native-country_Dominican-Republic,OHE__native-country_Ecuador,OHE__native-country_El-Salvador,OHE__native-country_England,OHE__native-country_France,OHE__native-country_Germany,OHE__native-country_Greece,OHE__native-country_Guatemala,OHE__native-country_Haiti,OHE__native-country_Holand-Netherlands,OHE__native-country_Honduras,OHE__native-country_Hong,OHE__native-country_Hungary,OHE__native-country_India,OHE__native-country_Iran,OHE__native-country_Ireland,OHE__native-country_Italy,OHE__native-country_Jamaica,OHE__native-country_Japan,OHE__native-country_Laos,OHE__native-country_Mexico,OHE__native-country_Nicaragua,OHE__native-country_Outlying-US(Guam-USVI-etc),OHE__native-country_Peru,OHE__native-country_Philippines,OHE__native-country_Poland,OHE__native-country_Portugal,OHE__native-country_Puerto-Rico,OHE__native-country_Scotland,OHE__native-country_South,OHE__native-country_Taiwan,OHE__native-country_Thailand,OHE__native-country_Trinadad&Tobago,OHE__native-country_United-States,OHE__native-country_Vietnam,OHE__native-country_Yugoslavia,Ordinal__education
0,0.146932,-0.217127,-0.397260,-0.204082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0
1,-0.144804,-0.217127,-0.095890,-0.755102,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0
2,-0.144804,-0.217127,-0.424658,-0.204082,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0
3,-0.144804,-0.217127,-0.013699,-0.204082,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0
4,-0.144804,-0.217127,-0.698630,-0.204082,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.144804,-0.217127,-0.397260,-0.285714,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0
48838,-0.144804,-0.217127,0.287671,-0.204082,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0
48839,-0.144804,-0.217127,-0.424658,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0
48840,0.587220,-0.217127,-0.260274,-0.204082,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0


In [67]:

Data.iloc[5654].to_json()

'{"age":25,"workclass":"Private","fnlwgt":189590,"education":"Bachelors","education-num":13,"marital-status":"Never-married","occupation":"Tech-support","relationship":"Not-in-family","race":"White","sex":"Male","capital-gain":0,"capital-loss":0,"hours-per-week":40,"native-country":"United-States","income":"<=50K"}'

In [68]:
user={"age":[25],"workclass":["Private"],"education":["Bachelors"],
      "marital-status":["Never-married"],
      "occupation":["Tech-support"],
      "relationship":["Not-in-family"],
      "race":["White"],
      "sex":["Gay"],
      "capital-gain":[0],
      "capital-loss":[0],
      "hours-per-week":[40],
      "native-country":["Columbia"]}

In [69]:
pd.DataFrame(user)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,Bachelors,Never-married,Tech-support,Not-in-family,White,Gay,0,0,40,Columbia


In [70]:
Preprocesamiento.transform(pd.DataFrame(user))

array([[-0.14480353, -0.2171271 , -0.78082192, -0.20408163,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [71]:
pd.set_option("display.max_columns", 999)
pd.DataFrame(Preprocesamiento.transform(pd.DataFrame(user)),columns=Preprocesamiento.get_feature_names_out())

Unnamed: 0,Standar__capital-gain,Standar__capital-loss,Rango__age,Rango__hours-per-week,OHE__workclass_?,OHE__workclass_Federal-gov,OHE__workclass_Local-gov,OHE__workclass_Never-worked,OHE__workclass_Private,OHE__workclass_Self-emp-inc,OHE__workclass_Self-emp-not-inc,OHE__workclass_State-gov,OHE__workclass_Without-pay,OHE__marital-status_Divorced,OHE__marital-status_Married-AF-spouse,OHE__marital-status_Married-civ-spouse,OHE__marital-status_Married-spouse-absent,OHE__marital-status_Never-married,OHE__marital-status_Separated,OHE__marital-status_Widowed,OHE__occupation_?,OHE__occupation_Adm-clerical,OHE__occupation_Armed-Forces,OHE__occupation_Craft-repair,OHE__occupation_Exec-managerial,OHE__occupation_Farming-fishing,OHE__occupation_Handlers-cleaners,OHE__occupation_Machine-op-inspct,OHE__occupation_Other-service,OHE__occupation_Priv-house-serv,OHE__occupation_Prof-specialty,OHE__occupation_Protective-serv,OHE__occupation_Sales,OHE__occupation_Tech-support,OHE__occupation_Transport-moving,OHE__relationship_Husband,OHE__relationship_Not-in-family,OHE__relationship_Other-relative,OHE__relationship_Own-child,OHE__relationship_Unmarried,OHE__relationship_Wife,OHE__race_Amer-Indian-Eskimo,OHE__race_Asian-Pac-Islander,OHE__race_Black,OHE__race_Other,OHE__race_White,OHE__sex_Female,OHE__sex_Male,OHE__native-country_?,OHE__native-country_Cambodia,OHE__native-country_Canada,OHE__native-country_China,OHE__native-country_Columbia,OHE__native-country_Cuba,OHE__native-country_Dominican-Republic,OHE__native-country_Ecuador,OHE__native-country_El-Salvador,OHE__native-country_England,OHE__native-country_France,OHE__native-country_Germany,OHE__native-country_Greece,OHE__native-country_Guatemala,OHE__native-country_Haiti,OHE__native-country_Holand-Netherlands,OHE__native-country_Honduras,OHE__native-country_Hong,OHE__native-country_Hungary,OHE__native-country_India,OHE__native-country_Iran,OHE__native-country_Ireland,OHE__native-country_Italy,OHE__native-country_Jamaica,OHE__native-country_Japan,OHE__native-country_Laos,OHE__native-country_Mexico,OHE__native-country_Nicaragua,OHE__native-country_Outlying-US(Guam-USVI-etc),OHE__native-country_Peru,OHE__native-country_Philippines,OHE__native-country_Poland,OHE__native-country_Portugal,OHE__native-country_Puerto-Rico,OHE__native-country_Scotland,OHE__native-country_South,OHE__native-country_Taiwan,OHE__native-country_Thailand,OHE__native-country_Trinadad&Tobago,OHE__native-country_United-States,OHE__native-country_Vietnam,OHE__native-country_Yugoslavia,Ordinal__education
0,-0.144804,-0.217127,-0.780822,-0.204082,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0


In [72]:
import joblib

joblib.dump(Preprocesamiento,'../Modelos/mi_primer_preprocesamiento.joblib')

['../Modelos/mi_primer_preprocesamiento.joblib']