## Preprocesamientos de datos
Usaremos Scikit-Learn para hacer preprocesamiento de los datos, lo que requiere el modelo es una tabla numérica.

In [1]:
import pandas as pd
import numbers as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [3]:
#1 Cargar los datos
Data = pd.read_csv("../Datos/data_adults.csv")

In [4]:
# Inciamos usando SimpleImputer para rellenar valores nulos
imputer_nulls_cat = SimpleImputer(strategy="constant", fill_value="?")
imputer_nulls_num = SimpleImputer(strategy="mean")

In [6]:
pd.DataFrame(imputer_nulls_cat.fit_transform(Data[['workclass']])).value_counts()

0               
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [12]:
## Eliminamos el campo "fnlwgt"
Data.drop("fnlwgt", axis=1)
Data.drop("education-num", axis=1)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [11]:
## Ahora definimos tres tipos de
Data.dtypes


age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

In [17]:
X = Data.drop("income", axis=1)
y = Data["income"]

In [39]:
## Definimos las transformaciones
standar_scaler = StandardScaler()
rango = MinMaxScaler((-1,1))
## el sparse solo se utiliza de forma academica para ver la matriz con las columnas dummisadas pero en la realidad se deja sin el sparce para que sea de forma más eficiente.
one_hot = OneHotEncoder(sparse_output=False)
categories = ['Preschool','1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad', 'Some-college', 'Assoc-acdm', 'Assoc-voc', 'Prof-school', 'Bachelors', 'Masters', 'Doctorate']
ordinalencoder = OrdinalEncoder(categories= [categories])

In [40]:
X['education'].value_counts()

education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64

In [41]:
## Primero creamos cuatro pipeline [imputr null] -> 
trans_num_standard = Pipeline([('imputer_null', imputer_nulls_num),
                                ('standar_scaler', standar_scaler)])

trans_num_rango = Pipeline([('imputer_null', imputer_nulls_num),
                            ('rango',rango)])

trans_cat_OHE = Pipeline([('imputer_null', imputer_nulls_cat),
                          ('OHE',one_hot)])

trans_cat_ordinal = Pipeline([('imputer_null', imputer_nulls_cat),
                             ('Ordinal',ordinalencoder)])

In [46]:
## Definimos las columnas a tratar
col_num_standard =['capital-gain','capital-loss']
col_num_rango = ['age','hours-per-week']

col_cat_OHE=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
col_cat_ordinal=['education']

In [47]:
## finalmente construimos ccolumnas transformar
Preprocesamiento = ColumnTransformer(
transformers =[
    ('Standar', trans_num_standard, col_num_standard),
    ('Rango', trans_num_rango, col_num_rango),
    ('OHE', trans_cat_OHE, col_cat_OHE),
    ('Ordinal', trans_cat_ordinal, col_cat_ordinal),
]
)

In [48]:
Preprocesamiento

In [50]:
result=Preprocesamiento.fit_transform(X)
result

array([[ 0.14693247, -0.2171271 , -0.39726027, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.09589041, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.42465753, ...,  0.        ,
         0.        ,  8.        ],
       ...,
       [-0.14480353, -0.2171271 , -0.42465753, ...,  0.        ,
         0.        , 13.        ],
       [ 0.58722034, -0.2171271 , -0.26027397, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.50684932, ...,  0.        ,
         0.        , 13.        ]])

In [51]:
Preprocesamiento.get_feature_names_out()

array(['Standar__capital-gain', 'Standar__capital-loss', 'Rango__age',
       'Rango__hours-per-week', 'OHE__workclass_?',
       'OHE__workclass_Federal-gov', 'OHE__workclass_Local-gov',
       'OHE__workclass_Never-worked', 'OHE__workclass_Private',
       'OHE__workclass_Self-emp-inc', 'OHE__workclass_Self-emp-not-inc',
       'OHE__workclass_State-gov', 'OHE__workclass_Without-pay',
       'OHE__marital-status_Divorced',
       'OHE__marital-status_Married-AF-spouse',
       'OHE__marital-status_Married-civ-spouse',
       'OHE__marital-status_Married-spouse-absent',
       'OHE__marital-status_Never-married',
       'OHE__marital-status_Separated', 'OHE__marital-status_Widowed',
       'OHE__occupation_?', 'OHE__occupation_Adm-clerical',
       'OHE__occupation_Armed-Forces', 'OHE__occupation_Craft-repair',
       'OHE__occupation_Exec-managerial',
       'OHE__occupation_Farming-fishing',
       'OHE__occupation_Handlers-cleaners',
       'OHE__occupation_Machine-op-inspct',
    

In [52]:
import joblib
joblib.dump('Preprocesamiento', '../Modelos/mi_primer_procesamiento.joblib')

FileNotFoundError: [Errno 2] No such file or directory: '../Modelos/mi_primer_procesamiento.joblib'