In [1]:
#conda install -c conda-forge feature_engine
# conda install -c anaconda flask

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer
from feature_engine.transformation import PowerTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

from sklearn.metrics import mean_squared_error 

import joblib

In [2]:
#Librería creada
import my_preprocessors as mypp 

In [3]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
cat_vars = [var for var in data.columns if data[var].dtype == 'O']
#cat_vars

#Agregamos variables que por definición son categóricas
cat_vars = cat_vars + ['Pclass'] + ['SibSp'] + ['Parch']
#cat_vars

data[cat_vars] = data[cat_vars].astype('O')

# Cantidad de variables categóricas
#len(cat_vars)

In [5]:
# Se separa data para train y test
X_train, X_test, y_train, y_test = train_test_split(
                        data.drop(['PassengerId', 'Survived'], axis=1),
                        data['Survived'],
                        test_size=0.15,
                        random_state=2021)

X_train.shape, X_test.shape

((757, 10), (134, 10))

In [6]:
# No se usa np.log por datos negatvos, genera error

y_train = np.sqrt(np.power(y_train,2))
y_test = np.sqrt(np.power(y_test,2))
#y_train
#y_test
#X_train

In [7]:
cat_vars_with_na = [var for var in cat_vars 
                        if X_train[var].isnull().sum() > 0]

X_train[cat_vars_with_na].isnull().mean().sort_values(ascending = False)

# variables tratadas con etiqueta de faltante por cantidad masiva de faltantes.
vars_with_missing_string = [var for var in cat_vars_with_na
                               if X_train[var].isnull().mean() > 0.2]

#variables tratadas con procedimiento por candiad adecuada de faltantes.
vars_freq_category = [var for var in cat_vars_with_na
                               if X_train[var].isnull().mean() <= 0.2]

#Faltantes con etiqueta missing
X_train[vars_with_missing_string] = X_train[vars_with_missing_string].fillna('Missing')
X_test[vars_with_missing_string] = X_test[vars_with_missing_string].fillna('Missing')

for var in vars_freq_category:
    mode = X_train[var].mode()[0]
    
    X_train[var].fillna(mode, inplace=True)
    X_test[var].fillna(mode, inplace=True)

cat_vars_with_na = [var for var in cat_vars 
                        if X_train[var].isnull().sum() > 0]

num_vars = [var for var in X_train.columns
               if var not in cat_vars and var != 'Survived']

# Numéricas con faltantes
num_vars_with_na = [var for var in num_vars
                       if X_train[var].isnull().sum() > 0]

for var in num_vars_with_na:
    mean_val = X_train[var].mean()
    
    X_train[var].fillna(mean_val, inplace=True)
    X_test[var].fillna(mean_val, inplace=True)

for var in ['Fare']:
    X_train[var] = np.sqrt(np.power(X_train[var],2))
    X_test[var] = np.sqrt(np.power(X_test[var],2))

In [8]:
X_test[vars_with_missing_string] = X_test[vars_with_missing_string].fillna('Missing')
X_test['Age'].isnull().sum()
#X_test[vars_with_missing_string]

0

## Configuración del Machine Learning Pipeline

In [9]:
#Variables categoricas con NA
CATEGORICAL_VARS_WITH_NA_MISSING = ['SibSp', 'Parch','Pclass']

#Variables numéricas con NA
NUMERICAL_VARS_WITH_NA = ['Age']


#Varibles para transformación logaritmia
NUMERICALS_LOG_VARS = ['Fare']

#Variables para hacer mapeo categorico por codificación ordinal
SEX_VARS = ['Sex']

#Variables categoricas a codificar sin ordinalidad
#CATEGORICAL_VARS = ['Cabin', 'Embarked', 'SibSp', 'Parch', 'Pclass']

#Mapeos de variables categoricas
SEX_MAPPINGS = {'male':1, 'female':0}

#Variables seleccionadas según análisis de Lasso
FEATURES = ['Sex', 'Age', 'Fare','SibSp', 'Parch','Pclass']

In [10]:
#Selección de variables para entrenamiento
X_train = X_train[FEATURES]
X_train

Unnamed: 0,Sex,Age,Fare,SibSp,Parch,Pclass
856,female,45.000000,164.8667,1,1,1
685,male,25.000000,41.5792,1,2,2
95,male,29.512678,8.0500,0,0,3
419,female,10.000000,24.1500,0,2,3
208,female,16.000000,7.7500,0,0,3
...,...,...,...,...,...,...
621,male,42.000000,52.5542,1,0,1
128,female,29.512678,22.3583,1,1,3
57,male,28.500000,7.2292,0,0,3
341,female,24.000000,263.0000,3,2,1


In [11]:
#Seleccionamos variables para predicción
#X_test = pd.read_csv("test.csv")
#X_test = X_test[FEATURES]
#X_test

## Machine Learing PipeLine

In [12]:
Survived_pipeline = Pipeline([
    
    #============= IMPUTACIONES ===================#
    
    #1. Imputación de varaibles categoricas
    ('missing_imputation', 
         CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    #3. Indicamos Faltante en variables numéricas para imputar
    #('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    
    #4. Imputación de mediana para variables categoricas
    ('mean_imputation', MeanMedianImputer(
     imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)),
    
  
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    #5. Transformación logaritmica
    ('pow', PowerTransformer(variables=NUMERICALS_LOG_VARS)),
    
    
    #=============== CODIFICACION DE VARIABLES CATEGORICAS ORDINALES ==============
    ('mapper_sex', mypp.Mapper(
        variables=SEX_VARS, mappings=SEX_MAPPINGS)),
    
    
     
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('Lasso', Lasso(alpha=0.01, random_state=2022)),
    
]) 

In [13]:
X_train

Unnamed: 0,Sex,Age,Fare,SibSp,Parch,Pclass
856,female,45.000000,164.8667,1,1,1
685,male,25.000000,41.5792,1,2,2
95,male,29.512678,8.0500,0,0,3
419,female,10.000000,24.1500,0,2,3
208,female,16.000000,7.7500,0,0,3
...,...,...,...,...,...,...
621,male,42.000000,52.5542,1,0,1
128,female,29.512678,22.3583,1,1,3
57,male,28.500000,7.2292,0,0,3
341,female,24.000000,263.0000,3,2,1


In [14]:
Survived_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation',
                 CategoricalImputer(variables=['SibSp', 'Parch', 'Pclass'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Age'])),
                ('pow', PowerTransformer(variables=['Fare'])),
                ('mapper_sex',
                 Mapper(mappings={'female': 0, 'male': 1}, variables=['Sex'])),
                ('scaler', MinMaxScaler()),
                ('Lasso', Lasso(alpha=0.01, random_state=2022))])

In [15]:
#Seleccionamos variables para predicción
#X_test = pd.read_csv("test.csv")
X_test = X_test[FEATURES]
X_test

Unnamed: 0,Sex,Age,Fare,SibSp,Parch,Pclass
210,male,24.000000,7.0500,0,0,3
876,male,20.000000,9.8458,0,0,3
666,male,25.000000,13.0000,0,0,2
819,male,10.000000,27.9000,3,2,3
736,female,48.000000,34.3750,1,3,3
...,...,...,...,...,...,...
41,female,27.000000,21.0000,1,0,2
187,male,45.000000,26.5500,0,0,1
46,male,29.512678,15.5000,1,0,3
179,male,36.000000,0.0000,0,0,3


In [16]:
cat_vars = [var for var in X_test.columns if X_test[var].dtype == 'O']
cat_vars

#Agregamos variables que por definición son categóricas
#cat_vars = cat_vars + ['Pclass'] + ['SibSp'] + ['Parch']
#cat_vars

data[cat_vars] = X_test[cat_vars].astype('O')

# Cantidad de variables categóricas
#len(cat_vars)

In [17]:
cat_vars 

['Sex', 'SibSp', 'Parch', 'Pclass']

In [18]:
cat_vars_with_na = [var for var in cat_vars 
                        if X_test[var].isnull().sum() > 0]

# variables tratadas con etiqueta de faltante por cantidad masiva de faltantes.
vars_with_missing_string = [var for var in cat_vars_with_na
                               if X_test[var].isnull().mean() > 0.2]

# variables tratadas con procedimiento por candiad adecuada de faltantes.
vars_freq_category = [var for var in cat_vars_with_na
                               if X_test[var].isnull().mean() <= 0.2]

# Faltantes con etiqueta missing
X_train[vars_with_missing_string] = X_train[vars_with_missing_string].fillna('Missing')
X_test[vars_with_missing_string] = X_test[vars_with_missing_string].fillna('Missing')

for var in vars_freq_category:
    mode = X_test[var].mode()[0]
    
    X_train[var].fillna(mode, inplace=True)
    X_test[var].fillna(mode, inplace=True)
    
num_vars = [var for var in X_test.columns
               if var not in cat_vars and var != 'Survived']

# Numéricas con faltantes
num_vars_with_na = [var for var in num_vars
                       if X_test[var].isnull().sum() > 0]

for var in num_vars_with_na:
    mean_val = X_test[var].mean()
    
    X_train[var].fillna(mean_val, inplace=True)
    X_test[var].fillna(mean_val, inplace=True)

for var in ['Fare']:
    X_train[var] = np.sqrt(np.power(X_train[var],2))
    X_test[var] = np.sqrt(np.power(X_test[var],2))
    
sesgadas = ['Fare']

for var in sesgadas:
    X_train[var] = np.where(X_train[var] == 0, 0, 1)
    X_test[var] = np.where(X_test[var] == 0, 0, 1)

In [19]:
X_test

Unnamed: 0,Sex,Age,Fare,SibSp,Parch,Pclass
210,male,24.000000,1,0,0,3
876,male,20.000000,1,0,0,3
666,male,25.000000,1,0,0,2
819,male,10.000000,1,3,2,3
736,female,48.000000,1,1,3,3
...,...,...,...,...,...,...
41,female,27.000000,1,1,0,2
187,male,45.000000,1,0,0,1
46,male,29.512678,1,1,0,3
179,male,36.000000,0,0,0,3


In [20]:
X_test.isnull().sum()

Sex       0
Age       0
Fare      0
SibSp     0
Parch     0
Pclass    0
dtype: int64

In [21]:
preds = Survived_pipeline.predict(X_test)
preds

array([0.11145379, 0.11145379, 0.24118642, 0.11145379, 0.61922106,
       0.61922106, 0.87868631, 0.11145379, 0.24118642, 0.11145379,
       0.11145379, 0.37091904, 0.11145379, 0.11145379, 0.74895369,
       0.61922106, 0.11145379, 0.37091904, 0.11145379, 0.87868631,
       0.37091904, 0.11145379, 0.61922106, 0.61922106, 0.11145379,
       0.11145379, 0.11145379, 0.11145379, 0.87868631, 0.24118642,
       0.61922106, 0.11145379, 0.11145379, 0.11145379, 0.37091904,
       0.74895369, 0.11145379, 0.11145379, 0.37091904, 0.61922106,
       0.61922106, 0.11145379, 0.87868631, 0.11145379, 0.11145379,
       0.11145379, 0.11145379, 0.61922106, 0.37091904, 0.24118642,
       0.87868631, 0.74895369, 0.74895369, 0.61922106, 0.61922106,
       0.11145379, 0.24118642, 0.61922106, 0.74895369, 0.87868631,
       0.24118642, 0.24118642, 0.61922106, 0.61922106, 0.37091904,
       0.61922106, 0.11145379, 0.74895369, 0.11145379, 0.11145379,
       0.74895369, 0.74895369, 0.11145379, 0.61922106, 0.61922

In [22]:
#Survived_pipeline.transform(X_test)

In [23]:
mean_squared_error(np.exp(y_test), np.exp(preds), squared=False)

0.7632295207202568

In [24]:
X_test

Unnamed: 0,Sex,Age,Fare,SibSp,Parch,Pclass
210,male,24.000000,1,0,0,3
876,male,20.000000,1,0,0,3
666,male,25.000000,1,0,0,2
819,male,10.000000,1,3,2,3
736,female,48.000000,1,1,3,3
...,...,...,...,...,...,...
41,female,27.000000,1,1,0,2
187,male,45.000000,1,0,0,1
46,male,29.512678,1,1,0,3
179,male,36.000000,0,0,0,3


In [25]:
#Guardamos pipeline
joblib.dump(Survived_pipeline, 'Survived_pipeline.pkl')

['Survived_pipeline.pkl']

In [26]:
#X_train