In [1]:
! pip install -U feature-engine



In [2]:
# ALL IMPORTS
# data manipulation

import pandas as pd
import numpy as np

#for saving the pipeline
import joblib

#from Scikit-learn
from sklearn.pipeline import Pipeline

#from feature-engine
from feature_engine.imputation import (
    MeanMedianImputer,
    AddMissingIndicator
)

from feature_engine.selection import DropFeatures

#to separate training and test
from sklearn.model_selection import train_test_split

#the model
from sklearn.linear_model import LogisticRegression

In [3]:
# LOAD DATA
# The source file is semicolon-delimited, so the separator is passed explicitly.
# NOTE(review): the first displayed row shows fare = 2113375.0 while the others
# show 151.55 -- possibly a decimal-separator problem in the CSV; confirm.

file_name = "titanic.csv"
df = pd.read_csv(file_name, delimiter=';')
df.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,2113375.0,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,0


In [4]:
# Feature matrix: every raw input column, in this order.
X = df.loc[:, [
    "pclass",
    "name",
    "sex",
    "age",
    "sibsp",
    "parch",
    "ticket",
    "fare",
    "cabin",
    "embarked",
]]
# Target vector: the 'survived' column.
Y = df.loc[:, "survived"]

In [5]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

# Re-wrap the feature splits as DataFrames carrying the same columns as X.
X_train, X_test = (
    pd.DataFrame(part, columns=list(X.columns)) for part in (X_train, X_test)
)
# Turn each label Series into a one-column DataFrame named 'survived'.
Y_train, Y_test = (
    pd.DataFrame(part, columns=['survived']) for part in (Y_train, Y_test)
)

In [6]:
# CONFIGURATION

# Numerical variables that get a missing-value indicator and mean imputation.
NUMERICAL_VARS_WITH_NA = ['age']

# Columns removed by the pipeline's first step.
DROP_FEATURES = [
    "name",
    "sex",
    "sibsp",
    "parch",
    "ticket",
    "fare",
    "cabin",
    "embarked",
]

# The variables kept for the model (everything in X that is not dropped above).
# NOTE(review): FEATURES is not referenced by any other cell visible in this notebook.
FEATURES = ['pclass', 'age']

In [7]:
# Assemble the modelling pipeline; the steps run in the order listed.
genero_pipe = Pipeline(steps=[

    # ==== FEATURE SELECTION ====
    # discard every column named in DROP_FEATURES
    ('drop_features', DropFeatures(features_to_drop=DROP_FEATURES)),

    # ==== IMPUTATION ====
    # add a missing-value indicator first, before the gaps are filled in
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    # then impute the numerical variables with the mean
    ('mean_imputation', MeanMedianImputer(
        variables=NUMERICAL_VARS_WITH_NA,
        imputation_method='mean',
    )),

    # ==== MODEL ====
    # logistic regression with its default settings
    ('LogisticRegression', LogisticRegression()),
])

In [8]:
# Fit every pipeline step on the training split.
# The one-column label frame is flattened to a 1-D array before being passed as y.
genero_pipe.fit(X_train, Y_train.to_numpy().ravel())

In [9]:
# Score one hand-made passenger record with the fitted pipeline.
# The record supplies every raw input column, in the same order as the training frame.
ejemplo = pd.DataFrame([{
    "pclass": 1,
    "name": "Maria Solana",
    "sex": "female",
    "age": 36.0000,
    "sibsp": 2,
    "parch": 0,
    "ticket": 24165,
    "fare": 70.00,
    "cabin": "MS35 DD38",
    "embarked": "S",
}])
pred = genero_pipe.predict(ejemplo)
# Print the predicted class for the single row.
print(pred[0])

1


In [10]:
# Persist the fitted pipeline to disk.
# The file is opened in a `with` block so the handle is flushed and closed as soon
# as the dump finishes, instead of being left open for the garbage collector.
with open('cfk.pkl', 'wb') as pipeline_file:
    joblib.dump(genero_pipe, pipeline_file)

In [11]:
# Write the feature splits to semicolon-delimited CSV files, row index included.
# Only the feature frames are exported here; the label frames are not written.
X_train.to_csv(path_or_buf='train.csv', sep=';', index=True)
X_test.to_csv(path_or_buf='test.csv', sep=';', index=True)