In [None]:
! pip install -U feature-engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature-engine
  Downloading feature_engine-1.6.1-py2.py3-none-any.whl (326 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.6/326.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.1


In [None]:
# TODOS LOS IMPORTS
# data manipulation and plotting

import pandas as pd
import numpy as np

#for saving the pipeline
import joblib

#from Scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#from feature-engine
from feature_engine.imputation import (
    MeanMedianImputer,
    AddMissingIndicator
)

from feature_engine.selection import DropFeatures
from feature_engine.encoding import OrdinalEncoder

#to separate training and test
from sklearn.model_selection import train_test_split

#the model
from sklearn.linear_model import LogisticRegression


In [None]:
# Load the raw Titanic data from a semicolon-delimited CSV file.
# NOTE(review): the preview below shows fare = 2113375.0 for the first row,
# which looks like a lost decimal point - verify the source file.

file_name = "/content/titanic.csv"
data = pd.read_csv(
    file_name,
    sep=";",
)
data.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,2113375.0,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,0


In [None]:
# Count the missing values in each column of the raw data
nan_counts = data.isnull().sum(axis=0)
print(nan_counts)

pclass         0
name           0
sex            0
age          263
sibsp          0
parch          0
ticket         0
fare           0
cabin       1013
embarked       2
survived       0
dtype: int64


In [None]:
# Feature-engineering configuration

# Name of the label column
target = 'survived'

# Columns removed by the preprocessing pipeline
DROP_FEATURES = [
    'name',
    'sibsp',
    'parch',
    'ticket',
    'cabin',
    'embarked',
]

# Numerical variables that contain NaNs
NUMERICAL_VARS_WITH_NA = ['age']

# Columns used as model inputs
FEATURES = [
    'pclass',
    'sex',
    'age',
    'fare',
]

In [None]:
# Build the preprocessing pipeline. Steps run in the order listed.
pipeline = Pipeline(steps=[
    # 1. Remove the columns listed in DROP_FEATURES
    (
        'drop_features',
        DropFeatures(features_to_drop=DROP_FEATURES),
    ),
    # 2. Add a missing-value indicator for the numerical variables with NaNs
    (
        'missing_indicator',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA),
    ),
    # 3. Impute those numerical variables with the mean
    (
        'mean_imputation',
        MeanMedianImputer(
            imputation_method='mean',
            variables=NUMERICAL_VARS_WITH_NA,
        ),
    ),
    # 4. Encode the categorical variable "sex" as numbers
    (
        'categorical_encoding',
        OrdinalEncoder(
            encoding_method='arbitrary',
            variables=['sex'],
        ),
    ),
])

In [None]:
# Fit every step of the preprocessing pipeline on the full raw dataset.
# NOTE(review): Pipeline.fit returns the fitted pipeline itself, so `df` holds
# the estimator here, not a DataFrame; a later cell rebinds `df` to the
# transformed data.
# NOTE(review): fitting happens before the train/test split, so the imputation
# mean is learned from rows that later land in the test set - confirm this is
# acceptable.
df = pipeline.fit(data)

In [None]:
# Display the object returned by pipeline.fit (bound to `df` in the previous cell)
df

In [None]:
# Persist the data-transformation pipeline to disk
joblib.dump(pipeline, "pipeline.pkl")

['pipeline.pkl']

In [None]:
# Run the raw data through the preprocessing pipeline; `df` now holds the
# transformed DataFrame.
df = pipeline.transform(data)

In [None]:
# Preview the raw data for comparison with the transformed output
data.head(3)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,2113375.0,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0


In [None]:
# Preview the transformed data produced by the pipeline
df.head(3)

Unnamed: 0,pclass,sex,age,fare,survived,age_na
0,1,0,29.0,2113375.0,1,0
1,1,1,0.9167,151.55,1,0
2,1,0,2.0,151.55,0,0


In [None]:
# Separate the transformed DataFrame into features (X) and target variable (Y)
Y = df[target]
X = df.drop(columns=[target])

# Show the first rows of each
display(X.head(3), Y.head(3))

Unnamed: 0,pclass,sex,age,fare,age_na
0,1,0,29.0,2113375.0,0
1,1,1,0.9167,151.55,0
2,1,0,2.0,151.55,0


0    1
1    1
2    0
Name: survived, dtype: int64

In [None]:
# Split features and labels into train and test partitions

X_train, X_test, Y_train, Y_test = train_test_split(
    X,  # features
    Y,  # labels (the original passed lowercase `y`, which is never defined)
    test_size=0.3,  # fraction of rows held out for testing
    random_state=0,  # fixed seed so the split is reproducible
)

# Keep only the model input columns listed in FEATURES.
# NOTE(review): this drops the `age_na` indicator column produced by the
# preprocessing pipeline - confirm that is intentional.
X_train = X_train[FEATURES]
X_test = X_test[FEATURES]

# Wrap the label Series in single-column DataFrames named 'survived'
Y_train = Y_train.to_frame(name='survived')
Y_test = Y_test.to_frame(name='survived')

In [None]:


# Define the modelling pipeline: feature scaling followed by logistic regression
genero_pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', LogisticRegression()),
    ]
)


In [None]:
# Fit the modelling pipeline on the training data; the label frame is
# flattened to a 1-D array first
genero_pipe.fit(
    X=X_train,
    y=Y_train.values.ravel(),
)

# Predict labels for the test data
y_pred = genero_pipe.predict(X=X_test)

In [None]:
# Evaluate the model on the test set using accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
# NOTE(review): the printed label reads "Precisión" but the value is the
# accuracy score - confirm the wording is what you want.
print("Precisión:", accuracy)

Precisión: 0.7964376590330788


In [None]:
# Persist the fitted modelling pipeline. The context manager closes the file
# handle (the original left the handle from open() unclosed).
with open('model.pkl', 'wb') as model_file:
    joblib.dump(genero_pipe, model_file)

In [None]:
# Export the train and test feature frames as semicolon-delimited CSV files
# (only the feature columns are written; the labels are not included)
X_train.to_csv(
    'train.csv',
    sep=';',
)
X_test.to_csv(
    'test.csv',
    sep=';',
)

In [None]:
# Predicted labels and column 1 of the predict_proba output for the test set
predictions = genero_pipe.predict(X_test)
probabilities = genero_pipe.predict_proba(X_test)[:, 1]

# Put true labels, predictions and probabilities side by side
results = pd.DataFrame(
    {
        'survived': Y_test.values.ravel(),
        'prediction': predictions.ravel(),
        'probability': probabilities.ravel(),
    }
)

print(results.head(10))

   survived  prediction  probability
0         0           1     0.693998
1         0           0     0.188883
2         0           0     0.219060
3         1           1     0.560991
4         0           0     0.116005
5         0           0     0.093416
6         1           0     0.283767
7         1           0     0.255312
8         0           0     0.093416
9         0           0     0.093416
