In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw passenger table. In this file missing values are written as '?',
# so columns such as age, fare and ticket come in as text rather than numbers.
csv_path = './data/phpMYEkMl-2.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,?,C,?,328,?
1305,3,0,"Zabour, Miss. Thamine",female,?,1,0,2665,14.4542,?,C,?,?,?
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,?,C,?,304,?
1307,3,0,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,?,C,?,?,?


In [None]:
# Cleaning plan:
# fare: fill null values
# age: fill missing values with the mean
# ticket: strip non-numeric text

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

# Helper that turns the '?' missing-value marker into NA for one column
def replace_question_with_nan(X, column):
    """Return a copy of ``X`` in which '?' entries of ``column`` are missing.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that contains ``column``. It is left unmodified.
    column : str
        Label of the column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` where every cell of ``column`` equal to '?' has been
        replaced by ``pd.NA``. All other columns are unchanged.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``X``.
    """
    # Work on a copy: assigning into the frame that was passed in would
    # silently rewrite the caller's data (e.g. the training frame during fit).
    X = X.copy()
    X[column] = X[column].replace('?', pd.NA)
    return X

# Transformer for fare: convert to numeric and fill missing values
class fare_imputer(BaseEstimator, TransformerMixin):
    """Make the 'fare' column numeric and fill missing fares with 0.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for pipeline compatibility)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with a numeric, gap-free 'fare' column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a 'fare' column. It is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` where 'fare' has been converted with
            ``pd.to_numeric(errors='coerce')`` and missing values
            (including the '?' marker) are set to 0.
        """
        # Copy first so the caller's frame is never altered, whatever the
        # helper below does internally.
        X = X.copy()
        X = replace_question_with_nan(X, 'fare')
        fare = pd.to_numeric(X['fare'], errors='coerce')  # non-numeric text -> NaN
        # Assign the filled Series back explicitly. The chained form
        # ``X['fare'].fillna(0, inplace=True)`` acts on a temporary object and
        # does not update ``X`` when pandas Copy-on-Write is enabled.
        X['fare'] = fare.fillna(0)
        return X

# Transformer for age: fill missing values with the mean learned in fit
class age_inputer(BaseEstimator, TransformerMixin):
    """Make the 'age' column numeric and fill gaps with the fitted mean age.

    Attributes
    ----------
    mean_age : float
        Mean of the numeric 'age' values seen in ``fit``. Set by ``fit``;
        ``transform`` reads it.
    """

    def fit(self, X, y=None):
        """Learn the mean age from ``X`` without modifying it.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an 'age' column.
        y : ignored

        Returns
        -------
        age_inputer
            ``self``, with ``mean_age`` set.
        """
        # Series.replace returns a new Series, so the training frame is only
        # read here; '?' and any other non-numeric text end up as NaN and are
        # skipped by mean().
        ages = pd.to_numeric(X['age'].replace('?', pd.NA), errors='coerce')
        self.mean_age = ages.mean()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with a numeric 'age' column and no gaps.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an 'age' column. It is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` where missing ages are replaced by ``mean_age``.

        Raises
        ------
        AttributeError
            If called before ``fit`` (``mean_age`` does not exist yet).
        """
        # Copy first so the caller's frame is never altered.
        X = X.copy()
        X = replace_question_with_nan(X, 'age')
        age = pd.to_numeric(X['age'], errors='coerce')  # non-numeric text -> NaN
        # Explicit assignment instead of the chained
        # ``X['age'].fillna(..., inplace=True)``, which does not update ``X``
        # when pandas Copy-on-Write is enabled.
        X['age'] = age.fillna(self.mean_age)
        return X

# Transformer for ticket: keep only a numeric part of the ticket code
class ticket_transformer(BaseEstimator, TransformerMixin):
    """Reduce the 'ticket' column to a number.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for pipeline compatibility)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with a numeric 'ticket' column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a 'ticket' column. It is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` where each ticket is replaced by the first run of
            digits found in it, or 0 when it contains no digits (this
            includes missing values and the '?' marker).
        """
        # Copy first so the caller's frame is never altered.
        X = X.copy()
        X = replace_question_with_nan(X, 'ticket')
        # astype(str) lets the .str accessor work even when the column was
        # parsed as numbers; missing values become text without digits and
        # therefore fall through to the 0 fill below.
        as_text = X['ticket'].astype(str)
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        # NOTE(review): this keeps the FIRST digit run, so a ticket with a
        # prefix containing digits yields the prefix digits rather than the
        # trailing number - confirm this is the intended encoding.
        digits = as_text.str.extract(r'(\d+)', expand=False)
        # Explicit assignment instead of the chained
        # ``X['ticket'].fillna(0, inplace=True)``, which does not update ``X``
        # when pandas Copy-on-Write is enabled.
        X['ticket'] = pd.to_numeric(digits, errors='coerce').fillna(0)
        return X


In [17]:
from sklearn.pipeline import Pipeline

# Build the cleaning pipeline from the custom transformers.
# Steps run in list order: fare first, then age, then ticket.
cleaning_steps = [
    ('fare_imputer', fare_imputer()),
    ('age_inputer', age_inputer()),
    ('ticket_transformer', ticket_transformer()),
]
titanic_pipeline = Pipeline(steps=cleaning_steps)


In [20]:
# Fit the pipeline and transform in one pass. A copy is handed over so the
# raw `df` stays available for comparison.
raw_copy = df.copy()
titanic_transformed_df = titanic_pipeline.fit_transform(raw_copy)

In [21]:
# Display the cleaned frame: age, ticket and fare now hold numeric values.
titanic_transformed_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.000000,0,0,24160.0,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.916700,1,2,113781.0,151.5500,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.000000,1,2,113781.0,151.5500,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.000000,1,2,113781.0,151.5500,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.000000,1,2,113781.0,151.5500,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.500000,1,0,2665.0,14.4542,?,C,?,328,?
1305,3,0,"Zabour, Miss. Thamine",female,29.881135,1,0,2665.0,14.4542,?,C,?,?,?
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.500000,0,0,2656.0,7.2250,?,C,?,304,?
1307,3,0,"Zakarian, Mr. Ortin",male,27.000000,0,0,2670.0,7.2250,?,C,?,?,?


In [25]:
# Summary of the raw frame. describe() covers numeric columns only, so
# columns still stored as text (e.g. age, ticket, fare) do not appear.
df.describe()

Unnamed: 0,pclass,survived,sibsp,parch
count,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.381971,0.498854,0.385027
std,0.837836,0.486055,1.041658,0.86556
min,1.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0
50%,3.0,0.0,0.0,0.0
75%,3.0,1.0,1.0,0.0
max,3.0,1.0,8.0,9.0


In [24]:
# Summary after cleaning: age, ticket and fare are now numeric and included.
titanic_transformed_df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,ticket,fare
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,225634.0,33.270043
std,0.837836,0.486055,12.883199,1.041658,0.86556,506429.9,51.747063
min,1.0,0.0,0.1667,0.0,0.0,0.0,0.0
25%,2.0,0.0,22.0,0.0,0.0,11769.0,7.8958
50%,3.0,0.0,29.881135,0.0,0.0,36973.0,14.4542
75%,3.0,1.0,35.0,1.0,0.0,345777.0,31.275
max,3.0,1.0,80.0,8.0,9.0,3101317.0,512.3292
