# Feature engineering Titanic

In [31]:
# Librerías de manipulación, análisis y visualización de datos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Librerías de preprocesamiento y modelado
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2, f_classif, mutual_info_classif

#Librerías de modelado y evaluación
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier



## Carga del dataset

In [16]:
def load_data(path='./data/Titanic-Dataset.csv'):
    """Load the Titanic dataset, preferring a local CSV over seaborn's bundled copy.

    Parameters
    ----------
    path : str or path-like, default './data/Titanic-Dataset.csv'
        Location of the CSV file to try first.

    Returns
    -------
    tuple
        ``(df, source)`` where ``df`` is the loaded DataFrame and ``source`` is
        ``"Local file"`` when the CSV was read, or ``"Seaborn"`` when the CSV was
        missing and ``sns.load_dataset('titanic')`` was used instead.
    """
    try:
        return pd.read_csv(path), "Local file"
    except FileNotFoundError:
        # Only a missing file triggers the fallback; parse errors still propagate.
        # NOTE(review): the seaborn copy may not have the same columns as the CSV;
        # confirm that downstream cells cope with both layouts.
        return sns.load_dataset('titanic'), "Seaborn"

# Load the dataset and report which source (local CSV or seaborn) supplied it.
df, source = load_data()
print(f"El dataset fue cargado desde: {source}")

# Preview the first rows of the loaded frame.
df.head()

El dataset fue cargado desde: Local file


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
# Structural overview of the raw frame: shape, per-column dtypes and null counts.
overview = (
    ("Dimnesiones del dataset:", df.shape),
    ("Los tipos de datos son:\n", df.dtypes),
    ("Valores nulos por columna:\n", df.isnull().sum()),
)
for heading, value in overview:
    print(heading, value)

Dimnesiones del dataset: (891, 12)
Los tipos de datos son:
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
Valores nulos por columna:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [18]:
# Normalise column names to lower case so later cells can use a single spelling.
df.columns = df.columns.map(str.lower)
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# Candidate feature columns: keep only the wanted ones that actually exist in df.
wanted_columns = ('pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked')
candidate_features = [col for col in wanted_columns if col in df.columns]

print("Campos candidatos para features:", candidate_features)

# Independent working copy holding the candidate features plus the target column.
selected_columns = [*candidate_features, 'survived']
base_df = df[selected_columns].copy()

Campos candidatos para features: ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']


In [23]:
# Preview the working frame.
# NOTE(review): the saved output of this cell already shows a `family_size` column,
# which is only created in the feature-engineering cell further down. The cell appears
# to have been executed out of order; restart the kernel and run all cells to confirm.
base_df.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,survived,family_size
0,3,male,22.0,1,0,7.25,S,0,2
1,1,female,38.0,1,0,71.2833,C,1,2
2,3,female,26.0,0,0,7.925,S,1,1
3,1,female,35.0,1,0,53.1,S,1,2
4,3,male,35.0,0,0,8.05,S,0,1


## Ingeniería de features

In [None]:
# Every derived column is computed from base_df itself (not from the raw df), so this
# cell depends on a single frame and stays correct if base_df is ever filtered or reindexed.

# Family size: sibsp + parch + 1 (missing counts are treated as 0).
base_df['family_size'] = base_df[['sibsp', 'parch']].fillna(0).sum(axis=1) + 1

# Travelling alone? 1 when family_size == 1, else 0.
base_df['is_alone'] = (base_df['family_size'] == 1).astype(int)

# log(1 + fare) to reduce skewness and the weight of outliers.
base_df['log_fare'] = np.log1p(base_df['fare'])

# Age groups (child, young, adult, senior). The outer edges are infinite so that no
# age can fall outside the bins; rows with a missing age get a missing age_group.
bins = [-np.inf, 12, 25, 55, np.inf]
labels = ['child', 'young', 'adult', 'senior']
base_df['age_group'] = pd.cut(base_df['age'], bins=bins, labels=labels)

# Interaction between passenger class and sex, e.g. "3_male".
pclass_str = base_df['pclass'].astype(str)
base_df['pclass_sex'] = pclass_str + '_' + base_df['sex']

base_df.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,survived,family_size,is_alone,log_fare,age_group,pclass_sex
0,3,male,22.0,1,0,7.25,S,0,2,0,2.110213,young,3_male
1,1,female,38.0,1,0,71.2833,C,1,2,0,4.280593,adult,1_female
2,3,female,26.0,0,0,7.925,S,1,1,1,2.188856,adult,3_female
3,1,female,35.0,1,0,53.1,S,1,2,0,3.990834,adult,1_female
4,3,male,35.0,0,0,8.05,S,0,1,1,2.202765,adult,3_male


## Preprocesamiento

In [25]:
# Columns routed through the numeric branch of the preprocessor.
num_features = [
    'age',
    'sibsp',
    'parch',
    'fare',
    'family_size',
    'log_fare',
]
# Columns routed through the categorical (one-hot) branch.
cat_features = [
    'pclass',
    'sex',
    'embarked',
    'is_alone',
    'age_group',
    'pclass_sex',
]

# Feature matrix (numeric columns first, then categorical) and target vector.
X = base_df[[*num_features, *cat_features]]
y = base_df['survived']

In [26]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

(X_train.shape, X_test.shape)

((712, 12), (179, 12))

In [27]:
# Shapes of the target vectors, for comparison with the feature-matrix shapes above.
y_train.shape, y_test.shape

((712,), (179,))

In [34]:
# Preprocessing (imputation, scaling and encoding)

# Numeric branch, standardised: median imputation followed by StandardScaler.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Numeric branch for the chi2 filter: median imputation followed by MinMaxScaler.
# chi2 needs non-negative inputs, which StandardScaler does not guarantee.
numeric_chi2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Standardised variant under its own name. It used to be assigned to `preprocessor`
# and was immediately overwritten by the MinMax variant, which made it dead code.
preprocessor_standard = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features)
])

# Preprocessor used by the feature-selection pipelines (MinMax-scaled numerics).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_chi2, num_features),
    ('cat', categorical_pipeline, cat_features)
])

In [35]:
# Helpers for recovering feature names
def get_feature_names_after_preprocess(prep: "ColumnTransformer", cat_feats, Xsample: "pd.DataFrame | None" = None):
    """Return the output column names of a fitted preprocessing ColumnTransformer.

    Only the transformers named ``"num"`` and ``"cat"`` are considered; any other
    entry in ``prep.transformers_`` is skipped. Names are emitted in the order the
    transformers appear in ``prep.transformers_``.

    Parameters
    ----------
    prep : ColumnTransformer
        Fitted transformer. Its ``"cat"`` entry must expose
        ``named_steps["onehot"]`` with a ``get_feature_names_out`` method.
    cat_feats : sequence of str
        Input categorical column names, forwarded to ``get_feature_names_out``.
    Xsample : pandas.DataFrame, optional
        Unused; accepted only so existing three-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        1-D object array of feature names, suitable for boolean-mask indexing.
    """
    names = []
    for name, transformer, columns in prep.transformers_:
        if name == "num":
            # Numeric columns keep the names they entered with.
            names.extend(columns)
        elif name == "cat":
            # Categorical columns are expanded by the one-hot encoder.
            onehot = transformer.named_steps["onehot"]
            names.extend(onehot.get_feature_names_out(cat_feats))
    return np.asarray(names, dtype=object)

## Selección de features

In [36]:
# Filter-based feature selection
k = 15  # number of transformed columns each filter keeps
sel_chi2 = SelectKBest(score_func=chi2, k=k)
sel_anova = SelectKBest(score_func=f_classif, k=k)
# Built but not fitted in this cell: mutual information is left as pending work.
sel_mi = SelectKBest(score_func=mutual_info_classif, k=k)

# Full pipelines: preprocessing followed by the filter.
# Both pipelines hold the same `preprocessor` object, so each fit refits it in place.
pipe_chi2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', sel_chi2),
])

pipe_annova = Pipeline(steps=[
    ("prep", preprocessor),
    ("select", sel_anova)
])

# Fit both pipelines on the training split only.
pipe_chi2.fit(X_train, y_train)
pipe_annova.fit(X_train, y_train)

# Feature names after preprocessing, read from the fitted step inside the pipeline
# rather than from the global name.
fitted_preprocessor = pipe_chi2.named_steps['preprocessor']
feature_names = get_feature_names_after_preprocess(fitted_preprocessor, cat_features, X_train)

# Boolean support masks over the transformed columns.
mask_chi2 = pipe_chi2.named_steps['selector'].get_support()
mask_anova = pipe_annova.named_steps['select'].get_support()

# The masks are applied positionally, so names and masks must describe the same columns.
assert len(feature_names) == len(mask_chi2) == len(mask_anova), (
    "feature_names does not match the number of columns produced by the preprocessor"
)

selected_chi2 = feature_names[mask_chi2]
selected_anova = feature_names[mask_anova]

print("Top-K por Chi2:\n", selected_chi2)
print("\nTop-K por ANOVA (f_classif):\n", selected_anova)


Top-K por Chi2:
 ['fare' 'pclass_1' 'pclass_2' 'pclass_3' 'sex_female' 'sex_male'
 'embarked_C' 'is_alone_0' 'is_alone_1' 'age_group_child'
 'pclass_sex_1_female' 'pclass_sex_2_female' 'pclass_sex_2_male'
 'pclass_sex_3_female' 'pclass_sex_3_male']

Top-K por ANOVA (f_classif):
 ['fare' 'log_fare' 'pclass_1' 'pclass_3' 'sex_female' 'sex_male'
 'embarked_C' 'embarked_S' 'is_alone_0' 'is_alone_1' 'age_group_child'
 'pclass_sex_1_female' 'pclass_sex_2_female' 'pclass_sex_2_male'
 'pclass_sex_3_male']
