In [22]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

# Cargar datos (ajusta el path si es necesario)
df = pd.read_csv('./titanic_train.csv')

# Drop columnas irrelevantes
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [23]:
# Imputar 'Embarked' con moda
from sklearn.impute import SimpleImputer
imputer_mode = SimpleImputer(strategy='most_frequent')
df[['Embarked']] = imputer_mode.fit_transform(df[['Embarked']])

# Imputar 'Cabin' con 'Unknown'
imputer_const = SimpleImputer(strategy='constant', fill_value='Unknown')
df[['Cabin']] = imputer_const.fit_transform(df[['Cabin']])

# Feature Deck extraída de Cabin (primera letra)
df['Deck'] = df['Cabin'].str[0]

In [24]:
# Codificar variables categóricas
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

encoder_embarked = OrdinalEncoder(categories=[['S', 'C', 'Q']])
df[['Embarked']] = encoder_embarked.fit_transform(df[['Embarked']])

encoder_deck = OrdinalEncoder(
    categories=[["A", "B", "C", "D", "E", "F", "G", "T", "U"]]
)
df[["Deck"]] = encoder_deck.fit_transform(df[["Deck"]])

encoder_sex = OneHotEncoder(drop='first', sparse_output=False)
sex_encoded = encoder_sex.fit_transform(df[['Sex']])
df['Sex_male'] = sex_encoded


In [25]:
# Crear feature FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Separar datos con y sin edad
df_known_age = df[df['Age'].notnull()]
df_unknown_age = df[df['Age'].isnull()]

In [26]:
# Features para predecir edad (incluyo FamilySize ahora)
features_for_age = ['Fare', 'Parch', 'Pclass', 'SibSp', 'Deck', 'FamilySize', 'Sex_male', 'Embarked']
X_age_train = df_known_age[features_for_age]
y_age_train = df_known_age['Age']
X_age_pred = df_unknown_age[features_for_age]

# Entrenar XGBRegressor para imputar edad
xgb_reg = XGBRegressor(random_state=42, n_estimators=100)
xgb_reg.fit(X_age_train, y_age_train)


In [27]:
# Predecir y completar edades faltantes
predicted_ages = xgb_reg.predict(X_age_pred)
df.loc[df['Age'].isnull(), 'Age'] = predicted_ages

print("Imputación de Edad completada con XGBRegressor.")

Imputación de Edad completada con XGBRegressor.


In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Elegir features para clasificación
features_classification = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Sex_male', 'Deck', 'FamilySize']

X = df[features_classification]
y = df['Survived']

# Escalar features numéricas (dejo Embarked, Sex_male, Deck sin escalar porque son ordinales/categóricos codificados)
scaler = StandardScaler()
num_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
X[num_features] = scaler.fit_transform(X[num_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_features] = scaler.fit_transform(X[num_features])


In [29]:
# División train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Entrenar XGBoost Classifier con hiperparámetros básicos (puedes ajustar después)
xgb_clf = XGBClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=10,
    min_child_weight=3,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [30]:
# Predecir y evaluar
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy con XGBoost (tunning básico): {accuracy*100:.2f}%")

Accuracy con XGBoost (tunning básico): 81.56%


In [31]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

params = {
    'n_estimators': randint(100, 400),
    'max_depth': randint(3, 15),
    'min_child_weight': uniform(1, 5),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'learning_rate': uniform(0.01, 0.2),
}

xgb_cv = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

random_search = RandomizedSearchCV(
    estimator=xgb_cv,
    param_distributions=params,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X, y)
print("Mejores parámetros:", random_search.best_params_)
print(f"Mejor Accuracy CV: {random_search.best_score_ * 100:.2f}%")


Fitting 5 folds for each of 100 candidates, totalling 500 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Mejores parámetros: {'colsample_bytree': 0.8191887532992349, 'learning_rate': 0.09469418846162371, 'max_depth': 10, 'min_child_weight': 5.03417369633632, 'n_estimators': 276, 'subsample': 0.892659102119856}
Mejor Accuracy CV: 84.96%
