 # ****importacion****

In [1]:
import pandas as pd
import numpy as np
import joblib

from xgboost import XGBRegressor, XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint, uniform

#### Carga de datos

In [2]:
# Cargar datos
df = pd.read_csv('titanic_train.csv')

# Preprocesamiento inicial
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [3]:
# Imputaciones
df[['Embarked']] = SimpleImputer(strategy='most_frequent').fit_transform(df[['Embarked']])
df[['Cabin']] = SimpleImputer(strategy='constant', fill_value='Unknown').fit_transform(df[['Cabin']])
df['Deck'] = df['Cabin'].str[0]

In [4]:
# Codificaciones
df[['Embarked']] = OrdinalEncoder(categories=[['S', 'C', 'Q']]).fit_transform(df[['Embarked']])
df[['Deck']] = OrdinalEncoder(categories=[["A", "B", "C", "D", "E", "F", "G", "T", "U"]]).fit_transform(df[['Deck']])

# Codificación one-hot para 'Sex'
sex_encoder = OneHotEncoder(drop='first', sparse_output=False)
df['Sex_male'] = sex_encoder.fit_transform(df[['Sex']])

In [5]:
# Feature de tamaño de familia
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

In [6]:
# Imputar edad con XGBRegressor
features_age = ['Fare', 'Parch', 'Pclass', 'SibSp', 'Deck', 'FamilySize', 'Sex_male', 'Embarked']
df_known_age = df[df['Age'].notnull()]
df_unknown_age = df[df['Age'].isnull()]

X_age_train = df_known_age[features_age]
y_age_train = df_known_age['Age']
X_age_pred = df_unknown_age[features_age]

xgb_reg = XGBRegressor(random_state=42, n_estimators=100)
xgb_reg.fit(X_age_train, y_age_train)
df.loc[df['Age'].isnull(), 'Age'] = xgb_reg.predict(X_age_pred)

In [7]:
# Clasificación
features_classification = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Sex_male', 'Deck', 'FamilySize']
X = df[features_classification]
y = df['Survived']

In [8]:
# Escalar variables numéricas
scaler = StandardScaler()
num_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
X[num_features] = scaler.fit_transform(X[num_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_features] = scaler.fit_transform(X[num_features])


In [9]:
# Hiperparámetros para RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 400),
    'max_depth': randint(3, 15),
    'min_child_weight': uniform(1, 5),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'learning_rate': uniform(0.01, 0.2),
}

xgb_clf = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_distributions,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X, y)

print("Mejores parámetros encontrados:")
print(random_search.best_params_)
print(f"Mejor Accuracy CV: {random_search.best_score_ * 100:.2f}%")

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Mejores parámetros encontrados:
{'colsample_bytree': 0.8191887532992349, 'learning_rate': 0.09469418846162371, 'max_depth': 10, 'min_child_weight': 5.03417369633632, 'n_estimators': 276, 'subsample': 0.892659102119856}
Mejor Accuracy CV: 84.96%


In [10]:
# Guardar modelo, scaler y encoder
joblib.dump(random_search.best_estimator_, 'xgb_titanic_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(sex_encoder, 'sex_encoder.pkl')

['sex_encoder.pkl']