In [67]:
# Versión optimizada basada en tu mejor modelo
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Load the Titanic training data and drop the identifier-like columns that
# are not used as features.
df = pd.read_csv('./titanic_train.csv')
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

# Preprocessing.
# Missing ports of embarkation are filled with the most frequent port.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Keep only the first letter of the cabin code; missing cabins become 'U'
# (the first character of 'Unknown').
df['Cabin'] = df['Cabin'].fillna('Unknown').str[0]
# The encoders return an (n_samples, 1) array, so ravel() is used to make
# each column assignment explicitly one-dimensional.
df['Deck'] = OrdinalEncoder(
    categories=[["A", "B", "C", "D", "E", "F", "G", "T", "U"]]
).fit_transform(df[['Cabin']]).ravel()
# drop='first' removes the first category, leaving a single indicator column.
df['Sex_male'] = OneHotEncoder(
    drop='first', sparse_output=False
).fit_transform(df[['Sex']]).ravel()
df['Embarked'] = OrdinalEncoder(
    categories=[['S', 'C', 'Q']]
).fit_transform(df[['Embarked']]).ravel()

# Age imputation: predict missing ages from the other passenger attributes.
# NOTE(review): the regressor is fit on every row with a known age, before
# the train/test split further down, so held-out rows influence the imputed
# values. Consider fitting the imputer on the training split only.
age_features = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked', 'Deck']
age_missing = df['Age'].isnull()
known_age = df[~age_missing]
unknown_age = df[age_missing]

age_model = GradientBoostingRegressor(random_state=42)
age_model.fit(known_age[age_features], known_age['Age'])
# scikit-learn estimators reject an input with zero rows, so only impute
# when there is at least one passenger with a missing age.
if not unknown_age.empty:
    df.loc[age_missing, 'Age'] = age_model.predict(unknown_age[age_features])

# Feature selection (per the original author: based on feature importance).
features = ['Sex_male', 'Age', 'Fare', 'Pclass', 'Deck', 'SibSp', 'Parch']
X, y = df[features], df['Survived']

# Hold out a stratified 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Model: standardise the features, reduce them with PCA, classify with k-NN.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('knn', KNeighborsClassifier()),
])

# Hyper-parameter grid; the '<step>__<param>' keys address the pipeline steps.
param_grid = dict(
    pca__n_components=[0.85, 0.90, 0.95],
    knn__n_neighbors=range(5, 15),
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],  # 1: manhattan, 2: euclidean
)

# 5-fold cross-validated search over the grid, scored by accuracy, using
# all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Mejores parámetros: {grid_search.best_params_}")
print(f"Accuracy: {accuracy*100:.2f}%")

Mejores parámetros: {'knn__n_neighbors': 10, 'knn__p': 2, 'knn__weights': 'uniform', 'pca__n_components': 0.95}
Accuracy: 78.77%
