In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# =============================================================================
# 1. INSTALLATION & IMPORTS
# =============================================================================
# Install required TDA and ML libraries
!pip install ripser gudhi scikit-learn xgboost plotly -q
print("TDA libraries installed successfully!")

import numpy as np
import pandas as pd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m834.5/834.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.6/48.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for hopcroftkarp (setup.py) ... [?25l[?25hdone
TDA libraries installed successfully!
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# %% [markdown]
# # COMPLETE ENHANCED TDA PIPELINE FOR TITANIC

# %%
import matplotlib.pyplot as plt
from sklearn import preprocessing, model_selection, metrics
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

# TDA libraries
try:
    import ripser
    RIPSER_AVAILABLE = True
except ImportError:
    RIPSER_AVAILABLE = False

# %%
def load_and_preprocess_titanic():
    train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
    test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
    
    def preprocess_data(df):
        df_clean = df.copy()
        df_clean['Age'] = df_clean['Age'].fillna(df_clean['Age'].median())
        df_clean['Fare'] = df_clean['Fare'].fillna(df_clean['Fare'].median())
        df_clean['Embarked'] = df_clean['Embarked'].fillna('S')
        df_clean['FamilySize'] = df_clean['SibSp'] + df_clean['Parch'] + 1
        df_clean['IsAlone'] = (df_clean['FamilySize'] == 1).astype(int)
        
        df_clean['Title'] = df_clean['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
        title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 5, 'Col': 5, 'Major': 5}
        df_clean['Title'] = df_clean['Title'].map(title_mapping).fillna(5)
        
        df_clean['HasCabin'] = (~df_clean['Cabin'].isna()).astype(int)
        df_clean['Sex'] = df_clean['Sex'].map({'male': 0, 'female': 1})
        df_clean['Embarked'] = df_clean['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
        return df_clean
    
    train_clean = preprocess_data(train_df)
    test_clean = preprocess_data(test_df)
    
    FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'Title', 'HasCabin', 'Embarked']
    X_train = train_clean[FEATURE_COLUMNS].values
    X_test = test_clean[FEATURE_COLUMNS].values
    y_train = train_clean['Survived'].values
    
    scaler = preprocessing.StandardScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)
    
    return train_clean, test_clean, X_train, X_test, X_train_normalized, X_test_normalized, y_train

train_clean, test_clean, X_train, X_test, X_train_normalized, X_test_normalized, y_train = load_and_preprocess_titanic()

# %%
class RobustTDAExtractor:
    def __init__(self):
        self.feature_names = []
        
    def extract_tda_features(self, X_normalized):
        if not RIPSER_AVAILABLE:
            return self._create_fallback_features(X_normalized)
        
        tda_features = []
        for i in range(len(X_normalized)):
            try:
                passenger_point = X_normalized[i].reshape(1, -1)
                distances = np.linalg.norm(X_normalized - passenger_point, axis=1)
                n_neighbors = min(20, len(X_normalized))
                neighbor_indices = np.argpartition(distances, n_neighbors)[:n_neighbors]
                neighborhood = X_normalized[neighbor_indices]
                
                diagrams = ripser.ripser(neighborhood, maxdim=1)['dgms']
                h0_diagram = diagrams[0]
                
                if len(h0_diagram) > 0:
                    h0_lifetimes = np.clip(h0_diagram[:, 1] - h0_diagram[:, 0], 0, 5.0)
                    h0_persistence = np.max(h0_lifetimes) if len(h0_lifetimes) > 0 else 0.0
                    h0_components = len(h0_diagram)
                    h0_std = np.std(h0_lifetimes) if len(h0_lifetimes) > 0 else 0.0
                else:
                    h0_persistence, h0_components, h0_std = 0.0, 0, 0.0
                
                h1_diagram = diagrams[1] if len(diagrams) > 1 else np.array([])
                h1_loops = len(h1_diagram) if len(h1_diagram) > 0 else 0
                
                if len(distances) > 1:
                    avg_distance = np.mean(distances[neighbor_indices[1:]])
                    local_density = 1.0 / (avg_distance + 0.1)
                    local_density = min(local_density, 10.0)
                else:
                    local_density = 0.0
                
                tda_features.append([h0_persistence, h0_components, h0_std, h1_loops, local_density])
            except Exception:
                tda_features.append([0.0, 0, 0.0, 0, 0.0])
        
        self.feature_names = ['tda_h0_persistence', 'tda_h0_components', 'tda_h0_std', 'tda_h1_loops', 'tda_local_density']
        return np.nan_to_num(np.array(tda_features), nan=0.0, posinf=10.0, neginf=0.0)
    
    def _create_fallback_features(self, X_normalized):
        statistical_features = []
        for i in range(len(X_normalized)):
            passenger_point = X_normalized[i]
            distances = np.linalg.norm(X_normalized - passenger_point, axis=1)
            n_neighbors = min(15, len(X_normalized))
            neighbor_distances = np.partition(distances, n_neighbors)[:n_neighbors]
            stats = [
                np.mean(neighbor_distances), len(neighbor_distances), np.std(neighbor_distances),
                len(neighbor_distances[neighbor_distances < 1.0]),
                1.0 / (np.mean(neighbor_distances[1:]) + 0.1) if len(neighbor_distances) > 1 else 0.0
            ]
            statistical_features.append(stats)
        self.feature_names = ['stat_mean_dist', 'stat_neighbor_count', 'stat_std_dist', 'stat_dense_count', 'stat_inv_density']
        return np.array(statistical_features)

# %%
class ConsistentFeatureEngineer:
    def engineer_features(self, train_df, test_df):
        combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
        combined_advanced = self._engineer_advanced_features(combined_df)
        train_advanced = combined_advanced.iloc[:len(train_df)].copy()
        test_advanced = combined_advanced.iloc[len(train_df):].copy()
        
        X_train = self._prepare_features(train_advanced)
        X_test = self._prepare_features(test_advanced)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
        
        return X_train.values, X_test.values, list(X_train.columns)
    
    def _engineer_advanced_features(self, df):
        df_advanced = df.copy()
        df_advanced['FamilySize'] = df_advanced['SibSp'] + df_advanced['Parch'] + 1
        df_advanced['IsAlone'] = (df_advanced['FamilySize'] == 1).astype(int)
        df_advanced['Title'] = df_advanced['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
        title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4}
        df_advanced['Title'] = df_advanced['Title'].map(title_mapping).fillna(5)
        df_advanced['AgeGroup'] = pd.cut(df_advanced['Age'], bins=[0, 12, 18, 35, 50, 100], labels=[1, 2, 3, 4, 5]).fillna(3).astype(int)
        df_advanced['FarePerPerson'] = df_advanced['Fare'] / df_advanced['FamilySize']
        df_advanced['FarePerPerson'] = df_advanced['FarePerPerson'].replace([np.inf, -np.inf], 0)
        df_advanced['HasCabin'] = (~df_advanced['Cabin'].isna()).astype(int)
        df_advanced['TicketLength'] = df_advanced['Ticket'].apply(len)
        return df_advanced
    
    def _prepare_features(self, df):
        feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'IsAlone', 'Title', 'HasCabin', 'AgeGroup', 'FarePerPerson', 'TicketLength']
        available_features = [f for f in feature_columns if f in df.columns]
        X = df[available_features].copy().fillna(0)
        return X

# %%
class RobustEnsemble:
    def __init__(self):
        self.scaler = preprocessing.StandardScaler()
        self.ensemble = None
        self.feature_importance = None
        
    def train_ensemble(self, X_train, y_train):
        X_clean = np.nan_to_num(X_train, nan=0.0, posinf=10.0, neginf=0.0)
        X_scaled = self.scaler.fit_transform(X_clean)
        
        models = {
            'xgb': XGBClassifier(n_estimators=150, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='logloss'),
            'rf': RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42),
            'lgb': LGBMClassifier(n_estimators=150, max_depth=6, learning_rate=0.1, random_state=42, verbose=-1)
        }
        
        best_models = []
        for name, model in models.items():
            cv_scores = cross_val_score(model, X_scaled, y_train, cv=5, scoring='accuracy')
            mean_score = np.mean(cv_scores)
            best_models.append((name, model))
        
        self.ensemble = VotingClassifier(estimators=best_models, voting='soft')
        self.ensemble.fit(X_scaled, y_train)
        
        for name, model in best_models:
            if hasattr(model, 'feature_importances_'):
                self.feature_importance = model.feature_importances_
                break
        
        final_scores = cross_val_score(self.ensemble, X_scaled, y_train, cv=5, scoring='accuracy')
        return np.mean(final_scores)
    
    def predict(self, X):
        X_clean = np.nan_to_num(X, nan=0.0, posinf=10.0, neginf=0.0)
        X_scaled = self.scaler.transform(X_clean)
        predictions = self.ensemble.predict(X_scaled)
        probabilities = self.ensemble.predict_proba(X_scaled)[:, 1]
        return predictions, probabilities

# %%
print("EXECUTING COMPLETE ENHANCED TDA PIPELINE")

feature_engineer = ConsistentFeatureEngineer()
X_train_advanced, X_test_advanced, feature_names = feature_engineer.engineer_features(train_clean, test_clean)

tda_extractor = RobustTDAExtractor()
X_train_tda = tda_extractor.extract_tda_features(X_train_normalized)
X_test_tda = tda_extractor.extract_tda_features(X_test_normalized)

X_train_combined = np.hstack([X_train_advanced, X_train_tda])
X_test_combined = np.hstack([X_test_advanced, X_test_tda])
all_feature_names = feature_names + tda_extractor.feature_names

ensemble_model = RobustEnsemble()
cv_score = ensemble_model.train_ensemble(X_train_combined, y_train)

final_predictions, final_probabilities = ensemble_model.predict(X_test_combined)

final_submission = pd.DataFrame({'PassengerId': test_clean['PassengerId'], 'Survived': final_predictions})
final_submission.to_csv('enhanced_tda_titanic_submission.csv', index=False)

# %%
print("FINAL RESULTS SUMMARY")
print(f"Cross-Validation Score: {cv_score:.4f}")
print(f"Test Predictions - Survived: {final_predictions.sum()}/{len(final_predictions)} ({final_predictions.mean():.3f})")
print(f"Training Survival Rate: {y_train.mean():.3f}")
print(f"Predicted Survival Rate: {final_predictions.mean():.3f}")

high_confidence = np.sum((final_probabilities > 0.7) | (final_probabilities < 0.3))
print(f"High-confidence predictions: {high_confidence}/{len(final_predictions)} ({high_confidence/len(final_predictions):.1%})")

EXECUTING COMPLETE ENHANCED TDA PIPELINE
FINAL RESULTS SUMMARY
Cross-Validation Score: 0.8260
Test Predictions - Survived: 161/418 (0.385)
Training Survival Rate: 0.384
Predicted Survival Rate: 0.385
High-confidence predictions: 345/418 (82.5%)
