In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("="*60)
print("TITANIC SURVIVAL PREDICTION - ADVANCED ML MODEL")
print("="*60)

TITANIC SURVIVAL PREDICTION - ADVANCED ML MODEL


In [2]:
# 1. LOAD DATA
print("\n1. Loading data...")
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Survival rate: {train_df['Survived'].mean():.2%}")


1. Loading data...
Training set shape: (891, 12)
Test set shape: (418, 11)
Survival rate: 38.38%


In [3]:
print("\n2. Performing feature engineering...")

def feature_engineering(df):
    """Apply comprehensive feature engineering"""
    df = df.copy()
    
    # Extract Title from Name
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    # Family Size Features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['SmallFamily'] = ((df['FamilySize'] >= 2) & (df['FamilySize'] <= 4)).astype(int)
    df['LargeFamily'] = (df['FamilySize'] >= 5).astype(int)
    
    # Age Features - Fill missing ages
    for title in df['Title'].unique():
        for pclass in df['Pclass'].unique():
            mask = (df['Title'] == title) & (df['Pclass'] == pclass) & (df['Age'].isnull())
            median_age = df[(df['Title'] == title) & (df['Pclass'] == pclass)]['Age'].median()
            if pd.notna(median_age):
                df.loc[mask, 'Age'] = median_age
    
    df['Age'].fillna(df['Age'].median(), inplace=True)
    
    # Age groups
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100], labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])
    df['IsChild'] = (df['Age'] < 16).astype(int)
    df['IsElderly'] = (df['Age'] >= 60).astype(int)
    
    # Fare Features
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    df['FareBin'] = pd.qcut(df['Fare'], 5, labels=['Very_Low', 'Low', 'Med', 'High', 'Very_High'], duplicates='drop')
    
    # Cabin Features
    df['HasCabin'] = df['Cabin'].notna().astype(int)
    df['CabinDeck'] = df['Cabin'].str[0]
    df['CabinDeck'].fillna('Unknown', inplace=True)
    
    # Embarked
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    
    # Ticket Features
    df['TicketPrefix'] = df['Ticket'].str.split().str[0]
    df['TicketPrefix'] = df['TicketPrefix'].apply(lambda x: 'Numeric' if x.isdigit() else x)
    df['TicketLength'] = df['Ticket'].apply(len)
    
    # Interaction Features
    df['Sex_Pclass'] = df['Sex'] + '_' + df['Pclass'].astype(str)
    df['Age_Pclass'] = df['Age'] * df['Pclass']
    df['Fare_Age'] = df['Fare'] / (df['Age'] + 1)
    
    return df

train_processed = feature_engineering(train_df)
test_processed = feature_engineering(test_df)

print(f"Feature engineering completed! New shape: {train_processed.shape}")



2. Performing feature engineering...
Feature engineering completed! New shape: (891, 29)


In [4]:
# 3. PREPARE DATA FOR MODELING
print("\n3. Preparing data for modeling...")

categorical_features = ['Sex', 'Embarked', 'Title', 'AgeGroup', 'FareBin', 'CabinDeck', 'Sex_Pclass']
numerical_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 
                     'SmallFamily', 'LargeFamily', 'IsChild', 'IsElderly', 'HasCabin',
                     'FarePerPerson', 'TicketLength', 'Age_Pclass', 'Fare_Age']

# One-hot encode
train_encoded = pd.get_dummies(train_processed[categorical_features + numerical_features], 
                               columns=categorical_features, drop_first=True)
test_encoded = pd.get_dummies(test_processed[categorical_features + numerical_features], 
                              columns=categorical_features, drop_first=True)

# Align columns
train_encoded, test_encoded = train_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

X = train_encoded
y = train_processed['Survived']
X_test = test_encoded

print(f"Training features shape: {X.shape}")
print(f"Test features shape: {X_test.shape}")


3. Preparing data for modeling...
Training features shape: (891, 44)
Test features shape: (418, 44)


In [5]:
# 4. MODEL TRAINING WITH CROSS-VALIDATION
print("\n4. Training models with cross-validation...")

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, 
                                           min_samples_leaf=2, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, 
                                                    max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, 
                            random_state=42, eval_metric='logloss', use_label_encoder=False)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}

print("\nCross-Validation Results:")
print("="*50)

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    results[name] = scores
    print(f"{name:20s}: {scores.mean():.4f} (+/- {scores.std():.4f})")


4. Training models with cross-validation...

Cross-Validation Results:
Logistic Regression : 0.8384 (+/- 0.0098)
Random Forest       : 0.8361 (+/- 0.0069)
Gradient Boosting   : 0.8193 (+/- 0.0178)
XGBoost             : 0.8316 (+/- 0.0149)


In [6]:
# 5. ENSEMBLE MODEL
print("\n5. Creating ensemble model...")

ensemble_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, 
                                     min_samples_leaf=2, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, 
                                         max_depth=5, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, 
                             random_state=42, eval_metric='logloss', use_label_encoder=False)),
        ('lr', LogisticRegression(max_iter=1000, random_state=42))
    ],
    voting='soft'
)

ensemble_scores = cross_val_score(ensemble_model, X, y, cv=cv, scoring='accuracy')
print(f"\nEnsemble Model CV Score: {ensemble_scores.mean():.4f} (+/- {ensemble_scores.std():.4f})")

# Train final model
print("\nTraining final ensemble model...")
ensemble_model.fit(X, y)
print("Training completed!")


5. Creating ensemble model...

Ensemble Model CV Score: 0.8350 (+/- 0.0164)

Training final ensemble model...
Training completed!


In [7]:
# 6. FEATURE IMPORTANCE
print("\n6. Analyzing feature importance...")

rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X, y)

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))


6. Analyzing feature importance...

Top 10 Most Important Features:
          feature  importance
         Title_Mr    0.108617
         Sex_male    0.098332
       Age_Pclass    0.083254
         Fare_Age    0.080096
    FarePerPerson    0.068088
             Fare    0.065698
              Age    0.057195
Sex_Pclass_male_3    0.039847
     TicketLength    0.031675
        Title_Mrs    0.030020


In [8]:
# 7. MAKE PREDICTIONS
print("\n7. Making predictions...")

predictions = ensemble_model.predict(X_test)

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': predictions
})

submission.to_csv('submission.csv', index=False)

print("Predictions completed!")
print(f"Predicted survival rate: {predictions.mean():.2%}")


7. Making predictions...
Predictions completed!
Predicted survival rate: 36.36%


In [9]:
# 8. SUMMARY
print("\n" + "="*60)
print("MODEL PERFORMANCE SUMMARY")
print("="*60)
print(f"\nTraining Set Size: {len(train_df)}")
print(f"Test Set Size: {len(test_df)}")
print(f"Number of Features: {X.shape[1]}")
print(f"\nCross-Validation Accuracy: {ensemble_scores.mean():.4f} (+/- {ensemble_scores.std():.4f})")
print(f"\nPredicted Survivors: {predictions.sum()} ({predictions.mean():.2%})")
print(f"Predicted Non-Survivors: {len(predictions) - predictions.sum()} ({1-predictions.mean():.2%})")
print("\n" + "="*60)
print("Submission file 'submission.csv' created successfully!")
print("="*60)

print("\nFirst 10 predictions:")
print(submission.head(10))


MODEL PERFORMANCE SUMMARY

Training Set Size: 891
Test Set Size: 418
Number of Features: 44

Cross-Validation Accuracy: 0.8350 (+/- 0.0164)

Predicted Survivors: 152 (36.36%)
Predicted Non-Survivors: 266 (63.64%)

Submission file 'submission.csv' created successfully!

First 10 predictions:
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
5          897         0
6          898         0
7          899         0
8          900         1
9          901         0
