# 🚢 Titanic Survival Predictor

**Project**: Binary Classification - Predicting Titanic Passenger Survival  
**Level**: Beginner  
**Dataset**: Titanic Dataset (Seaborn built-in)  

## 📋 Project Overview

In this project, we'll predict whether passengers survived the Titanic disaster using machine learning. This classic dataset is perfect for learning:

- Advanced data preprocessing and cleaning
- Feature engineering and creation
- Handling missing values
- Working with categorical variables
- Binary classification techniques

The RMS Titanic sank on April 15, 1912, and this dataset contains information about passengers and their survival outcomes.

Let's dive in! ⚓

## 1. Import Libraries

Let's import all necessary libraries for our comprehensive analysis.

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Advanced ML
import xgboost as xgb

# Utilities
import warnings
import re

# Silence library warnings so the notebook output stays readable.
# NOTE: this also hides deprecation warnings raised by pandas / scikit-learn.
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# The `from sklearn.<module> import ...` statements above never bind the bare
# name `sklearn`, so the package itself must be imported before its version
# can be read (otherwise the last print raises NameError).
import sklearn

print("✅ All libraries imported successfully!")
print(f"📊 Pandas version: {pd.__version__}")
print(f"🤖 Scikit-learn version: {sklearn.__version__}")

## 2. Data Loading and Initial Exploration

Let's load the Titanic dataset and explore its structure.

In [None]:
# Fetch the Titanic passenger table through seaborn's dataset loader
df = sns.load_dataset('titanic')

# Report the table's dimensions and column names
dataset_shape = df.shape
column_names = list(df.columns)
print("🚢 Titanic dataset loaded successfully!")
print(f"Dataset shape: {dataset_shape}")
print(f"Columns: {column_names}")

In [None]:
# Preview the top of the table; as the cell's last expression the frame is
# rendered by the notebook
print("🔍 First 5 rows of the dataset:")
df.head(5)

In [None]:
# Headline numbers: row count, column count and share of survivors
n_passengers, n_features = df.shape
survival_rate = df['survived'].mean()
print("📊 Dataset Information:")
print(f"Total passengers: {n_passengers}")
print(f"Features: {n_features}")
print(f"Survival rate: {survival_rate:.1%}")

# Column dtypes as pandas reports them
print("\n🔍 Data types:")
print(df.dtypes)

# Per-column count and percentage of nulls, showing only the columns that
# actually have gaps, largest gap first
print("\n❌ Missing values:")
missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Percentage': missing_percent,
})
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing].sort_values(by='Missing Count', ascending=False))

In [None]:
# Descriptive statistics from DataFrame.describe(); the frame is the cell's
# last expression so the notebook renders it
print("📈 Statistical Summary:")
summary_stats = df.describe()
summary_stats

## 3. Exploratory Data Analysis (EDA)

Let's explore the relationships between different features and survival rates.

In [None]:
# Overall survival distribution (left) and survival rate by gender (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Survival count.
# value_counts() orders its rows by frequency, not by label, so each slice's
# text and colour are looked up from the outcome code itself rather than
# assumed from position (which is only right while 0 is the majority class).
outcome_labels = {0: 'Did not survive', 1: 'Survived'}
outcome_colors = {0: '#FF6B6B', 1: '#4ECDC4'}
survival_counts = df['survived'].value_counts()
axes[0].pie(survival_counts.values,
            labels=[outcome_labels[code] for code in survival_counts.index],
            autopct='%1.1f%%', startangle=90,
            colors=[outcome_colors[code] for code in survival_counts.index])
axes[0].set_title('🚢 Overall Survival Distribution', fontweight='bold', fontsize=14)

# Survival by gender: the mean of the 0/1 'survived' column is the survival rate
survival_by_sex = df.groupby('sex')['survived'].mean()
bars = axes[1].bar(survival_by_sex.index, survival_by_sex.values,
                   color=['#FF6B6B', '#4ECDC4'])
axes[1].set_title('👫 Survival Rate by Gender', fontweight='bold', fontsize=14)
axes[1].set_ylabel('Survival Rate')
axes[1].set_ylim(0, 1)

# Add value labels on bars, centred horizontally and slightly above the top
for bar in bars:
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width()/2., height + 0.02,
                f'{height:.1%}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"📊 Key Insight: Women had a {survival_by_sex['female']:.1%} survival rate vs {survival_by_sex['male']:.1%} for men")

In [None]:
# Four-panel figure: survival against class, age, embarkation port and fare
bar_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1']
lost_color, saved_color = bar_palette[0], bar_palette[1]


def _annotate_rates(ax, rects):
    """Write each bar's height, formatted as a percentage, just above it."""
    for rect in rects:
        top = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., top + 0.02,
                f'{top:.1%}', ha='center', va='bottom', fontweight='bold')


fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('🔍 Survival Analysis by Different Factors', fontsize=16, fontweight='bold')
# Row-major order: [0,0], [0,1], [1,0], [1,1]
class_ax, age_ax, port_ax, fare_ax = axes.ravel()

# Row subsets reused by the two histogram panels
survivors = df[df['survived'] == 1]
casualties = df[df['survived'] == 0]

# Top-left: mean of the 0/1 'survived' column per passenger class
survival_by_class = df.groupby('pclass')['survived'].mean()
bars1 = class_ax.bar(survival_by_class.index, survival_by_class.values,
                     color=bar_palette)
class_ax.set_title('🎫 Survival Rate by Passenger Class')
class_ax.set_xlabel('Passenger Class')
class_ax.set_ylabel('Survival Rate')
class_ax.set_ylim(0, 1)
_annotate_rates(class_ax, bars1)

# Top-right: overlaid age histograms (rows with missing age are dropped)
survived_ages = survivors['age'].dropna()
not_survived_ages = casualties['age'].dropna()
age_ax.hist(not_survived_ages, bins=30, alpha=0.7, label='Did not survive', color=lost_color)
age_ax.hist(survived_ages, bins=30, alpha=0.7, label='Survived', color=saved_color)
age_ax.set_title('👶 Age Distribution by Survival')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Count')
age_ax.legend()

# Bottom-left: mean of 'survived' per embarkation port
survival_by_embarked = df.groupby('embarked')['survived'].mean()
bars2 = port_ax.bar(survival_by_embarked.index, survival_by_embarked.values,
                    color=bar_palette)
port_ax.set_title('🚢 Survival Rate by Embarkation Port')
port_ax.set_xlabel('Embarkation Port')
port_ax.set_ylabel('Survival Rate')
port_ax.set_ylim(0, 1)
_annotate_rates(port_ax, bars2)

# Bottom-right: overlaid fare histograms
survived_fare = survivors['fare'].dropna()
not_survived_fare = casualties['fare'].dropna()
fare_ax.hist(not_survived_fare, bins=50, alpha=0.7, label='Did not survive', color=lost_color)
fare_ax.hist(survived_fare, bins=50, alpha=0.7, label='Survived', color=saved_color)
fare_ax.set_title('💰 Fare Distribution by Survival')
fare_ax.set_xlabel('Fare')
fare_ax.set_ylabel('Count')
fare_ax.set_xlim(0, 200)  # Limit x-axis for better visualization
fare_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations (DataFrame.corr) between the selected numeric columns
numerical_features = ['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']
correlation_matrix = df[numerical_features].corr()

# Annotated heatmap with the colour scale centred on zero
plt.figure(figsize=(10, 8))
heatmap_options = dict(annot=True, cmap='RdYlBu_r', center=0,
                       square=True, linewidths=0.5, fmt='.2f')
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('🔥 Correlation Matrix - Numerical Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Rank the 'survived' column by absolute correlation; the first entry is
# dropped (survived's correlation with itself)
print("🔍 Key Correlations with Survival:")
ranked_by_strength = correlation_matrix['survived'].sort_values(key=abs, ascending=False)
survival_corr = ranked_by_strength.iloc[1:]
for column_name, coefficient in survival_corr.items():
    print(f"• {column_name}: {coefficient:.3f}")
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Data Preprocessing and Feature Engineering\n",
    "\n",
    "Now let's clean the data and create new features to improve our model's performance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a copy of the dataset for preprocessing so `df` stays untouched\n",
    "data = df.copy()\n",
    "\n",
    "print(\"🔧 Starting data preprocessing and feature engineering...\")\n",
    "print(f\"Original dataset shape: {data.shape}\")\n",
    "\n",
    "# 1. Handle missing values in 'age'\n",
    "# Fill missing ages with median age by passenger class and gender\n",
    "age_median = data.groupby(['pclass', 'sex'])['age'].median()\n",
    "for pclass in data['pclass'].unique():\n",
    "    for sex in data['sex'].unique():\n",
    "        # Rows of this (class, gender) group whose age is still missing\n",
    "        mask = (data['pclass'] == pclass) & (data['sex'] == sex) & (data['age'].isnull())\n",
    "        data.loc[mask, 'age'] = age_median[pclass, sex]\n",
    "\n",
    "print(f\"✅ Age missing values filled: {data['age'].isnull().sum()} remaining\")\n",
    "\n",
    "# 2. Handle missing values in 'embarked'\n",
    "# Fill with the most common port\n",
    "most_common_port = data['embarked'].mode()[0]\n",
    "# Assign the result back to the column: data['embarked'].fillna(..., inplace=True)\n",
    "# is chained assignment and does not update `data` under pandas Copy-on-Write\n",
    "data['embarked'] = data['embarked'].fillna(most_common_port)\n",
    "print(f\"✅ Embarked missing values filled with '{most_common_port}': {data['embarked'].isnull().sum()} remaining\")\n",
    "\n",
    "# 3. Create new features\n",
    "print(\"\\n🛠️ Creating new features...\")\n",
    "\n",
    "# Family size: siblings/spouses + parents/children + the passenger\n",
    "data['family_size'] = data['sibsp'] + data['parch'] + 1\n",
    "print(\"✅ Created 'family_size' feature\")\n",
    "\n",
    "# Is alone: 1 when the passenger is the only member of the family group\n",
    "data['is_alone'] = (data['family_size'] == 1).astype(int)\n",
    "print(\"✅ Created 'is_alone' feature\")\n",
    "\n",
    "# Age groups: right-closed intervals (0, 12], (12, 18], (18, 35], (35, 60], (60, 100]\n",
    "data['age_group'] = pd.cut(data['age'], bins=[0, 12, 18, 35, 60, 100], \n",
    "                          labels=['Child', 'Teen', 'Adult', 'Middle_age', 'Senior'])\n",
    "print(\"✅ Created 'age_group' feature\")\n",
    "\n",
    "# Fare bins: quartiles of the fare distribution\n",
    "data['fare_bin'] = pd.qcut(data['fare'], q=4, labels=['Low', 'Medium', 'High', 'Very_High'])\n",
    "print(\"✅ Created 'fare_bin' feature\")\n",
    "\n",
    "print(f\"\\n📊 Final dataset shape: {data.shape}\")\n",
    "print(f\"📊 New features added: {data.shape[1] - df.shape[1]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the model inputs (X) and the target (y)\n",
    "print(\"🔧 Preparing features for machine learning...\")\n",
    "\n",
    "# Columns the models will see, and the subset that holds text categories\n",
    "feature_columns = [\n",
    "    'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',\n",
    "    'family_size', 'is_alone',\n",
    "]\n",
    "categorical_features = ['sex', 'embarked']\n",
    "\n",
    "# Work on a copy so the encoding below leaves `data` untouched\n",
    "X = data.loc[:, feature_columns].copy()\n",
    "y = data['survived']\n",
    "\n",
    "# Integer-encode each categorical column and keep its fitted encoder by name\n",
    "label_encoders = {}\n",
    "for feature in categorical_features:\n",
    "    le = LabelEncoder().fit(X[feature])\n",
    "    X[feature] = le.transform(X[feature])\n",
    "    label_encoders[feature] = le\n",
    "    code_by_class = dict(zip(le.classes_, le.transform(le.classes_)))\n",
    "    print(f\"✅ Encoded '{feature}': {code_by_class}\")\n",
    "\n",
    "print(f\"\\n📊 Final feature matrix shape: {X.shape}\")\n",
    "print(f\"📊 Target variable shape: {y.shape}\")\n",
    "print(f\"📊 Features used: {list(X.columns)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Model Building and Training\n",
    "\n",
    "Let's train multiple classification algorithms and compare their performance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the rows for testing; the split is stratified on y and\n",
    "# seeded so it is reproducible\n",
    "split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)\n",
    "\n",
    "print(\"✂️ Data split completed:\")\n",
    "print(f\"Training set: {len(X_train)} samples\")\n",
    "print(f\"Testing set: {len(X_test)} samples\")\n",
    "print(f\"Training survival rate: {y_train.mean():.1%}\")\n",
    "print(f\"Testing survival rate: {y_test.mean():.1%}\")\n",
    "\n",
    "# Fit the scaler on the training rows only, then apply it to both parts\n",
    "scaler = StandardScaler().fit(X_train)\n",
    "X_train_scaled = scaler.transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "print(\"\\n⚖️ Feature scaling completed!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate classifiers keyed by display name; all share one random seed\n",
    "seed = 42\n",
    "models = {\n",
    "    'Logistic Regression': LogisticRegression(random_state=seed),\n",
    "    'Decision Tree': DecisionTreeClassifier(random_state=seed),\n",
    "    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=seed),\n",
    "    'Gradient Boosting': GradientBoostingClassifier(random_state=seed),\n",
    "    'SVM': SVC(probability=True, random_state=seed),\n",
    "    'XGBoost': xgb.XGBClassifier(random_state=seed, eval_metric='logloss'),\n",
    "}\n",
    "\n",
    "# List the model names in insertion order\n",
    "print(\"🤖 Models initialized:\")\n",
    "for model_name in models:\n",
    "    print(f\"• {model_name}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}