# Ensemble Learning with Bagging and Random Forests
**Dataset:** Titanic (Survival Prediction)

## Step 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Apply seaborn's white-grid theme globally so every figure below shares it.
sns.set_theme(style='whitegrid')

## Step 2: Load the Titanic Dataset

In [None]:
# Load seaborn's bundled "titanic" example dataset as a DataFrame.
# NOTE(review): load_dataset may need network access on first use — confirm
# the runtime environment allows it or that the data is already cached.
titanic = sns.load_dataset(name='titanic')

# Preview the first five rows (rendered as the cell's output).
titanic.head(5)

## Step 3: Exploratory Data Analysis (EDA)

### 3.1 Dataset Overview

In [None]:
# Print column names, dtypes and non-null counts; columns whose non-null
# count is below the row count contain missing values.
titanic.info()

### 3.2 Missing Values Heatmap

In [None]:
# One heatmap cell per (row, column) of the dataset; cells where the value
# is missing get a different colour from cells where it is present.
fig, ax = plt.subplots(figsize=(10, 5))
missing_mask = titanic.isna()
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Values in Titanic Dataset")
plt.show()

### 3.3 Class Balance - Survival

In [None]:
# Bar per target class, to see how balanced survived / not-survived are.
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue` — confirm against the installed seaborn version before changing.
ax = sns.countplot(data=titanic, x='survived', palette='pastel')
ax.set_title("Survival Distribution")
ax.set_xlabel("Survived (0 = No, 1 = Yes)")
ax.set_ylabel("Count")
plt.show()

### 3.4 Survival Rate by Sex

In [None]:
# barplot aggregates `survived` per sex with its default estimator (the
# mean), so each bar height is that group's survival rate.
ax = sns.barplot(data=titanic, x='sex', y='survived', palette='Set2')
ax.set_title("Survival Rate by Sex")
ax.set_ylabel("Survival Probability")
plt.show()

### 3.5 Survival Rate by Passenger Class

In [None]:
# Same aggregation as the by-sex plot, grouped by ticket class instead:
# each bar is the mean of `survived` within one `pclass` value.
ax = sns.barplot(data=titanic, x='pclass', y='survived', palette='Blues_d')
ax.set_title("Survival Rate by Passenger Class")
ax.set_ylabel("Survival Probability")
plt.show()

### 3.6 Age Distribution by Survival

In [None]:
# Overlaid step histograms of age, one per survival outcome, each with a
# KDE curve on top.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=titanic,
    x='age',
    hue='survived',
    kde=True,
    element='step',
    palette='muted',
    ax=ax,
)
ax.set_title("Age Distribution by Survival")
plt.show()

### 3.7 Boxplot of Fare by Class and Survival

In [None]:
# Fare spread within each passenger class, split side by side by survival
# outcome.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=titanic,
    x='pclass',
    y='fare',
    hue='survived',
    palette='Set3',
    ax=ax,
)
ax.set_title("Fare by Passenger Class and Survival")
plt.show()

## Step 4: Data Preprocessing

In [None]:
# Remove columns that are not used as model inputs.
# NOTE(review): presumably 'alive' restates the target, 'class' and
# 'embark_town' restate 'pclass' and 'embarked', and 'deck' is mostly
# missing — confirm against the titanic.info() output.
titanic.drop(['deck', 'embark_town', 'alive', 'who', 'class'], axis=1, inplace=True)

# Fill missing values BEFORE encoding. Encoding first would turn a missing
# 'embarked' into the literal string 'nan' (via astype(str)), which the
# LabelEncoder would then treat as a real fourth port and which no imputer
# would ever see as missing.
# NOTE(review): these fill values are computed on the full dataset, i.e.
# before the train/test split in Step 5, so a small amount of test-set
# information leaks into training. Fit them on the training split only if a
# strict evaluation is required.
imputer = SimpleImputer(strategy='median')
# fit_transform returns a 2-D (n_rows, 1) result; flatten it so a plain 1-D
# column is assigned.
titanic['age'] = np.asarray(imputer.fit_transform(titanic[['age']])).ravel()

# 'embarked' is categorical, so fill with the most frequent port rather than
# a median of arbitrary label codes.
embarked_mode = titanic['embarked'].mode(dropna=True).iloc[0]
titanic['embarked'] = titanic['embarked'].fillna(embarked_mode)

# Integer-encode the categorical columns; the fitted encoders are kept so
# the codes can be mapped back to the original labels later.
label_encoders = {}
categorical_cols = ['sex', 'embarked', 'alone']
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) gives the encoder a uniform string input for every column.
    titanic[col] = le.fit_transform(titanic[col].astype(str))
    label_encoders[col] = le

# Safety net: discard any row that still has a missing value in any column.
titanic.dropna(inplace=True)

titanic.head()

## Step 5: Train-Test Split

In [None]:
# Target vector and feature matrix (every remaining column except the target).
y = titanic['survived']
X = titanic.drop(columns='survived')

# 80/20 hold-out split. Stratifying on y keeps the survival ratio the same
# in both partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Step 6: Decision Tree Classifier (Baseline)

In [None]:
# Baseline: a single decision tree with default hyperparameters, fitted on
# the training split and scored on the held-out test split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)
# Confusion matrix first, then per-class precision / recall / F1.
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

## Step 7: Bagging Classifier

In [None]:
# Bagging ensemble of 100 decision trees.
# The base estimator is passed positionally on purpose: the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old name was removed in 1.4, so either keyword fails on some releases. The
# first positional parameter is the base estimator under both names.
bag_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    random_state=42,
)
bag_model.fit(X_train, y_train)
y_pred_bag = bag_model.predict(X_test)

# Same three reports as for the baseline tree, so the results line up.
print("Bagging Classifier Accuracy:", accuracy_score(y_test, y_pred_bag))
print(confusion_matrix(y_test, y_pred_bag))
print(classification_report(y_test, y_pred_bag))

## Step 8: Random Forest Classifier

In [None]:
# Random forest with 100 trees, fitted on the training split and scored on
# the held-out test split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
# Confusion matrix first, then per-class precision / recall / F1.
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

## Step 9: Feature Importance Visualization

In [None]:
# Importance score per feature from the fitted forest, in the column order
# of X.
importances = rf_model.feature_importances_
features = X.columns
# Ascending sort: barh draws position 0 at the bottom, so the most
# important feature ends up as the top bar.
indices = np.argsort(importances)

fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = np.arange(len(indices))
ax.set_title("Feature Importances - Random Forest")
ax.barh(bar_positions, importances[indices], align="center")
ax.set_yticks(bar_positions)
ax.set_yticklabels(list(features[indices]))
ax.set_xlabel("Importance Score")
fig.tight_layout()
plt.show()