# Decision tree vs Bagging vs Boosting on Titanic dataset

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import kagglehub
from kagglehub import KaggleDatasetAdapter
import time
import joblib
import os
from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm


## Load dataset

In [5]:
# Set the path to the file you'd like to load
file_path = "Titanic-Dataset.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "yasserh/titanic-dataset",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


First 5 records:    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   

In [6]:
# Laod the dataset offline
#df = pd.read_csv('datasets/Titanic-Dataset.xls')
#df.info()

## Preprocessing

In [7]:


# Create models subfolder
os.makedirs('models', exist_ok=True)

# Select relevant features
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[features]
y = df['Survived']

# Handle missing values
age_imputer = SimpleImputer(strategy='median')
X['Age'] = age_imputer.fit_transform(X[['Age']])

embarked_imputer = SimpleImputer(strategy='most_frequent')
X['Embarked'] = embarked_imputer.fit_transform(X[['Embarked']]).ravel()

# Encode categorical variables
le_sex = LabelEncoder()
X['Sex'] = le_sex.fit_transform(X['Sex'])

le_embarked = LabelEncoder()
X['Embarked'] = le_embarked.fit_transform(X['Embarked'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Age'] = age_imputer.fit_transform(X[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Embarked'] = embarked_imputer.fit_transform(X[['Embarked']]).ravel()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex'] = le_sex.fit_transform(X['Sex'])
A value is trying to be set on a copy of a

## Training and Inference

In [8]:

# Define models with titanic_ prefix
models = {
    'titanic_Decision Tree': DecisionTreeClassifier(random_state=42),
    'titanic_Bagging Tree': BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42),
    'titanic_Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'titanic_AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'titanic_GradientBoost': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Initialize results storage
results = []
classification_reports = {}

# Train and evaluate each model
for model_name, model in models.items():
    # Measure training time
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # Calculate accuracies
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    
    # Store results
    results.append({
        'Model': model_name,
        'Training Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Training Time (s)': training_time
    })
    
    # Generate classification report
    y_pred = model.predict(X_test)
    classification_reports[model_name] = classification_report(y_test, y_pred, output_dict=True)
    
    # Save the model with titanic_ prefix
    joblib.dump(model, f'models/{model_name.replace(" ", "_").lower()}.pkl')

# Convert results to DataFrame and format
results_df = pd.DataFrame(results)
results_df['Training Accuracy'] = results_df['Training Accuracy'].round(4)
results_df['Test Accuracy'] = results_df['Test Accuracy'].round(4)
results_df['Training Time (s)'] = results_df['Training Time (s)'].round(4)

# Print results table
print("\nModel Performance Comparison:")
print(tabulate(results_df, headers='keys', tablefmt='psql', showindex=False))

# Print classification reports
print("\nClassification Reports (Test Set):")
for model_name in models.keys():
    print(f"\n{model_name}:")
    report = classification_reports[model_name]
    report_df = pd.DataFrame({
        'Class': ['Not Survived (0)', 'Survived (1)', 'Weighted Avg'],
        'Precision': [
            report['0']['precision'],
            report['1']['precision'],
            report['weighted avg']['precision']
        ],
        'Recall': [
            report['0']['recall'],
            report['1']['recall'],
            report['weighted avg']['recall']
        ],
        'F1-Score': [
            report['0']['f1-score'],
            report['1']['f1-score'],
            report['weighted avg']['f1-score']
        ],
        'Support': [
            report['0']['support'],
            report['1']['support'],
            report['weighted avg']['support']
        ]
    }).round(4)
    print(tabulate(report_df, headers='keys', tablefmt='psql', showindex=False))


Model Performance Comparison:
+-----------------------+---------------------+-----------------+---------------------+
| Model                 |   Training Accuracy |   Test Accuracy |   Training Time (s) |
|-----------------------+---------------------+-----------------+---------------------|
| titanic_Decision Tree |              0.9789 |          0.7821 |              0.0029 |
| titanic_Bagging Tree  |              0.9789 |          0.8101 |              0.0944 |
| titanic_Random Forest |              0.9789 |          0.8212 |              0.0478 |
+-----------------------+---------------------+-----------------+---------------------+

Classification Reports (Test Set):

titanic_Decision Tree:
+------------------+-------------+----------+------------+-----------+
| Class            |   Precision |   Recall |   F1-Score |   Support |
|------------------+-------------+----------+------------+-----------|
| Not Survived (0) |      0.83   |   0.7905 |     0.8098 |       105 |
| Survive