In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Load the dataset
titanic = sns.load_dataset('titanic')

In [2]:
titanic = titanic.drop('deck', axis=1)


In [3]:
titanic = titanic.dropna()

In [9]:

X = titanic.drop(columns=['survived'])  # Features
y = titanic['survived']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
print(len(X_train), len(y_train))

623 623


In [36]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,712.0,712.0,712.0,712.0,712.0,712.0
mean,0.404494,2.240169,29.642093,0.514045,0.432584,34.567251
std,0.491139,0.836854,14.492933,0.930692,0.854181,52.938648
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,20.0,0.0,0.0,8.05
50%,0.0,2.0,28.0,0.0,0.0,15.64585
75%,1.0,3.0,38.0,1.0,1.0,33.0
max,1.0,3.0,80.0,5.0,6.0,512.3292


In [13]:
# Pipeline: Integrate preprocessing into the training process
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier())
])

# Number of bootstrap samples and trees in the ensemble
n_samples = len(X_train)
n_trees = 50

def bootstrap_sample(X, y):
    indices = np.random.choice(len(X), size=len(X), replace=True)
    # return X[indices], y[indices]
    return X.iloc[indices], y.iloc[indices]



In [38]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [3]:
titanic = titanic.drop('embark_town', axis=1)

In [4]:
titanic = titanic.drop('who', axis=1)

In [5]:
titanic = titanic.drop('embarked', axis=1)

In [6]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    891 non-null    int64   
 1   pclass      891 non-null    int64   
 2   sex         891 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       891 non-null    int64   
 5   parch       891 non-null    int64   
 6   fare        891 non-null    float64 
 7   class       891 non-null    category
 8   adult_male  891 non-null    bool    
 9   alive       891 non-null    object  
 10  alone       891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(2)
memory usage: 58.6+ KB


In [7]:
titanic['sex'] = titanic['sex'].replace({'female': 1, 'male': 0})
titanic['class'] = titanic['class'].replace({'First':1, 'Second':2, 'Third':3})
titanic['alive'] = titanic['alive'].replace({'no':0, 'yes':1})
titanic['alone'] = titanic['alone'].replace({'False':0, 'True':1})


In [8]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,adult_male,alive,alone
0,0,3,0,22.0,1,0,7.25,3,True,0,False
1,1,1,1,38.0,1,0,71.2833,1,False,1,False
2,1,3,1,26.0,0,0,7.925,3,False,1,True
3,1,1,1,35.0,1,0,53.1,1,False,1,False
4,0,3,0,35.0,0,0,8.05,3,True,0,True


In [14]:
# Train multiple decision trees on different bootstrap samples
trees = []
for _ in range(n_trees):
    X_sample, y_sample = bootstrap_sample(X_train, y_train)
    
    # Clone the pipeline for each tree
    cloned_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('clf', DecisionTreeClassifier())])
    cloned_pipeline.fit(X_sample, y_sample)
    trees.append(cloned_pipeline)


ValueError: Input X contains NaN.
DecisionTreeClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values