In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
from scipy.stats import mode
import time

In [2]:
data = sns.load_dataset('titanic')
df = data.drop(['deck','class','who','adult_male','embark_town','alive','alone'],axis=1)
df['age'] = df['age'].fillna(df.groupby('sex')['age'].transform('mean'))
df['fam'] = df['parch'] + df['sibsp']

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,fam
0,0,3,male,22.0,1,0,7.25,S,1
1,1,1,female,38.0,1,0,71.2833,C,1
2,1,3,female,26.0,0,0,7.925,S,0
3,1,1,female,35.0,1,0,53.1,S,1
4,0,3,male,35.0,0,0,8.05,S,0


In [4]:
df = pd.get_dummies(df, columns=['sex','embarked','pclass'], dtype=int)

In [5]:
y = df['survived']
X = df.drop('survived', axis=1)

In [6]:
all_columns = df.columns.to_list()
random_columns = [np.random.choice(all_columns, 3, replace=True).tolist() for i in range (10)]

In [7]:
random_columns

[['parch', 'survived', 'sex_female'],
 ['sex_male', 'age', 'embarked_C'],
 ['age', 'pclass_1', 'survived'],
 ['sex_female', 'fare', 'survived'],
 ['sex_male', 'fare', 'age'],
 ['sex_male', 'embarked_C', 'fam'],
 ['embarked_S', 'sex_female', 'embarked_S'],
 ['pclass_3', 'pclass_3', 'fam'],
 ['fam', 'embarked_Q', 'pclass_1'],
 ['pclass_3', 'parch', 'fare']]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
def build_decision_trees(X_train, y_train, num_models=10, num_columns=3):
    all_columns = X_train.columns.to_list()
    models = []
    for i in range(num_models):
        selected_columns = np.random.choice(all_columns, num_columns, replace=True).tolist()
        X_train_subset = X_train[selected_columns]
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(X_train_subset, y_train)
        models.append((tree, selected_columns))
    return models
models = build_decision_trees(X_train, y_train, num_models=10, num_columns=3)

In [9]:
models

[(DecisionTreeClassifier(random_state=42), ['parch', 'fam', 'fam']),
 (DecisionTreeClassifier(random_state=42), ['sibsp', 'fam', 'pclass_1']),
 (DecisionTreeClassifier(random_state=42), ['parch', 'parch', 'age']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_C', 'sibsp', 'pclass_3']),
 (DecisionTreeClassifier(random_state=42),
  ['sex_male', 'embarked_C', 'sex_female']),
 (DecisionTreeClassifier(random_state=42), ['fare', 'age', 'parch']),
 (DecisionTreeClassifier(random_state=42), ['pclass_3', 'sibsp', 'sex_male']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_S', 'pclass_3', 'sex_male']),
 (DecisionTreeClassifier(random_state=42),
  ['sibsp', 'sex_male', 'embarked_C']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_S', 'sex_female', 'pclass_1'])]

In [10]:
def measure_accuracy(models, X_train, y_train):
    predictions = []
    for tree, selected_columns in models:
        X_train_subset = X_train[selected_columns]
        pred = tree.predict(X_train_subset)
        predictions.append(pred)
    predictions = np.array(predictions).T
    final_predictions, i = mode(predictions, axis=1)
    final_predictions = final_predictions.flatten()
    accuracy = accuracy_score(y_train, final_predictions)
    #print(predictions)
    #print(final_predictions)
    return accuracy

In [None]:
#