# Solution for Exercise Titanic

In [None]:
import pandas as pd
from sklearn import set_config
# Have every scikit-learn transformer return pandas DataFrames (keeping
# column names) instead of NumPy arrays, so intermediate results stay readable.
set_config(transform_output="pandas")

In [None]:
# Load the Titanic dataset "Dataset_titanic.csv" from Moodle into a Pandas DataFrame.
# The path is relative to the notebook's working directory, so the CSV has to
# be placed in the sibling "data" folder first.
titanic_df = pd.read_csv("../data/Dataset_titanic.csv")

In [None]:
# Analyze the missing data and the data types:
# info() prints, for every column, its dtype and the number of non-null
# entries, which shows which columns contain missing values.
titanic_df.info()

In [None]:
# Split the dataset into model inputs and prediction target:
# X holds the feature columns 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
# y holds the 'Survived' label column.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
X = titanic_df.loc[:, features]
y = titanic_df.loc[:, 'Survived']


In [None]:
# Conduct a train-test split with a 30% test ratio and a random state of 42
from sklearn.model_selection import train_test_split

# test_size=0.3 holds out 30% of the rows for evaluation, as the task states.
# stratify=y keeps the proportion of survivors the same in both splits;
# random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Instantiate a Simple Imputer with a Median Strategy:
# missing values are replaced by the median of the column, which is learned
# when the imputer is fitted.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy="median")

In [None]:
# Instantiate a One Hot Encoder that ignores unknown categories:
# a category that was not seen during fit is encoded as all zeros at
# transform time instead of raising an error.
# sparse_output=False yields a dense result, which is needed because the
# transformers are configured above to return pandas DataFrames.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [None]:
# Build the preprocessing step with a Column Transformer:
#   - 'Sex' is one-hot encoded,
#   - 'Age' has its missing values imputed,
#   - every remaining column is passed through unchanged.
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    transformers=[
        ('ohe', ohe, ['Sex']),
        ('imputer', imp, ['Age']),
    ],
    remainder='passthrough',
)

# Preview the transformed training data (shown as the cell output).
ct.fit_transform(X_train)

In [None]:
# Create a Decision Tree Classifier with Gini impurity as a splitting criterion.
# NOTE(review): no random_state is set, so the fitted tree (and the metrics
# below) may differ slightly between runs -- confirm whether that is intended.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='gini')

In [None]:
# Chain preprocessing and model into a single Pipeline so that both are
# fitted and applied together:
#   1. 'preprocessor' -- the Column Transformer
#   2. 'classifier'   -- the Decision Tree
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[
    ('preprocessor', ct),
    ('classifier', clf),
])

# Display the pipeline (shown as the cell output).
pipe

In [None]:
# Train the Pipeline on the train set:
# the preprocessor is fitted on X_train and its output is then used to
# fit the classifier against y_train.
pipe.fit(X_train, y_train)

In [None]:
# Predict the labels for the test set:
# X_test passes through the already fitted preprocessor before the
# classifier predicts.
y_pred = pipe.predict(X_test)

In [None]:
# Compare the predicted labels with the true labels of the test set in a
# confusion matrix and plot it.
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Human-readable names for the two classes, in the order of clf.classes_.
display_labels = ['dead', 'survived']

# clf is the pipeline's final step and was fitted in place by pipe.fit,
# so its classes_ attribute is available here.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=clf.classes_)

disp = ConfusionMatrixDisplay(cm, display_labels=display_labels)
disp.plot()
plt.show()

In [None]:
# Print a classification report (precision, recall, F1-score and support
# per class) for the test-set predictions.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=y_pred))