# Precision-Recall Tradeoff

In [None]:
import pandas as pd
from sklearn import set_config
# Make scikit-learn transformers return pandas DataFrames (with column names)
# instead of NumPy arrays.
set_config(transform_output="pandas")

In [None]:
# Load the Titanic passenger data from disk and preview the first rows.
csv_path = "../data/Dataset_Titanic.csv"
titanic_df = pd.read_csv(csv_path)
titanic_df.head()

In [None]:
# Count the missing values in each column.
titanic_df.isnull().sum()

In [None]:
# Prediction target: the "Survived" column.
target_column = "Survived"
y = titanic_df[target_column]


In [None]:
# Feature columns used as model input.
features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
]
X = titanic_df[features]

# Preview the selected features.
X.head()

In [None]:
# Hold out 20 % of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Preprocessing: one-hot encode "Sex", impute missing "Age" values, and pass
# every other column through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Dense output; categories not seen during fit are ignored at transform time.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
# The default strategy fills missing values with the column mean.
imp = SimpleImputer()

column_steps = [
    ("ohe", ohe, ["Sex"]),
    ("imputer", imp, ["Age"]),
]
ct = ColumnTransformer(transformers=column_steps, remainder="passthrough")

# Preview the transformed training features.
ct.fit_transform(X_train)

In [None]:
# Build the full model pipeline: preprocessing followed by a decision tree.
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Classifier (= ML model). The seed is fixed (same value as the train/test
# split) because the tree chooses randomly between equally good splits;
# without it the reported metrics can change from run to run.
clf = DecisionTreeClassifier(random_state=42)

# Chain preprocessing and classifier into a single estimator.
pipe = Pipeline([
    ('preprocessor', ct),
    ('classifier', clf),
])

In [None]:
# Fit the preprocessing steps and the classifier on the training data.
pipe.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test set.
y_pred = pipe.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the default predictions.
from sklearn.metrics import classification_report

baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

# Adjusting the Classification Threshold

In [None]:
# Source: https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
# Predict class labels with the fitted pipeline (default decision rule) as a
# reference before the threshold is changed.
y_pred_new = pipe.predict(X_test)
y_pred_new

In [None]:
# Get the predicted class probabilities for the test set.
# Each row: [probability for class 0, probability for class 1]
pipe.predict_proba(X_test)

In [None]:
# Keep only the probabilities for class 1 ("survived"): predict_proba returns
# one column per class, so class 1 is column index 1 (index 0 is class 0).
pipe.predict_proba(X_test)[:, 1]

In [None]:
# Move the decision threshold away from the default of 0.5.
import numpy as np

# Minimum probability of class 0 (dead) required to predict "dead".
threshold = 0.1

# Column 0 of predict_proba holds the probability of class 0 (dead).
proba_dead = pipe.predict_proba(X_test)[:, 0]

# Assign class 0 (dead) when its probability reaches the threshold;
# otherwise predict class 1 (survived).
y_pred_new = np.where(proba_dead >= threshold, 0, 1)
y_pred_new

In [None]:
# Evaluate the predictions made with the adjusted threshold.
adjusted_report = classification_report(y_test, y_pred_new)
print(adjusted_report)

In [None]:
# Confusion matrix for the threshold-adjusted predictions
# (rows: true classes, columns: predicted classes).
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred_new)
cm