In [None]:
import polars as pl

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import XGBClassifier

import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets


In [None]:
# Ask scikit-learn transformers to return polars DataFrames from transform().
set_config(transform_output='polars')

# Load the Titanic training data and lower-case every column name; the later
# cells refer to columns by their lower-case names ('age', 'survived', ...).
df = pl.read_csv('datasets/titanic_train.csv')
df = df.rename(dict(zip(df.columns, map(str.lower, df.columns))))

In [None]:
# Numeric features offered in the dropdown; the first entry is preselected.
columns = ['age', 'fare', 'parch']
dropdown = widgets.Dropdown(options=columns, value=columns[0])


def plot(column):
    """Compare the distribution of one numeric column between survival groups.

    Draws a KDE plot and a box plot (outliers hidden) side by side, both split
    by ``survived``, then prints the 25% quantile, median and 75% quantile of
    ``column`` for each value of ``survived``.

    Args:
        column: name of the column in ``df`` to inspect.
    """
    _, (kde_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 5))
    sns.kdeplot(data=df, x=column, hue='survived', ax=kde_ax)
    sns.boxplot(data=df, x=column, hue='survived', ax=box_ax, showfliers=False)
    plt.show()

    target = pl.col(column)
    quartiles = [
        target.quantile(0.25).alias('25%'),
        target.median().alias('median'),
        target.quantile(0.75).alias('75%'),
    ]
    print(df.group_by('survived').agg(quartiles).sort('survived'))


# Re-run plot() whenever a different column is picked in the dropdown.
widgets.interact(plot, column=dropdown)


In [None]:
# Categorical features offered in the dropdown; the first entry is preselected.
# NOTE: this cell rebinds `dropdown` and `plot`, which an earlier cell also defines.
cat_columns = ["embarked", "sex", "pclass"]
dropdown = widgets.Dropdown(options=cat_columns, value=cat_columns[0])


def plot(column):
    """Show the survival rate for every category of ``column``.

    Groups ``df`` by ``column`` and computes, per group, the sum of
    ``survived``, the count of ``survived`` and their ratio. The ratio is drawn
    as a bar plot and the aggregated table is printed underneath.

    Args:
        column: name of the column in ``df`` to group by.
    """
    survived = pl.col('survived')
    survival_by_category = (
        df.group_by(column)
        .agg(
            [
                survived.sum().alias('survived'),
                survived.count().alias('total'),
                (survived.sum() / survived.count()).alias('survival_rate'),
            ]
        )
        .sort(column)
    )

    sns.barplot(data=survival_by_category, x=column, y='survival_rate')
    plt.show()
    print(survival_by_category)


# Re-run plot() whenever a different column is picked in the dropdown.
widgets.interact(plot, column=dropdown)

In [None]:
# Mean-impute the two numeric features (df itself is left untouched) and keep
# the target next to them, so the correlation is computed without nulls in
# 'age' and 'fare'.
relevant_df = df.select(
    [pl.col(name).fill_null(strategy='mean') for name in ('age', 'fare')]
    + [pl.col('survived')]
)

print('correlation with survived:')
# 'survived' is the third selected column, so index 2 of the (symmetric)
# correlation matrix corresponds to it.
print(relevant_df.corr()[2])

In [None]:
# Separate the target from the features.
y_df = df['survived']
x_df = df.drop('survived')

# Split stratified on the target with a fixed seed; test_size is left at
# scikit-learn's default.
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(
    x_df,
    y_df,
    stratify=y_df,
    random_state=0,
)

# Display the training features as the cell output.
x_train_df

In [None]:
# Preprocessing shared by every model in this notebook:
#   * age, fare             -> SimpleImputer (default strategy), then StandardScaler
#   * embarked, sex, pclass -> dense one-hot encoding
# Columns not listed here are not passed to make_column_transformer.
ct = make_column_transformer(
    (make_pipeline(SimpleImputer(), StandardScaler()), ["age", "fare"]),
    (
        # handle_unknown='ignore': a category that appears only in the data
        # being transformed (e.g. only in the test split) is encoded as all
        # zeros instead of raising an error.
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        ["embarked", "sex", "pclass"],
    ),
    verbose_feature_names_out=False)


# Note: click on pipeline elements to see more details
# The stochastic estimators use the same fixed seed as the train/test split so
# that their scores are reproducible from one run to the next.
classifiers = {
    'logistic_regression': LogisticRegression(),
    'stochastic_gradient_descent': SGDClassifier(loss='log_loss', random_state=0),
    'random_forest': RandomForestClassifier(random_state=0),
    'xgboost': XGBClassifier(random_state=0) # see https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn for more parameters
}

from sklearn.metrics import roc_curve

def run_pipeline_with_classifier(clf_name):
    """Fit one of the registered classifiers and report how it performs.

    Builds a pipeline from the shared column transformer ``ct`` and
    ``classifiers[clf_name]``, fits it on the training split, prints the
    pipeline's ``score`` on the test split, draws the ROC curve with the area
    underneath shaded, and prints the ROC AUC.

    Args:
        clf_name: key into the ``classifiers`` dict.
    """
    model = make_pipeline(ct, classifiers[clf_name])
    model.fit(x_train_df, y_train_df)
    print('Solver:', clf_name)
    print()
    print('Score: ', model.score(x_test_df, y_test_df))

    # Column 1 of predict_proba is used as the score of the positive class.
    positive_prob = model.predict_proba(x_test_df)[:, 1]
    fpr, tpr, _ = roc_curve(y_test_df, positive_prob)
    sns.lineplot(x=fpr, y=tpr, errorbar=None)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    auc_roc = roc_auc_score(y_test_df, positive_prob)
    plt.fill_between(fpr, tpr, color="skyblue", alpha=0.4)
    print('AUC ROC:', auc_roc)


# Offer every registered classifier; logistic regression is preselected.
classifiers_selector = widgets.Dropdown(options=tuple(classifiers), value='logistic_regression')
widgets.interact(run_pipeline_with_classifier, clf_name=classifiers_selector)

In [None]:
# The lower bound is 5 (one slider step) rather than 0: RandomForestClassifier
# requires at least one tree and raises on n_estimators=0.
n_estimators_widget = widgets.IntSlider(value=100, min=5, max=300, step=5)
max_depth_widget = widgets.IntSlider(min=1, max=20, step=1)

print('Performance of Random Forest with different hyperparameters:')
def run_pipeline_with_random_forest(n_estimators, max_depth):
    """Fit a random forest with the given hyperparameters and print its score.

    The forest is placed behind the shared column transformer ``ct``, fitted on
    the training split and scored on the test split.

    Args:
        n_estimators: number of trees in the forest (must be >= 1).
        max_depth: maximum depth of each tree.
    """
    # A fixed seed (the same one as the train/test split) keeps the score
    # reproducible, so that moving a slider shows the effect of the
    # hyperparameter rather than random variation between fits.
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=0,
    )
    clf = make_pipeline(ct, forest)
    clf.fit(x_train_df, y_train_df)
    print(clf.score(x_test_df, y_test_df))

widgets.interact(run_pipeline_with_random_forest, n_estimators=n_estimators_widget, max_depth=max_depth_widget)


In [None]:
# Slider for the solver's iteration cap (20 to 100 in steps of 5, starting at 40).
max_iter_widget = widgets.IntSlider(value=40, min=20, max=100, step=5)

print('Performance of Logistic Regression with different hyperparameters:')


def run_pipeline_with_logistic(max_iter):
    """Fit logistic regression with the given iteration cap and print its score.

    The estimator is placed behind the shared column transformer ``ct``, fitted
    on the training split and scored on the test split.

    Args:
        max_iter: value passed to ``LogisticRegression(max_iter=...)``.
    """
    logistic = LogisticRegression(max_iter=max_iter)
    model = make_pipeline(ct, logistic)
    model.fit(x_train_df, y_train_df)
    test_score = model.score(x_test_df, y_test_df)
    print('Score: ', test_score)


widgets.interact(run_pipeline_with_logistic, max_iter=max_iter_widget)


In [None]:
# Baseline model: default logistic regression behind the shared preprocessing.
clf = make_pipeline(ct, LogisticRegression())
clf.fit(x_train_df, y_train_df)
prob = clf.predict_proba(x_test_df)
print('score: ', clf.score(x_test_df, y_test_df))

def convert_prob_to_df(prob):
    """Turn a predict_proba result into a polars DataFrame, one column per class.

    Column ``i`` of ``prob`` is stored under the name ``str(clf.classes_[i])``,
    so each column holds the predicted probability of the class it is named
    after.

    Args:
        prob: 2-D array indexable as ``prob[:, i]``, with one column per entry
            of ``clf.classes_`` (the module-level fitted pipeline).

    Returns:
        A polars DataFrame with one column per class.
    """
    # Index by i: every class must read its own probability column. Reading
    # column 0 for all classes would label the class-0 probability as '1'.
    return pl.DataFrame({str(label): list(prob[:, i]) for i, label in enumerate(clf.classes_)})

prob_df = convert_prob_to_df(prob)
# Sorted predicted probabilities of class '1' across the test split.
sns.lineplot(data=prob_df['1'].sort())

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Decision threshold slider: 0 to 1 in steps of 0.1, starting at 0.5.
threshold_widget = widgets.FloatSlider(value=0.5, min=0, max=1, step=0.1)


def show_confusion(threshold):
    """Draw the confusion matrix obtained with a custom decision threshold.

    Survival is predicted only when the predicted probability of survival
    (column 1 of ``clf.predict_proba``) is strictly greater than ``threshold``.
    In the resulting matrix, cm[1][0] is the number of those who survived but
    were predicted as not survived.

    Args:
        threshold: probability above which survival is predicted.
    """
    survival_prob = clf.predict_proba(x_test_df)[:, 1]
    y_pred = [int(p > threshold) for p in survival_prob]

    cm = confusion_matrix(y_test_df, y_pred)
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
    plt.xlabel('predicted survived')
    plt.ylabel('actual survived')


widgets.interact(show_confusion, threshold=threshold_widget)