# Bank Marketing Dataset - Notebook 02

Predicting Term Deposit Subscriptions

This notebook demonstrates how to train a model directly on the notebook instance, without provisioning any extra compute resources.

In [None]:
!ls -la

In [None]:
!ls -la data

In [None]:
!pip install sagemaker ipywidgets --upgrade --quiet

## Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

## Define preprocessing functions

In [None]:
def rebalance(data, target_col="Exited", random_state=1234):
    """
    Balance a binary-labelled DataFrame by downsampling the majority class.

    Rows are split by the value of ``target_col`` (0 or 1). The larger group
    is randomly downsampled, without replacement, to the size of the smaller
    group, and the two groups are then concatenated. When both groups have the
    same size, class 1 is treated as the majority.

    Args:
        data (pd.DataFrame): DataFrame containing ``target_col`` encoded as 0/1.
            Rows whose target is neither 0 nor 1 are dropped.
        target_col (str): name of the binary target column. Defaults to
            "Exited".
        random_state (int): seed passed to ``resample`` so the downsampling is
            reproducible. Defaults to 1234.

    Returns:
        pd.DataFrame: balanced DataFrame with the downsampled majority rows
        first, followed by all minority rows. The index is not reset.
    """
    negatives = data[data[target_col] == 0]
    positives = data[data[target_col] == 1]

    if len(negatives) > len(positives):
        majority, minority = negatives, positives
    else:
        majority, minority = positives, negatives

    # Sample without replacement so no majority row is duplicated.
    majority_downsampled = resample(
        majority,
        n_samples=len(minority),
        replace=False,
        random_state=random_state,
    )

    return pd.concat([majority_downsampled, minority])


def preprocess(df):
    """
    Select features, rebalance the classes, split and transform the data.

    Steps:
        1. Keep only the target ("Exited") and the modelling features.
        2. Balance the target classes with ``rebalance``.
        3. Split the balanced data 70/30 into training and test sets using a
           fixed random seed.
        4. Fit a ColumnTransformer on the training set only (standard scaling
           for numeric columns, one-hot encoding for categorical columns) and
           apply it to both sets.
        5. Move the target column to the first position in both sets.

    Note that rebalancing happens before the split, so the test set is drawn
    from the balanced data as well.

    Args:
        df (pd.DataFrame): DataFrame with features and target variables

    Returns:
        ColumnTransformer: ColumnTransformer fitted on the training set
        pd.DataFrame: training data with transformed features, target first
        pd.DataFrame: test data with transformed features, same column order
    """
    target = "Exited"
    filter_feat = [
        "Exited",
        "CreditScore",
        "Geography",
        "Gender",
        "Age",
        "Tenure",
        "Balance",
        "NumOfProducts",
        "HasCrCard",
        "IsActiveMember",
        "EstimatedSalary",
    ]
    cat_cols = ["Geography", "Gender"]
    num_cols = [
        "CreditScore",
        "Age",
        "Tenure",
        "Balance",
        "NumOfProducts",
        "HasCrCard",
        "IsActiveMember",
        "EstimatedSalary",
    ]
    data = df.loc[:, filter_feat]
    data_bal = rebalance(data=data)

    df_train, df_test = train_test_split(
        data_bal, test_size=0.3, random_state=1912
    )

    # The target is not listed in any transformer, so it is carried through
    # untouched by remainder="passthrough".
    # NOTE(review): with drop="first", a category unseen during fit is encoded
    # as all zeros, i.e. indistinguishable from the dropped first category --
    # confirm this is acceptable for inference-time data.
    col_transf = make_column_transformer(
        (StandardScaler(), num_cols),
        (
            OneHotEncoder(
                handle_unknown="ignore", drop="first", sparse_output=False
            ),
            cat_cols,
        ),
        remainder="passthrough",
        verbose_feature_names_out=False,
    ).set_output(transform="pandas")

    # Fit on the training split only; the test split is transformed with the
    # statistics and categories learned from training data.
    df_train = col_transf.fit_transform(df_train)
    df_test = col_transf.transform(df_test)

    # Put the target first, selecting it by name rather than relying on it
    # being the last column of the transformer output.
    feature_cols = [col for col in df_train.columns if col != target]
    ordered_cols = [target, *feature_cols]
    df_train = df_train[ordered_cols]
    df_test = df_test[ordered_cols]

    return col_transf, df_train, df_test

## Open data

In [None]:
# Load the raw dataset from the local data folder into a DataFrame.
df = pd.read_csv("data/Churn_Modelling.csv")
# Preview the first three rows.
df.head(3)

## Preprocess the data 

In [None]:
# Run the preprocessing pipeline: returns the fitted column transformer plus
# the transformed training and test DataFrames.
col_transf, df_train, df_test = preprocess(df)
# Preview the first two rows of the transformed training data.
df_train.head(2)

## Train Model

In [None]:
def train(X_train, y_train):
    """
    Fit a logistic regression classifier on the given training data.

    Args:
        X_train (pd.DataFrame): feature matrix
        y_train (pd.Series): target values aligned with ``X_train``

    Returns:
        LogisticRegression: the fitted estimator
    """
    classifier = LogisticRegression(max_iter=1000)
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)

In [None]:
# Split the training frame into the target vector and the feature matrix.
y_train = df_train["Exited"]
X_train = df_train.drop(columns="Exited")

model = train(X_train=X_train, y_train=y_train)

## Evaluate model

In [None]:
# Split the test frame into the target vector and the feature matrix.
y_test = df_test["Exited"]
X_test = df_test.drop(columns="Exited")

# Predict class labels for the held-out rows.
y_pred = model.predict(X_test)

# Report each metric rounded to two decimals.
print(f"Accuracy score: {accuracy_score(y_true=y_test, y_pred=y_pred):.2f}")
print(f"Precision score: {precision_score(y_true=y_test, y_pred=y_pred):.2f}")
print(f"Recall score: {recall_score(y_true=y_test, y_pred=y_pred):.2f}")
print(f"F1 score: {f1_score(y_true=y_test, y_pred=y_pred):.2f}")

In [None]:
# Use the model's own class ordering for both the matrix and the axis labels.
class_labels = model.classes_
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
conf_mat_disp = ConfusionMatrixDisplay(conf_mat, display_labels=class_labels)
conf_mat_disp.plot()
plt.show()