# Bank Marketing Dataset

Predicting Term Deposit Subscriptions

This notebook contains an exploratory analysis that demonstrates some features that can be used in a SageMaker notebook instance.

## Importing libs

In [None]:
!pip install -U pip missingno --quiet

In [None]:
# Data
import pandas as pd

# Export
import pickle

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# AWS
import boto3

## Get data from S3

In [None]:
!mkdir data

In [None]:
# S3 bucket name; same value as the bucket named in the commented-out
# s3.download_file call further down.
# NOTE(review): bucket_in is not referenced by any active code visible in this notebook section.
bucket_in = "mlops-material"
# File name of the dataset CSV.
FILE_NAME = "Churn_Modelling.csv"

In [None]:
s3 = boto3.client("s3")

# Download the dataset
# s3.download_file(
#    "mlops-material", FILE_NAME, f"data/{FILE_NAME}"
# )

!wget https://mlops-material.s3.us-east-2.amazonaws.com/Churn_Modelling.csv
!mv Churn_Modelling.csv data/Churn_Modelling.csv

In [None]:
# List the contents of the data/ directory
!ls data

## Open Data

In [None]:
# Read the CSV from data/, building the path from FILE_NAME so the file name
# is not repeated as a literal.
df = pd.read_csv(f"data/{FILE_NAME}")
# Fixed random_state so the three preview rows are the same on every run.
df.sample(3, random_state=42)

## Check data

In [None]:
# Print the frame's dimensions, then the dtype of every column.
# sep="\n" puts each label on its own line above its value.
print("Shape", df.shape, sep="\n")
print("\nTypes", df.dtypes, sep="\n")

### Check missing values

In [None]:
# Number of null entries per column, shown as a single wide row.
df.isna().sum().to_frame().T

In [None]:
# Draw missingno's matrix plot for df, then render the figure.
msno.matrix(df)
plt.show()

In [None]:
sns.set(style="whitegrid")
# Row counts per Gender value, with one bar per Exited value.
g = sns.countplot(data=df, x="Gender", hue="Exited")
g.set_title("Row count per Gender, split by Exited")
# Rotate the x tick labels through tick_params instead of re-setting the label
# texts; this does not depend on the current tick positions.
g.tick_params(axis="x", labelrotation=90)

In [None]:
sns.set(style="whitegrid")
# Row counts per Geography value, with one bar per Exited value.
g = sns.countplot(data=df, x="Geography", hue="Exited")
g.set_title("Row count per Geography, split by Exited")
# Rotate the x tick labels through tick_params instead of re-setting the label
# texts; this does not depend on the current tick positions.
g.tick_params(axis="x", labelrotation=90)

In [None]:
# Grid of pairwise plots over the columns of df.
# NOTE(review): no column subset is passed, so identifier-like columns (if the
# file has any) are plotted too and the grid may be slow to render; confirm.
sns.pairplot(df)

In [None]:
def correlation_matrix(df, cols):
    """Show an annotated heatmap of the pairwise correlations of ``df[cols]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to correlate.
    cols : list of str
        Column labels to select; the matrix comes from ``df[cols].corr()``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show``; nothing is returned.
    """
    # Use a dedicated figure/axes instead of whatever figure happens to be
    # current, so the heatmap never lands on a leftover plot.
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        df[cols].corr(),
        annot=True,
        linewidths=0.5,
        annot_kws={"size": 12},
        linecolor="w",
        cmap="RdBu",
        # Correlations lie in [-1, 1]; fixing the range puts zero at the
        # midpoint of the diverging colormap.
        vmin=-1,
        vmax=1,
        ax=ax,
    )
    # Set the tick label size after the heatmap has created its ticks.
    ax.tick_params(axis="both", labelsize=10)
    plt.show(block=True)

# Columns whose pairwise correlations are drawn in the heatmap.
num_cols = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
]

correlation_matrix(df, num_cols)