<a href="https://colab.research.google.com/github/HigherGround189/EGT309-Team-Harish-Kanna/blob/main/EDA/AI_Solution_Development_Case_Study_Ying_Ray_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup


In [None]:
!curl -L -o bmarket.db https://github.com/HigherGround189/EGT309-Team-Harish-Kanna/raw/refs/heads/main/data/01_raw/bmarket.db
!curl -L -o plotting_utils.py https://github.com/HigherGround189/EGT309-Team-Harish-Kanna/raw/refs/heads/main/src/egt309_pipeline/plotting_utils.py

Downloading...
From: https://drive.google.com/uc?id=17S8vGBsbaAcuxgwSZLGhOOrrhfaqio7j
To: /content/bmarket.db
  0% 0.00/3.15M [00:00<?, ?B/s]100% 3.15M/3.15M [00:00<00:00, 146MB/s]


In [2]:
import operator
import sqlite3
import warnings

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
from plotly.subplots import make_subplots

warnings.filterwarnings("ignore")
pl.Config(set_tbl_cols=-1, fmt_str_lengths=65535, set_tbl_width_chars=65535)

<polars.config.Config at 0x795b6c70a0c0>

In [3]:
conn = sqlite3.connect("/content/bmarket.db")
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_name = cursor.fetchall()[0][0]
print(table_name)

bank_marketing


# Utility Functions

In [4]:
def lprint(dataframe, num=-1):
    with pl.Config(tbl_rows=num):
        print(dataframe)

In [5]:
def cprint(input, long=False, enable_print=False):
    if enable_print:
        if long:
            lprint(input)

        else:
            print(input)

In [183]:
def plot_categorical(df, col_name, debug=False):
    # Compute counts and percentages of each value, along with creating the graph's display label
    counts = df[col_name].value_counts()
    counts = (
        counts.with_columns(
            # Replace null with "Missing Data"
            pl.col(col_name).fill_null("Missing Data"),
            (pl.col("count") / df.shape[0] * 100).round(2).alias("percentage"),
        )
        .with_columns(
            # Outputs in this format: <COUNT> (<PERCENT>%)
            (
                pl.col("count").cast(pl.String)
                + " ("
                + pl.col("percentage").cast(pl.String)
                + "%)"
            ).alias("display_label")
        )
        .sort("count")
    )
    cprint(counts, enable_print=debug)

    # Plots distribution bar chart
    fig = px.bar(
        counts,
        x=col_name,
        y="count",
        text="display_label",
        color="count",
        color_continuous_scale="blues",
    )

    fig.update_traces(
        textposition="outside",
        hovertemplate=f"{col_name}: %{{x}}<br>Count: %{{text}}<extra></extra>",
    )
    fig.update_layout(
        title=f"{col_name} Distribution",
        yaxis_title="Count",
        title_x=0.5,
        showlegend=False,
    )

    fig.show()

In [184]:
def plot_subscription_distribution_categorical(
    df, col_name, distribution_col="Subscription Status", debug=False
):
    df = df.with_columns(pl.all().exclude(pl.Int64, pl.String).cast(pl.String))
    cprint(df)

    unique_values = sorted(df[distribution_col].unique().to_list(), reverse=True)
    cprint(unique_values, enable_print=debug)

    # Count number of unique values in "Subscription Status", and alias as value name.
    counts = df.group_by(col_name).agg(
        (
            (pl.col(distribution_col) == str(value)).sum().alias(str(value))
            for value in unique_values
        ),
    )
    cprint(counts, enable_print=debug)

    # Calculate total count of each category (yes + no)
    totals = counts.select(
        pl.col(col_name),
        pl.fold(
            acc=pl.lit(0),
            function=operator.add,
            exprs=pl.all().exclude(col_name),
        ).alias("Total Count"),
    )
    cprint(totals, enable_print=debug)

    # Convert from wide -> long format, add total_counts as a column, and calculate display label
    long_counts = (
        counts.sort(by=unique_values)
        .melt(col_name)
        .join(totals, on=col_name, how="left")
        .with_columns(
            # Outputs in this format: <COUNT> (<PERCENT>%)
            (
                pl.col("value").cast(pl.String)
                + " ("
                + (pl.col("value") / pl.col("Total Count") * 100)
                .round(2)
                .cast(pl.String)
                + "%)"
            ).alias("display_label")
        )
    )
    cprint(long_counts, enable_print=debug)

    # Create Stacked bar chart
    fig = px.bar(
        long_counts,
        x=col_name,
        y="value",
        color="variable",
        custom_data=["variable"],  # To access Subscription Status in hovertemplate
        labels={"variable": distribution_col},
        text="display_label",
        barmode="stack",
        color_discrete_map={  # Hardcoded colour values cause jun hoe has an extreme reaction to the regular one
            "no": "rgb(239, 85, 59)",
            "False": "rgb(239, 85, 59)",
            "false": "rgb(239, 85, 59)",
            "yes": "rgb(99, 110, 251)",
            "True": "rgb(99, 110, 251)",
            "true": "rgb(99, 110, 251)",
        },
    )

    fig.update_traces(
        textposition="outside",
        hovertemplate=f"Subscription Status: %{{customdata[0]}}<br>{col_name}: %{{x}}<br>Count: %{{y}}<extra></extra>",
    )
    fig.update_layout(
        title=f"{col_name} Subscribed Distribution", yaxis_title="Count", title_x=0.5
    )

    fig.show()

In [185]:
def plot_numerical(df, col_name, distribution_col="Subscription Status", debug=False):
    # Get describe info for the column
    column = df[col_name]
    describe = (
        column.describe()
        .transpose(column_names="statistic")
        .with_columns(pl.all().round(2))
    )

    # Get unique subscription values (we need each value to have its own stacked bar)
    subscription_values = sorted(df[distribution_col].unique().to_list(), reverse=True)

    # Calculate counts and package dataframe into dict
    # Goal is to construct a dictionary like this:
    # {
    # "yes": {0: 50, 1: 30, 2: 20},
    # "no":  {0: 20, 1: 70, 2: 10}
    # }
    counts_dict = {}
    for status in subscription_values:
        value_counts = df.filter(pl.col("Subscription Status") == status)[
            col_name
        ].value_counts()
        cprint(value_counts, enable_print=debug)
        # Creates dict of this format {x_value : x_value_count}
        counts_dict[status] = dict(
            zip(value_counts[col_name].to_list(), value_counts["count"].to_list())
        )
        cprint(counts_dict, enable_print=debug)

    # Count total number of each x_value / point (bar)
    x_values = sorted(df[col_name].unique().to_list())
    total_per_x = [
        sum(counts_dict[value].get(x, 0) for value in subscription_values)
        for x in x_values
    ]  # Sums both values (eg: If Key=Campaign Call, Value=Count: {yes: {12: 1}, no: {12: 3}} -> 3 + 1 = 4; Thus, Campaign call = 12 is 4 in total)

    hist_traces = []
    for status in subscription_values:
        # Get total count (y_value), and calculate percentage
        y_values = [counts_dict[status].get(x, 0) for x in x_values]
        percents = [
            (y / total * 100) if total > 0 else 0.0
            for y, total in zip(y_values, total_per_x)
        ]
        cprint(percents, enable_print=debug)

        # Create customdata and hovertemplate for graph.
        hist_customdata = [[percentage] for percentage in percents]
        hist_hovertemplate = (
            f"{col_name}: %{{x}}<br>"
            f"{distribution_col}: {status}<br>"
            f"Count: %{{y}} (%{{customdata[0]:.1f}}%)"
        )

        # Build & collect histogram traces to stack
        hist_traces.append(
            go.Bar(  # I know that everything else has been referring to this as a histogram, but I had to do a last minute change to use go.Bar instead. Just pretend it is a histogram, they look basically the same.
                x=x_values,
                y=y_values,
                name=status,
                customdata=hist_customdata,
                hovertemplate=hist_hovertemplate,
            )
        )

    # Create boxplot trace
    box_trace = go.Box(
        x=column, name="Boxplot", hovertemplate=f"{col_name}: %{{x}}<extra></extra>"
    )

    # Combine histogram & boxplot in subplots
    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        row_heights=[0.75, 0.25],
        vertical_spacing=0.05,
    )

    # Add both histogram traces to first row
    for trace in hist_traces:
        fig.add_trace(trace, row=1, col=1)

    # Add boxplot trace to second row
    fig.add_trace(box_trace, row=2, col=1)

    # Create annotation text
    stats_text = (
        f"Min: {describe['min'][0]:g}<br>"
        f"25%: {describe['25%'][0]:g}<br>"
        f"50%: {describe['50%'][0]:g}<br>"
        f"75%: {describe['75%'][0]:g}<br>"
        f"Max: {describe['max'][0]:g}<br>"
        f"Mean: {describe['mean'][0]:g}<br>"
        f"STD: {describe['std'][0]:g}"
    )

    # Add & style annotation box
    fig.add_annotation(
        text=stats_text,
        xref="paper",
        yref="paper",
        x=0.8,
        y=0.8,
        showarrow=False,
        font=dict(size=14, color="black"),
        align="left",
        bgcolor="rgba(255,255,255,0.75)",
        bordercolor="grey",
        borderwidth=1,
        borderpad=6,
    )

    # Update layout & style, and enable stacking for histogram
    fig.update_layout(
        title=f"{col_name} Distribution",
        title_x=0.5,
        bargap=0.05,
        barmode="stack",
        margin=dict(t=60, b=40, l=40, r=40),
        showlegend=False,
    )

    fig.update_xaxes(title_text=col_name, row=2, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)

    fig.show()

In [216]:
def plot_distribution(
    df, col_name, distribution_col="Subscription Status", debug=False
):
    # Check for pandas dataframe and convert it to Polars
    if isinstance(df, pd.DataFrame):
        cprint("Pandas Detected. Converting...", enable_print=debug)
        df = pl.from_pandas(df)

    column = df[col_name]
    if (
        column.dtype == pl.Categorical
        or column.dtype == pl.String
        or column.dtype == pl.Boolean
    ):  # Added pl.Boolean here
        plot_categorical(df, col_name, debug=debug)

        if col_name != distribution_col:
            plot_subscription_distribution_categorical(
                df, col_name, distribution_col, debug=debug
            )

    else:
        plot_numerical(df, col_name, distribution_col, debug=debug)

# Dataset Overview

In [217]:
df = pl.read_database(query=f"SELECT * FROM {table_name}", connection=conn).fill_null(
    "Missing Data"
)
print(df)

shape: (41_188, 12)
┌───────────┬──────────┬─────────────┬────────────────┬─────────────────────┬────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┐
│ Client ID ┆ Age      ┆ Occupation  ┆ Marital Status ┆ Education Level     ┆ Credit Default ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status │
│ ---       ┆ ---      ┆ ---         ┆ ---            ┆ ---                 ┆ ---            ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 │
│ i64       ┆ str      ┆ str         ┆ str            ┆ str                 ┆ str            ┆ str          ┆ str           ┆ str            ┆ i64            ┆ i64                   ┆ str                 │
╞═══════════╪══════════╪═════════════╪════════════════╪═════════════════════╪════════════════╪══════════════╪═══════════════╪════════════════╪══════════════

In [218]:
print(df.describe())

shape: (9, 13)
┌────────────┬─────────────┬───────────┬────────────┬────────────────┬─────────────────┬────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┐
│ statistic  ┆ Client ID   ┆ Age       ┆ Occupation ┆ Marital Status ┆ Education Level ┆ Credit Default ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status │
│ ---        ┆ ---         ┆ ---       ┆ ---        ┆ ---            ┆ ---             ┆ ---            ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 │
│ str        ┆ f64         ┆ str       ┆ str        ┆ str            ┆ str             ┆ str            ┆ str          ┆ str           ┆ str            ┆ f64            ┆ f64                   ┆ str                 │
╞════════════╪═════════════╪═══════════╪════════════╪════════════════╪═════════════════╪════════════════╪════════════

In [219]:
def get_unique_values_df(df):
    unique_data = []
    for col in df.columns:
        unique_vals = df[col].unique().to_list()
        unique_data.append({"Column Name": col, "Unique Values": unique_vals})
    unique_df = pl.DataFrame(unique_data).with_columns(
        pl.col("Unique Values").list.len().alias("Count")
    )

    with pl.Config(set_fmt_table_cell_list_len=13):
        lprint(unique_df)

    return unique_df


unique_df = get_unique_values_df(df)

shape: (12, 3)
┌───────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────┐
│ Column Name           ┆ Unique Values                                                                                                                                                  ┆ Count │
│ ---                   ┆ ---                                                                                                                                                            ┆ ---   │
│ str                   ┆ list[str]                                                                                                                                                      ┆ u32   │
╞═══════════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╪═══════╡
│ Client I

# Basic Data Cleaning

In [220]:
semi_cleaned_df = df.with_columns(
    pl.col("Age").str.extract(r"\d+", group_index=0).cast(pl.Int64),  # 30 years -> 30
    pl.col("Credit Default")
    .replace(r"yes", True)
    .replace(r"no", False),  # Replaced values with Booleans ❌
    pl.col("Housing Loan")
    .replace(r"yes", True)
    .replace(r"no", False),  # Replaced values with Booleans ❌
    pl.col("Personal Loan")
    .replace(r"yes", True)
    .replace(r"no", False),  # Replaced values with Booleans ❌
    pl.col("Contact Method")
    .str.replace(r"Cell|cellular", "Cellular")
    .str.replace(
        r"telephone", "Telephone"
    ),  # (Cell, cellular) -> Cellular, (Telephone, telephone) -> Telephone
    pl.when(pl.col("Previous Contact Days") == 999)
    .then(-1)
    .otherwise(pl.col("Previous Contact Days"))
    .alias("Previous Contact Days"),  # 999 -> -1 (For plotting reasons)
    pl.col("Subscription Status").replace_strict(
        {"no": False, "yes": True}
    ),  # Replaced values with Booleans
)

print(semi_cleaned_df)

shape: (41_188, 12)
┌───────────┬─────┬─────────────┬────────────────┬─────────────────────┬────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┐
│ Client ID ┆ Age ┆ Occupation  ┆ Marital Status ┆ Education Level     ┆ Credit Default ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status │
│ ---       ┆ --- ┆ ---         ┆ ---            ┆ ---                 ┆ ---            ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 │
│ i64       ┆ i64 ┆ str         ┆ str            ┆ str                 ┆ str            ┆ str          ┆ str           ┆ str            ┆ i64            ┆ i64                   ┆ bool                │
╞═══════════╪═════╪═════════════╪════════════════╪═════════════════════╪════════════════╪══════════════╪═══════════════╪════════════════╪════════════════╪══════════════════════


# Analysis Per Column

## Client ID

Talk about how useless client id is, and how we can drop it.

In [221]:
removed_client_id = semi_cleaned_df.select(pl.all().exclude("Client ID"))

print(removed_client_id)

shape: (41_188, 11)
┌─────┬─────────────┬────────────────┬─────────────────────┬────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┐
│ Age ┆ Occupation  ┆ Marital Status ┆ Education Level     ┆ Credit Default ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status │
│ --- ┆ ---         ┆ ---            ┆ ---                 ┆ ---            ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 │
│ i64 ┆ str         ┆ str            ┆ str                 ┆ str            ┆ str          ┆ str           ┆ str            ┆ i64            ┆ i64                   ┆ bool                │
╞═════╪═════════════╪════════════════╪═════════════════════╪════════════════╪══════════════╪═══════════════╪════════════════╪════════════════╪═══════════════════════╪═════════════════════╡
│ 57  ┆ technician  ┆ married      

## Campaign Calls

In [222]:
plot_distribution(removed_client_id, "Campaign Calls")

Talk about why absoluting the negative values makes sense (distribution looks correct).



In [223]:
campaign_calls_absoluted = removed_client_id.with_columns(
    pl.col("Campaign Calls").abs()
)

print(campaign_calls_absoluted)

shape: (41_188, 11)
┌─────┬─────────────┬────────────────┬─────────────────────┬────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┐
│ Age ┆ Occupation  ┆ Marital Status ┆ Education Level     ┆ Credit Default ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status │
│ --- ┆ ---         ┆ ---            ┆ ---                 ┆ ---            ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 │
│ i64 ┆ str         ┆ str            ┆ str                 ┆ str            ┆ str          ┆ str           ┆ str            ┆ i64            ┆ i64                   ┆ bool                │
╞═════╪═════════════╪════════════════╪═════════════════════╪════════════════╪══════════════╪═══════════════╪════════════════╪════════════════╪═══════════════════════╪═════════════════════╡
│ 57  ┆ technician  ┆ married      

In [224]:
plot_distribution(campaign_calls_absoluted, "Campaign Calls")

## Previous Contact Days

In [225]:
plot_distribution(semi_cleaned_df, "Previous Contact Days")

Previous Contact Days is the number of days that passed since the client was last contacted in a previous campaign (999 means no prior contact)

- Describe distribution (Talk about how most have never been contacted before)
- Talk about why creating the boolean column is important (decision tree)

In [226]:
previous_contact_days_cleaned = removed_client_id.with_columns(
    pl.when(pl.col("Previous Contact Days") == -1)
    .then(True)
    .otherwise(False)
    .alias("Has been Contacted Previously")
)

print(previous_contact_days_cleaned)

shape: (41_188, 12)
┌─────┬─────────────┬────────────────┬─────────────────────┬────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┬───────────────────────────────┐
│ Age ┆ Occupation  ┆ Marital Status ┆ Education Level     ┆ Credit Default ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status ┆ Has been Contacted Previously │
│ --- ┆ ---         ┆ ---            ┆ ---                 ┆ ---            ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 ┆ ---                           │
│ i64 ┆ str         ┆ str            ┆ str                 ┆ str            ┆ str          ┆ str           ┆ str            ┆ i64            ┆ i64                   ┆ bool                ┆ bool                          │
╞═════╪═════════════╪════════════════╪═════════════════════╪════════════════╪══════════════╪════

In [227]:
plot_distribution(previous_contact_days_cleaned, "Has been Contacted Previously")

## Occupation

In [228]:
plot_distribution(previous_contact_days_cleaned, "Occupation")

Talk about occupation distribution and why its ok to drop the unknown (amount is very low 0.8%)

In [229]:
occupation_cleaned = previous_contact_days_cleaned.remove(
    pl.col("Occupation") == "unknown"
)

print(occupation_cleaned)

shape: (40_858, 12)
┌─────┬─────────────┬────────────────┬─────────────────────┬────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┬───────────────────────────────┐
│ Age ┆ Occupation  ┆ Marital Status ┆ Education Level     ┆ Credit Default ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status ┆ Has been Contacted Previously │
│ --- ┆ ---         ┆ ---            ┆ ---                 ┆ ---            ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 ┆ ---                           │
│ i64 ┆ str         ┆ str            ┆ str                 ┆ str            ┆ str          ┆ str           ┆ str            ┆ i64            ┆ i64                   ┆ bool                ┆ bool                          │
╞═════╪═════════════╪════════════════╪═════════════════════╪════════════════╪══════════════╪════

## Marital Status

In [230]:
plot_distribution(occupation_cleaned, "Marital Status")

In [231]:
marital_status_cleaned = occupation_cleaned.remove(pl.col("Marital Status") == "unkown")

print(marital_status_cleaned)

shape: (40_858, 12)
┌─────┬─────────────┬────────────────┬─────────────────────┬────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┬───────────────────────────────┐
│ Age ┆ Occupation  ┆ Marital Status ┆ Education Level     ┆ Credit Default ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status ┆ Has been Contacted Previously │
│ --- ┆ ---         ┆ ---            ┆ ---                 ┆ ---            ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 ┆ ---                           │
│ i64 ┆ str         ┆ str            ┆ str                 ┆ str            ┆ str          ┆ str           ┆ str            ┆ i64            ┆ i64                   ┆ bool                ┆ bool                          │
╞═════╪═════════════╪════════════════╪═════════════════════╪════════════════╪══════════════╪════

## Education Level

In [232]:
plot_distribution(marital_status_cleaned, "Education Level")

Talk about distribution and why it should be unchanged

## Credit Default

In [233]:
plot_distribution(marital_status_cleaned, "Credit Default")

Talk about how little the true count is, which shows that this entire column is quite useless, and therefore can be dropped.

In [234]:
credit_default_removed = marital_status_cleaned.drop("Credit Default")
print(credit_default_removed)

shape: (40_858, 11)
┌─────┬─────────────┬────────────────┬─────────────────────┬──────────────┬───────────────┬────────────────┬────────────────┬───────────────────────┬─────────────────────┬───────────────────────────────┐
│ Age ┆ Occupation  ┆ Marital Status ┆ Education Level     ┆ Housing Loan ┆ Personal Loan ┆ Contact Method ┆ Campaign Calls ┆ Previous Contact Days ┆ Subscription Status ┆ Has been Contacted Previously │
│ --- ┆ ---         ┆ ---            ┆ ---                 ┆ ---          ┆ ---           ┆ ---            ┆ ---            ┆ ---                   ┆ ---                 ┆ ---                           │
│ i64 ┆ str         ┆ str            ┆ str                 ┆ str          ┆ str           ┆ str            ┆ i64            ┆ i64                   ┆ bool                ┆ bool                          │
╞═════╪═════════════╪════════════════╪═════════════════════╪══════════════╪═══════════════╪════════════════╪════════════════╪═══════════════════════╪═══════════════

## Contact Method

In [240]:
plot_distribution(semi_cleaned_df, "Contact Method")

## Subscription Status


In [241]:
plot_distribution(semi_cleaned_df, "Subscription Status")

## Age

In [235]:
plot_distribution(semi_cleaned_df, "Age")

## Housing Loan

In [236]:
plot_distribution(semi_cleaned_df, "Housing Loan")

## Personal Loan

In [237]:
plot_distribution(semi_cleaned_df, "Personal Loan")