## Dataset Transition Review (JUN - SEP Transition)

Process
- Review each of the monthly snapshots to include additional variables
- Combine loan level data to understand transitions
- Produce summary analysis metrics
- Export to Excel

In [None]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import scipy.stats as stats
import time
import sys
import polars as pl
import plotly.express as px
import os
import subprocess

In [None]:
# Notebook display and warning configuration
import warnings
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every bare expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Floats are shown with thousands separators and two decimals
pd.set_option('display.float_format', '{:,.2f}'.format)

# No cap on the number of columns displayed (None means no limit)
pd.set_option('display.max_columns', None)

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

In [None]:
# Key variables
# Column names handed to `usecols` for both CSV reads; '...' is a placeholder
# to be replaced with the real column list.
columns_to_import = ['...',] # select subset of columns
# Paths of the two snapshot files read into df_j and df_s
file_jun = 'file_jun.csv'
file_sep = 'file_sep.csv'
# Label embedded in the name of the exported Excel report
file_run = 'review'

In [None]:
# import data
# df_j = pd.read_csv(file_jun, encoding = "ISO-8859-1",) # review initial file to understand column names
# Both snapshots are read with identical encoding and column subset
df_j, df_s = (
    pd.read_csv(csv_path, encoding="ISO-8859-1", usecols=columns_to_import)
    for csv_path in (file_jun, file_sep)
)

In [None]:
# Summary detail on the imported datasets (dtypes, non-null counts, deep memory usage)
for _snapshot in (df_j, df_s):
    _snapshot.info(memory_usage='deep')

In [None]:
# Define a function to determine arrears status
def arrears_status(score):
    """Return the arrears band label that ``score`` falls into.

    Bands are tested from the highest lower bound downwards and the first
    bound that ``score`` reaches decides the label.  A value that reaches no
    bound (anything below 1, and NaN, which fails every comparison) is
    labelled "0".
    """
    bands = (
        (53, "53+"),
        (40, "40-52"),
        (27, "27-39"),
        (19, "19-26"),
        (10, "10-18"),
        (1, "1-9"),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return "0"

def coll_rates(score):
    """Return the rate associated with ``score``.

    Thresholds are tested from the highest downwards; the first one that
    ``score`` reaches supplies the rate.  Values reaching no threshold
    (below 1, or NaN) get the final fallback rate.
    """
    # NOTE(review): the top threshold is 26 while arrears_status starts its
    # "27-39" band at 27 - confirm which boundary is intended.
    rate_by_floor = (
        (26, 1),
        (19, 0.000),
        (10, 0.000),
        (1, 0.0000),
    )
    for floor, rate in rate_by_floor:
        if score >= floor:
            return rate
    return 0.0000

In [None]:
# Review with relation variable details
# Function used to clean imported data to allow for exploration.
# .assign : steps used to update variables
def tweak_jb(df):
    """Return a cleaned copy of a loan snapshot with derived analysis columns.

    Steps, in order:
      1. Replace spaces in every column name with underscores.
      2. Rename selected source columns to the names used by the rest of the
         notebook (the mapping below is partly elided with ``...``).
      3. Add derived columns via ``assign`` (see the inline comments).

    The input frame is not modified; ``rename`` and ``assign`` return new frames.

    NOTE(review): later cells read columns such as ``Net_bal_review`` and
    ``Write_Off`` that are not created in the visible part of this function -
    presumably they come from the elided rename entries or the source file;
    confirm.
    """
    return (
        df
        .rename(columns=lambda c:c.replace(' ','_'))
        # NOTE(review): the 'ï»¿' prefix looks like a UTF-8 byte-order mark decoded
        # as ISO-8859-1 - confirm against the raw file header.
        .rename(columns={'ï»¿LOAN_NO':'Loan_No',
                         'LOAN_PRINC':'Balance',
                         ...,
                   })
        # .loc[df.OPEN_FLAG == 'Y']  # Used to define loan open/closed details at timestamp.
        .assign(
                # Attached_Shares_check=lambda df_:df_[['Outstanding_Balance', 'TOTAL_SAVINGS', '...',]].min(axis=1).clip(lower=0),
                # Band label derived from WeeksInArrears by arrears_status
                Arrears_Band=lambda df_:df_.WeeksInArrears.apply(arrears_status),#.astype('category'),
                # Number of rows (loans) sharing the same Borrower_No
                Loan_Vol=lambda df_:df_.groupby('Borrower_No')['Borrower_No'].transform('size'),
                # Sum of Balance across all rows of the same Borrower_No
                Borrower_Out_Bal=lambda df_:df_.groupby('Borrower_No')['Balance'].transform('sum'),
                # Number of distinct Purpose values per Borrower_No
                Purpose_Count=lambda df_:df_.groupby(['Borrower_No'])['Purpose'].transform('nunique'),
                # 1 when the borrower's loan count differs from the distinct-purpose count, else 0
                Purpose_Match=lambda df_:np.where(df_.Loan_Vol != df_.Purpose_Count,1,0),
                # Rank 1 = largest Balance; tied balances share the lowest rank of the tie
                Rank_Balance=lambda df_: df_['Balance'].rank(ascending=False, method='min'),
                # Rate from coll_rates (driven by WeeksInArrears) multiplied by Balance
                Prov_coll=lambda df_:df_.WeeksInArrears.apply(coll_rates) * df_.Balance,
                # Prov_coll + ProvisionAmount, capped at the row's Balance (evaluated row by row)
                Prov_final=lambda df_:df_.apply(lambda row: min(sum(row[['Prov_coll', 'ProvisionAmount',]]), row.Balance), axis=1),
               )
        # .drop(columns=[...,])
    )

In [None]:
# Run function to create the updated DataFrames for analysis.
# Both snapshots go through the same cleaning step: df_j1 is the cleaned June
# data and df_s1 the cleaned September data, which the merge cell compares
# against df_j1.
df_j1 = tweak_jb(df_j)
df_s1 = tweak_jb(df_s)
df_j1.sample(5)

In [None]:
# Frequency of each Net_bal_review value in df_j1
df_j1['Net_bal_review'].value_counts()

# df_check = (
#     df_j1
#     .loc[df_j1.Net_bal_review == 0]
#     .sample(n=5, random_state=1)
# )

# df_check

In [None]:
# Count of non-null Loan_No per (Net_bal_review, Write_Off) combination
# (swap the column selection for .Balance.sum() to total balances instead)
df_check = df_j1.groupby(['Net_bal_review', 'Write_Off'])['Loan_No'].count()
df_check

In [None]:
# # Create a function to calculate all required statistics
# def calculate_statistics(group):
#     return pd.Series({
#         'count_balance': group['Balance'].count(),
#         'sum_balance': group['Balance'].sum(),
#         'mean_balance': group['Balance'].mean(),
#         'variance_balance': group['Balance'].var(ddof=0),  # Population variance
#         'min_balance': group['Balance'].min(),
#         'median_balance': group['Balance'].median(),
#         'max_balance': group['Balance'].max(),
#         'p5_balance': np.percentile(group['Balance'], 5),
#         'p95_balance': np.percentile(group['Balance'], 95),
#         'skewness_balance': stats.skew(group['Balance']),
#         'kurtosis_balance': stats.kurtosis(group['Balance']),
        
#         'count_shares': group['Attached_Shares'].count(),
#         'sum_shares': group['Attached_Shares'].sum(),
#         'mean_shares': group['Attached_Shares'].mean(),
#         'variance_shares': group['Attached_Shares'].var(ddof=0),  # Population variance
#         'min_shares': group['Attached_Shares'].min(),
#         'median_shares': group['Attached_Shares'].median(),
#         'max_shares': group['Attached_Shares'].max(),
#         'p5_shares': np.percentile(group['Attached_Shares'], 5),
#         'p95_shares': np.percentile(group['Attached_Shares'], 95),
#         'skewness_shares': stats.skew(group['Attached_Shares']),
#         'kurtosis_shares': stats.kurtosis(group['Attached_Shares']),
#     })

# # Group by the binary review flag and apply the statistics function
# df_summ_check = df_j1.groupby('Net_bal_review').apply(calculate_statistics).reset_index()

# # Display the summary DataFrame
# df_summ_check

In [None]:
# Top 10 Rankings
# Rows whose Rank_Balance is 10 or better, best rank first
df_top10_nob = df_j1[df_j1['Rank_Balance'].le(10)].sort_values(by='Rank_Balance')

df_top10_nob.shape
df_top10_nob

In [None]:
class DataMerger:
    """Merge two loan-level snapshots and label how each loan moved between them.

    ``df1`` receives ``suffix1`` (default ``_T0``) and ``df2`` receives
    ``suffix2`` (default ``_T1``) on overlapping columns.  Both frames must
    contain ``Balance``, ``Write_Off`` and ``Arrears_Band`` columns in
    addition to the merge keys.
    """

    def __init__(self, df1: pd.DataFrame, df2: pd.DataFrame):
        self.df1 = df1
        self.df2 = df2
        # Result of the most recent merge_dataframes() call (None until then)
        self.merged_df = None

    def merge_dataframes(self, on_columns: list, how: str = 'outer', suffix1: str = '_T0', suffix2: str = '_T1') -> pd.DataFrame:
        """
        Merges two DataFrames with specified parameters and adds status columns.

        Added / updated columns:
          - ``T0_Flag`` / ``T1_Flag``: 1 where the Balance from df1 / df2 is non-null, else 0.
          - ``Loan_Status``: 'Write_off' when Write_Off equals 1 on both sides; otherwise
            'New' (only df2 balance), 'Closed' (only df1 balance) or 'Stock' (both).
            Rows matching none of these are left as a real missing value (NaN).
          - ``Arrears_Band<suffix>``: missing values replaced with 'Unknown'.

        :param on_columns: list - A list of column names to merge on.
        :param how: str - The type of merge to be performed. Default is 'outer'.
        :param suffix1: str - Suffix to apply to overlapping columns in the first DataFrame. Default is '_T0'.
        :param suffix2: str - Suffix to apply to overlapping columns in the second DataFrame. Default is '_T1'.
        :return: pd.DataFrame - The merged DataFrame.
        """
        # Perform the merge
        df = pd.merge(
            self.df1,
            self.df2,
            how=how,
            on=on_columns,
            suffixes=(suffix1, suffix2)
        )

        # Column names follow the suffixes actually used for the merge, so
        # non-default suffixes work as well.
        bal_first, bal_second = f'Balance{suffix1}', f'Balance{suffix2}'
        wo_first, wo_second = f'Write_Off{suffix1}', f'Write_Off{suffix2}'
        band_first, band_second = f'Arrears_Band{suffix1}', f'Arrears_Band{suffix2}'

        # Flags for outstanding balances
        t0_flag = df[bal_first].notna().astype(int)
        t1_flag = df[bal_second].notna().astype(int)

        # Build the status on an object Series so that unmatched rows stay NaN.
        # (Mixing strings with np.nan inside np.where yields the text 'nan'.)
        # Assignments run from lowest to highest priority; later ones overwrite.
        loan_status = pd.Series(np.nan, index=df.index, dtype=object)
        loan_status[(t0_flag == 1) & (t1_flag == 1)] = 'Stock'
        loan_status[(t0_flag == 1) & (t1_flag == 0)] = 'Closed'
        loan_status[(t0_flag == 0) & (t1_flag == 1)] = 'New'
        loan_status[(df[wo_first] == 1) & (df[wo_second] == 1)] = 'Write_off'

        self.merged_df = df.assign(**{
            'T0_Flag': t0_flag,
            'T1_Flag': t1_flag,
            'Loan_Status': loan_status,
            band_first: df[band_first].fillna('Unknown'),    # Replace NaN with 'Unknown'
            band_second: df[band_second].fillna('Unknown'),  # Replace NaN with 'Unknown'
        })

        # Check that all loans from both DataFrames are present (only the
        # first merge key is compared; relevant for non-outer joins).
        key = on_columns[0]
        merged_loans = set(self.merged_df[key])

        missing_loans_df1_count = len(set(self.df1[key]) - merged_loans)
        missing_loans_df2_count = len(set(self.df2[key]) - merged_loans)

        if missing_loans_df1_count > 0:
            print(f"Count of missing loans from df1: {missing_loans_df1_count}")
        if missing_loans_df2_count > 0:
            print(f"Count of missing loans from df2: {missing_loans_df2_count}")

        return self.merged_df

# Merge the two cleaned snapshots on Loan_No using the default outer join and
# the default _T0 / _T1 suffixes, then preview the result.
# NOTE(review): df_s1 is presumably the September data run through the same
# cleaning as df_j1 - confirm it is defined before this cell runs.
df_merger = DataMerger(df_j1, df_s1)
df_loan = df_merger.merge_dataframes(on_columns=['Loan_No',])
df_loan.shape
df_loan.head()

In [None]:
# check for duplicates
# NOTE: is_unique is True when no Loan_No value repeats, so True here means
# "no duplicates found".
dups_check_l = df_loan.Loan_No.is_unique
dups_check_l

In [None]:
# Addition of KW parameter provides output for non-numeric features
# Transposed so that each df_loan column becomes one row of the description;
# the result is also written to the 'Loan_Description' sheet of the Excel export.
df_loan_description = df_loan.describe(include='all').T
df_loan_description 

In [None]:
# Summary table
# Named aggregations over Balance_T0 (plus a Loan_No count), computed per Loan_Status.
# The pandas built-ins (count, sum, mean, var, min, median, max) skip NaN on
# their own.  The percentile, skewness and kurtosis entries drop NaN explicitly
# so that every statistic in a row describes the same set of balances instead
# of turning into NaN as soon as one balance in the group is missing.
agg_dict = {
    'loan_count': ('Loan_No', 'count'),
    'count': ('Balance_T0', 'count'),
    'sum': ('Balance_T0', 'sum'),
    'mean': ('Balance_T0', 'mean'),
    'variance': ('Balance_T0', lambda x: x.var(ddof=0)),  # Population variance
    'min': ('Balance_T0', 'min'),
    'median': ('Balance_T0', 'median'),
    'max': ('Balance_T0', 'max'),
    'p5': ('Balance_T0', lambda x: x.quantile(0.05)),    # linear interpolation, NaN skipped
    'p95': ('Balance_T0', lambda x: x.quantile(0.95)),   # linear interpolation, NaN skipped
    'skewness': ('Balance_T0', lambda x: stats.skew(x.dropna())),
    'kurtosis': ('Balance_T0', lambda x: stats.kurtosis(x.dropna())),
}

# Create the summary DataFrame using groupby and the aggregation dictionary
df_summ = (
    df_loan
    .groupby(['Loan_Status'])
    .agg(**agg_dict)
    .reset_index()
)

# Display the summary DataFrame
df_summ

In [None]:
# Split aggregation dictionaries for T0 and T1
agg_dict_T0 = {
    'loan_no_T0': ('Loan_No', 'count'),
    'count_T0': ('Balance_T0', 'count'),
    'sum_ob_T0': ('Balance_T0', 'sum'),
    'sum_nb_T0': ('Net_Outstanding_Balance_T0', 'sum'),
    'sum_prov_T0': ('Prov_final_T0', 'sum'),
    'mean_pc_T0': ('Provision_pct_T0', 'mean'),
}

agg_dict_T1 = {
    'loan_no_T1': ('Loan_No', 'count'),
    'count_T1': ('Balance_T1', 'count'),
    'sum_ob_T1': ('Balance_T1', 'sum'),
    'sum_nb_T1': ('Net_Outstanding_Balance_T1', 'sum'),
    'sum_prov_T1': ('Prov_final_T1', 'sum'),
    'mean_pc_T1': ('Provision_pct_T1', 'mean'),
}

agg_dict_combined = {**agg_dict_T0, **agg_dict_T1}


def _grouped_summary(frame, keys, aggs, positive_any):
    """Group ``frame`` by ``keys`` (NaN keys kept), apply the named aggregations
    in ``aggs`` and keep the rows where at least one ``positive_any`` column is > 0."""
    summary = frame.groupby(keys, dropna=False).agg(**aggs).reset_index()
    keep = (summary[list(positive_any)] > 0).any(axis=1)
    return summary.loc[keep]


# NOTE(review): every summary below is filtered on the Loan_No counts
# (loan_no_T0 / loan_no_T1), not on the balance counts (count_T0 / count_T1).
# Groups made up only of loans without a T0 (or T1) balance therefore remain -
# confirm that this is the intended filter column.
_status_keys = ['Loan_Status', 'Covered_Loans_T0', 'Covered_Loans_T1']

# Combined T0 + T1 summaries
df_combined_summ = _grouped_summary(
    df_loan, _status_keys, agg_dict_combined, ['loan_no_T0', 'loan_no_T1'])
df_combined_summ_a = _grouped_summary(
    df_loan, _status_keys + ['Arrears_Band_T0', 'Arrears_Band_T1'],
    agg_dict_combined, ['loan_no_T0', 'loan_no_T1'])

# T0 summaries by arrears band
df_summ_art0a = _grouped_summary(
    df_loan, ['Covered_Loans_T0', 'Arrears_Band_T0'], agg_dict_T0, ['loan_no_T0'])
df_summ_art0b = _grouped_summary(
    df_loan, _status_keys + ['Arrears_Band_T0'], agg_dict_T0, ['loan_no_T0'])

# T1 summaries by arrears band
df_summ_art1a = _grouped_summary(
    df_loan, ['Covered_Loans_T1', 'Arrears_Band_T1'], agg_dict_T1, ['loan_no_T1'])
df_summ_art1b = _grouped_summary(
    df_loan, _status_keys + ['Arrears_Band_T1'], agg_dict_T1, ['loan_no_T1'])

# Export every summary, the description table and the loan-level data to one
# Excel file.  The file name is built once so the file written and the message
# printed always agree, even if the date changes while the export is running.
report_path = f'summary_report_{file_run}_{pd.to_datetime("today").date()}.xlsx'

_summary_sheets = {
    'Summary': df_combined_summ,
    'Summary_all': df_combined_summ_a,
    'Arrears_Band_T0_Summary': df_summ_art0a,
    'Arrears_Band_T0_Summaryb': df_summ_art0b,
    'Arrears_Band_T1_Summary': df_summ_art1a,
    'Arrears_Band_T1_Summaryb': df_summ_art1b,
}

with pd.ExcelWriter(report_path) as writer:
    for _sheet_name, _sheet_df in _summary_sheets.items():
        _sheet_df.to_excel(writer, sheet_name=_sheet_name, index=False)

    # Add the DataFrame description and original DataFrame to new sheets
    df_loan_description.to_excel(writer, sheet_name='Loan_Description', index=True)
    df_loan.to_excel(writer, sheet_name='Loan_Data', index=False)

print(f'DataFrames exported to {report_path}')

In [None]:
# Transition Rates
# No row filter is active at the moment; the commented condition below can be
# re-applied to restrict the loans considered.
df_loan_filtered = df_loan
# .loc[(df_loan.Arrears_Band_T0 != '0') | (df_loan.Arrears_Band_T1 != '0')]

# Loan counts per (Loan_Status, Arrears_Band_T0) row and Arrears_Band_T1
# column, normalised so that every row sums to 1
_transition_matrix = pd.crosstab(
    index=[df_loan_filtered['Loan_Status'], df_loan_filtered['Arrears_Band_T0']],
    columns=df_loan_filtered['Arrears_Band_T1'],
    values=df_loan_filtered['Loan_No'],
    aggfunc='count',
    normalize='index',
)

# Long format: one row per (status, T0 band, T1 band) with its rate
df_rev_t0 = (
    _transition_matrix
    .stack()
    .reset_index()
    .rename(columns={0: 'transRate'})
)
df_rev_t0

In [None]:
# Create all combinations of T0_Flag and T1_Flag
# status = df_rev_t0['Loan_Status'].unique()
# arr_flags = df_rev_t0['Arrears_Band_T0'].unique()
# all_combinations = pd.MultiIndex.from_product(
#     [status, arr_flags], 
#     names=['status', 'Arr_Flag']
# )

# Pivot table for visualization: rows are (Loan_Status, Arrears_Band_T0),
# columns are Arrears_Band_T1 and the cells hold the transition rate.
heatmap_data = df_rev_t0.pivot_table(
    index=['Loan_Status','Arrears_Band_T0'], 
    columns='Arrears_Band_T1', 
    values='transRate',
    # fill_value=0  # Fill missing values with 0
)

# Visualizing the transRate in a heatmap.
# Title and axis labels name what is actually plotted: transition rates from
# the T0 arrears band (within each loan status) to the T1 arrears band.
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', cbar=True)
plt.title('Heatmap of Transition Rates by Loan Status and Arrears Band')
plt.xlabel('Arrears Band T1')
plt.ylabel('Loan Status / Arrears Band T0')
plt.show()

In [None]:
# Reviewing movements
def var_movement(df, var_cm, var_pm, loan_no='Loan_No', exclude_zeros=False, exclude_negatives=False):
    """
    Analyzes movements between two variables and identifies significant changes.

    The movement ``val_move`` is computed as ``df[var_cm] - df[var_pm]`` on a copy of
    the data, so the DataFrame passed in is left unchanged.  Rows can optionally be
    excluded before the mean and standard deviation of the movement are calculated.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the data.
    var_cm (str): The name of the column representing the current value.
    var_pm (str): The name of the column representing the previous value.
    loan_no (str, optional): The name of the column representing loan numbers. Defaults to 'Loan_No'.
    exclude_zeros (bool, optional): If True, excludes rows where var_cm or var_pm are zero. Defaults to False.
    exclude_negatives (bool, optional): If True, keeps only rows where var_cm and var_pm are both
        greater than or equal to zero (rows with a missing value in either are dropped too). Defaults to False.

    Returns:
    tuple: Two DataFrames (columns: loan_no, 'val_move', var_cm, var_pm; rows with missing
    values removed; sorted by 'val_move'):
        - significant_movements_one_sd: Movements more than one standard deviation from the mean.
        - significant_movements_two_sd: Movements more than two standard deviations from the mean.

    Visuals:
    Displays a histogram of the movements with lines indicating one and two standard deviations,
    as well as a QQ plot to assess the normality of the movements.  The plots are skipped when
    there are too few movements to compute a standard deviation.
    """
    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain (or have overwritten) a 'val_move' column.
    data = df.assign(val_move=df[var_cm] - df[var_pm])

    # Optionally exclude rows where var_cm or var_pm are zero
    if exclude_zeros:
        data = data[(data[var_cm] != 0) & (data[var_pm] != 0)]

    # Optionally exclude rows where var_cm or var_pm less than zero
    if exclude_negatives:
        data = data[(data[var_cm] >= 0) & (data[var_pm] >= 0)]

    # Calculate mean and standard deviation (missing movements are skipped)
    mean = data['val_move'].mean()
    std_dev = data['val_move'].std()

    print(f'Mean: {mean}, Standard Deviation: {std_dev}')

    report_columns = [loan_no, 'val_move', var_cm, var_pm]

    def _beyond(n_sd):
        """Rows whose movement lies more than ``n_sd`` standard deviations from the mean."""
        outside = (
            (data['val_move'] > (mean + n_sd * std_dev))
            | (data['val_move'] < (mean - n_sd * std_dev))
        )
        return data.loc[outside, report_columns].dropna().sort_values(by='val_move')

    significant_movements_one_sd = _beyond(1)
    significant_movements_two_sd = _beyond(2)

    # Missing movements (e.g. a loan present on one side only) must not reach
    # the plots: probplot cannot fit a line through NaN values.
    moves = data['val_move'].dropna()
    if moves.empty or not np.isfinite(std_dev):
        # Axis limits and SD lines are undefined without a standard deviation
        print('Not enough data to plot the movement distribution.')
        return significant_movements_one_sd, significant_movements_two_sd

    # Plotting
    plt.figure(figsize=(12, 6))

    # Histogram and KDE
    sns.histplot(moves, bins=30, kde=True, color='lightgray', label='Distribution of val_move', stat='density')

    # Limit the x-axis for better visualization (only meaningful for a non-zero spread)
    if std_dev > 0:
        plt.xlim(mean - 4 * std_dev, mean + 4 * std_dev)

    # Add vertical lines for one and two standard deviations
    plt.axvline(mean + std_dev, color='blue', linestyle='--', label='Mean + 1 SD')
    plt.axvline(mean - std_dev, color='blue', linestyle='--', label='Mean - 1 SD')
    plt.axvline(mean + 2 * std_dev, color='red', linestyle='--', label='Mean + 2 SD')
    plt.axvline(mean - 2 * std_dev, color='red', linestyle='--', label='Mean - 2 SD')

    plt.title('Histogram of val_move with Standard Deviations')
    plt.legend()
    plt.tight_layout()

    # QQ Plot
    plt.figure(figsize=(12, 6))
    stats.probplot(moves, dist="norm", plot=plt)
    plt.title('QQ Plot of val_move')

    plt.tight_layout()
    plt.show()

    # Return the significant movements DataFrames
    return significant_movements_one_sd, significant_movements_two_sd

# Example usage:
# df = pd.read_csv('your_data.csv')
# one_sd_df, two_sd_df = var_movement(df, 'val_cm', 'val_pm', exclude_zeros=True)

In [None]:
# Net Outstanding Balance
# Movement from Net_Balance_T0 to Net_Balance_T1, ignoring loans where either value is zero.
# NOTE(review): the Excel summary cell aggregates 'Net_Outstanding_Balance_T0/T1'
# while this cell reads 'Net_Balance_T0/T1' - confirm both column names exist in df_loan.
one_sd_df, two_sd_df = var_movement(df_loan, 'Net_Balance_T1', 'Net_Balance_T0', exclude_zeros=True)

# Size and content of the > 1 SD movements
one_sd_df.shape
one_sd_df

# Size and content of the > 2 SD movements
two_sd_df.shape
two_sd_df

In [None]:
# Provision Amount
# Movement from Prov_final_T0 to Prov_final_T1, ignoring loans where either value is zero.
# Reuses (overwrites) one_sd_df / two_sd_df from the previous cell.
one_sd_df, two_sd_df = var_movement(df_loan, 'Prov_final_T1', 'Prov_final_T0', exclude_zeros=True)

# Size and content of the > 1 SD movements
one_sd_df.shape
one_sd_df

# Size and content of the > 2 SD movements
two_sd_df.shape
two_sd_df

In [None]:
# Weeks in Arrears
# Movement from WeeksInArrears_T0 to WeeksInArrears_T1, keeping only loans where
# both values are >= 0 (zeros are retained here, unlike the two cells above).
# Reuses (overwrites) one_sd_df / two_sd_df from the previous cell.
one_sd_df, two_sd_df = var_movement(df_loan, 'WeeksInArrears_T1', 'WeeksInArrears_T0', exclude_negatives=True)

# Size and content of the > 1 SD movements
one_sd_df.shape
one_sd_df

# Size and content of the > 2 SD movements
two_sd_df.shape
two_sd_df

In [None]:
# Decile review
def decile_summary(df, variable, additional_variable=None):
    """
    Produces a decile summary report for a specified variable in a DataFrame,
    with an optional additional variable to show values associated with the decile.

    Bin edges are the 0th, 10th, ..., 100th percentiles of the non-missing values.
    When several percentiles coincide (heavily tied data, e.g. many zero values)
    the repeated edges are collapsed, so fewer than ten bins are returned and they
    are numbered consecutively from 1.  The input DataFrame is not modified.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the data.
    variable (str): The name of the column for which to calculate deciles.
    additional_variable (str, optional): An additional variable to be summarized with respect to the deciles.

    Returns:
    pd.DataFrame: One row per bin with a 'Decile' label column plus Count, Sum, Mean,
    Std_Dev, Min and Max of ``variable`` (and, if requested, the count of values > 0,
    mean, sum and standard deviation of ``additional_variable``).
    """
    # Calculate the decile edges; np.unique removes repeated edges, which
    # pd.cut would otherwise reject.
    values = df[variable].dropna()
    if len(values):
        edges = np.unique(np.percentile(values, np.arange(0, 101, 10)))
    else:
        edges = np.array([])

    if len(edges) >= 2:
        decile = pd.cut(
            df[variable],
            bins=edges,
            include_lowest=True,
            labels=np.arange(1, len(edges)),
        )
    else:
        # Constant (or empty) column: every non-missing row falls in a single bin
        codes = np.where(df[variable].notna(), 0, -1)
        decile = pd.Categorical.from_codes(codes, categories=[1])

    # assign() works on a copy, so the caller's frame gains no 'Decile' column
    binned = df.assign(Decile=decile)

    # Prepare aggregation dictionaries
    agg_dict = {
        'Count': ('Decile', 'size'),
        'Sum': (variable, 'sum'),
        'Mean': (variable, 'mean'),
        'Std_Dev': (variable, 'std'),
        'Min': (variable, 'min'),
        'Max': (variable, 'max'),
    }

    # If an additional variable is provided, add it to the aggregation
    if additional_variable:
        agg_dict[f'Count_GT_0_{additional_variable}'] = (additional_variable, lambda x: (x > 0).sum())
        agg_dict[f'Mean_{additional_variable}'] = (additional_variable, 'mean')
        agg_dict[f'Sum_{additional_variable}'] = (additional_variable, 'sum')
        agg_dict[f'Std_Dev_{additional_variable}'] = (additional_variable, 'std')

    # Group by Decile and calculate summary statistics; observed=False keeps
    # every bin in the output even if it ends up empty.
    summary = binned.groupby('Decile', observed=False).agg(**agg_dict).reset_index()

    return summary

# Example usage:
# Sample DataFrame with multiple variables
# Get the decile summary with an additional variable
# decile_report = decile_summary(df, 'val_move', additional_variable='another_variable')
# print(decile_report)

In [None]:
# Decile bins of Balance_T1, with Prov_final_T1 summarised inside each bin
decile_report_1 = decile_summary(df_loan, 'Balance_T1', 'Prov_final_T1')
decile_report_1

In [None]:
# Function to create scatter graphs with and without filtering
def create_scatter_graph(df, x_variable, y_variable, z_variable, filter_values=None):
    """
    Creates scatter graphs for the specified x and y variables, with the z variable
    determining the color and marker style of the points.

    Without ``filter_values`` a single full-width plot of all data is shown.  With
    ``filter_values`` a second panel is added that leaves out the rows whose z value
    is in the list.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the data.
    x_variable (str): The name of the column for the x-axis.
    y_variable (str): The name of the column for the y-axis.
    z_variable (str): The name of the column for coloring the points.
    filter_values (list, optional): A list of values in the z variable to filter out.
        Values of any type are accepted.
    """
    # Only reserve room for the second panel when it will actually be drawn
    n_panels = 2 if filter_values else 1

    def _draw_panel(position, data, title):
        """Draw one scatter panel at ``position`` of the 1 x n_panels grid."""
        plt.subplot(1, n_panels, position)
        sns.scatterplot(data=data, x=x_variable, y=y_variable, hue=z_variable, palette='viridis', style=z_variable)
        plt.title(title)
        plt.xlabel(x_variable)
        plt.ylabel(y_variable)
        plt.legend(title=z_variable)
        plt.grid(True)

    plt.figure(figsize=(12, 6))

    # Plot all data
    _draw_panel(1, df, 'Scatter Graph of Balance vs Provision Amount (All Data)')

    # Plot filtered data if filter_values is provided
    if filter_values:
        filtered_df = df[~df[z_variable].isin(filter_values)]
        # str() each value so that non-string filter values do not break join()
        excluded = ", ".join(map(str, filter_values))
        _draw_panel(2, filtered_df, f'Scatter Graph (Excluding {excluded})')

    plt.tight_layout()
    plt.show()

# Balance_T0 against Prov_final_T0, coloured by Arrears_Band_T0; the second
# panel leaves out the '0' and '1-9' bands.
create_scatter_graph(df_loan, 'Balance_T0', 'Prov_final_T0', 'Arrears_Band_T0', filter_values=['0','1-9'])


In [None]:
# Function to create scatter graphs with filtering and colored bounding boxes
def create_scatter_graph_p(df, x_variable, y_variable, z_variable, filter_values=None):
    """
    Creates scatter graphs for the specified x and y variables, with the z variable
    determining the color and marker style of the points.

    Without ``filter_values`` a single full-width plot of all data is shown.  With
    ``filter_values`` a second panel is added that leaves out the rows whose z value
    is in the list and draws a dashed bounding box around each remaining cohort.

    Every z value is given one fixed color, used for its points in both panels and
    for its bounding box, so a cohort always looks the same.  Because an explicit
    value-to-color mapping is used, z is always treated as categorical.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the data.
    x_variable (str): The name of the column for the x-axis.
    y_variable (str): The name of the column for the y-axis.
    z_variable (str): The name of the column for coloring the points.
    filter_values (list, optional): A list of values in the z variable to filter out.
        Values of any type are accepted.
    """
    # One fixed color per cohort, in order of first appearance.  Passing this
    # mapping to every plot keeps points and boxes in agreement; a palette
    # name alone is re-sampled for however many cohorts each panel contains.
    cohorts = list(df[z_variable].dropna().unique())
    color_of = dict(zip(cohorts, sns.color_palette("viridis", len(cohorts))))

    # Only reserve room for the second panel when it will actually be drawn
    n_panels = 2 if filter_values else 1

    def _draw_panel(position, data, title):
        """Draw one scatter panel at ``position`` of the 1 x n_panels grid."""
        plt.subplot(1, n_panels, position)
        sns.scatterplot(data=data, x=x_variable, y=y_variable, hue=z_variable, palette=color_of, style=z_variable)
        plt.title(title)
        plt.xlabel(x_variable)
        plt.ylabel(y_variable)
        plt.legend(title=z_variable)
        plt.grid(True)

    plt.figure(figsize=(12, 6))

    # Plot all data
    _draw_panel(1, df, 'Scatter Graph of Balance vs Provision Amount (All Data)')

    # Plot filtered data if filter_values is provided
    if filter_values:
        filtered_df = df[~df[z_variable].isin(filter_values)]
        # str() each value so that non-string filter values do not break join()
        excluded = ", ".join(map(str, filter_values))
        _draw_panel(2, filtered_df, f'Scatter Graph (Excluding {excluded})')

        # Draw bounding boxes around the cohorts that remain after filtering
        axis = plt.gca()
        for cohort in cohorts:
            cohort_data = filtered_df[filtered_df[z_variable] == cohort]
            if cohort_data.empty:
                continue  # filtered out, nothing to enclose
            x_min, x_max = cohort_data[x_variable].min(), cohort_data[x_variable].max()
            y_min, y_max = cohort_data[y_variable].min(), cohort_data[y_variable].max()

            # Rectangle spanning the cohort's extent, in the cohort's own color
            rect = patches.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min,
                                     linewidth=2, edgecolor=color_of[cohort], facecolor='none', linestyle='--')
            axis.add_patch(rect)

    plt.tight_layout()
    plt.show()

# Balance_T0 against Prov_final_T0 with cohort bounding boxes, coloured by
# Arrears_Band_T0: first leaving out only the '0' band, then leaving out both
# the '0' and '1-9' bands.
create_scatter_graph_p(df_loan, 'Balance_T0', 'Prov_final_T0', 'Arrears_Band_T0', filter_values=['0',])
create_scatter_graph_p(df_loan, 'Balance_T0', 'Prov_final_T0', 'Arrears_Band_T0', filter_values=['0','1-9'])