# Dissertation Experiment
# Dataset Visualisations
Ciaran Finnegan February 2024

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box Plots

In [2]:
def generate_box_plots(df, sTarget_feature,
                       sFeature_analysis_1,
                       sFeature_analysis_2,
                       sFeature_analysis_3,
                       sFeature3_ticklabel1,
                       sFeature3_ticklabel2):
    """
    Draws a 2x2 grid of plots relating three features to the target variable.

    Panels: count plot of the target (top-left), box plots of the first and
    second features split by the target (top-right, bottom-left), and a count
    plot of the third feature with the target as hue (bottom-right).

    Args:
    df (DataFrame): Data holding the target and feature columns.
    sTarget_feature (str): Name of the target column.
    sFeature_analysis_1 (str): Column drawn as a box plot in the top-right panel.
    sFeature_analysis_2 (str): Column drawn as a box plot in the bottom-left panel.
    sFeature_analysis_3 (str): Column drawn as a count plot in the bottom-right panel.
    sFeature3_ticklabel1 (str): First x tick label of the bottom-right panel.
    sFeature3_ticklabel2 (str): Second x tick label of the bottom-right panel.
    """
    # NOTE(review): the two-entry tick/legend labels assume a two-valued target
    # with the "Non" class ordered first, and a two-valued third feature -
    # confirm against the dataset.
    sTarget_upper = sTarget_feature.upper()
    lTarget_labels = ['Non ' + sTarget_upper, sTarget_upper]
    sStatus_suffix = ' Distribution by ' + sTarget_upper + ' Status'

    # Set up the figure and axes
    fig, ax = plt.subplots(2, 2, figsize=(14, 10))

    # Plot distribution of the dataset target variable
    sns.countplot(data=df, x=sTarget_feature, ax=ax[0, 0])
    ax[0, 0].set_title('Distribution of ' + sTarget_upper + ' Status')
    ax[0, 0].set_xticklabels(lTarget_labels)

    # Plot distribution of <feature one> based on target variable status.
    # The title is built from the feature name so it always matches the
    # column actually plotted (same convention as generate_cc_f_box_plots).
    sns.boxplot(data=df, x=sTarget_feature, y=sFeature_analysis_1, ax=ax[0, 1])
    ax[0, 1].set_title(sFeature_analysis_1.upper() + sStatus_suffix)
    ax[0, 1].set_xticklabels(lTarget_labels)

    # Plot distribution of <feature two> based on target variable status
    sns.boxplot(data=df, x=sTarget_feature, y=sFeature_analysis_2, ax=ax[1, 0])
    ax[1, 0].set_title(sFeature_analysis_2.upper() + sStatus_suffix)
    ax[1, 0].set_xticklabels(lTarget_labels)

    # Plot distribution of <feature three> based on target variable status
    sns.countplot(data=df, x=sFeature_analysis_3, hue=sTarget_feature, ax=ax[1, 1])
    ax[1, 1].set_title(sFeature_analysis_3.upper() + sStatus_suffix)
    ax[1, 1].set_xticklabels([sFeature3_ticklabel1, sFeature3_ticklabel2])
    ax[1, 1].legend(title=sTarget_upper + ' Status', labels=lTarget_labels)

    plt.tight_layout()
    plt.show()

In [3]:
def generate_cc_f_box_plots(df, sTarget_feature,
                       sFeature_analysis_1,
                       sFeature_analysis_2,
                       sFeature_analysis_3,
                       sFeature3_ticklabel1,
                       sFeature3_ticklabel2):
    """
    Draws a 2x2 grid of plots relating three features to the target variable.

    Panels: count plot of the target (top-left), box plot of the first feature
    split by the target (top-right), and count plots of the second and third
    features with the target as hue (bottom-left, bottom-right).

    Args:
    df (DataFrame): Data holding the target and feature columns.
    sTarget_feature (str): Name of the target column.
    sFeature_analysis_1 (str): Column drawn as a box plot in the top-right panel.
    sFeature_analysis_2 (str): Column drawn as a count plot in the bottom-left panel.
    sFeature_analysis_3 (str): Column drawn as a count plot in the bottom-right panel.
    sFeature3_ticklabel1 (str): First x tick label of the bottom-right panel.
    sFeature3_ticklabel2 (str): Second x tick label of the bottom-right panel.
    """
    # NOTE(review): the two-entry tick/legend labels assume a two-valued target
    # with the "Non" class ordered first, and a two-valued third feature -
    # confirm against the dataset.
    sTarget_upper = sTarget_feature.upper()
    lTarget_labels = ['Non ' + sTarget_upper, sTarget_upper]
    sStatus_suffix = ' Distribution by ' + sTarget_upper + ' Status'
    sLegend_title = sTarget_upper + ' Status'

    # Set up the figure and axes
    fig, ax = plt.subplots(2, 2, figsize=(14, 10))

    # Plot distribution of the dataset target variable
    sns.countplot(data=df, x=sTarget_feature, ax=ax[0, 0])
    ax[0, 0].set_title('Distribution of ' + sTarget_upper + ' Status')
    ax[0, 0].set_xticklabels(lTarget_labels)

    # Plot distribution of <feature one> based on target variable status
    sns.boxplot(data=df, x=sTarget_feature, y=sFeature_analysis_1, ax=ax[0, 1])
    ax[0, 1].set_title(sFeature_analysis_1.upper() + sStatus_suffix)
    ax[0, 1].set_xticklabels(lTarget_labels)

    # Plot distribution of <feature two> based on target variable status
    sns.countplot(data=df, x=sFeature_analysis_2, hue=sTarget_feature, ax=ax[1, 0])
    ax[1, 0].set_title(sFeature_analysis_2.upper() + sStatus_suffix)
    ax[1, 0].legend(title=sLegend_title, labels=lTarget_labels)
    # Label this panel through its own Axes: plt.xlabel / plt.ylabel act on the
    # figure's *current* Axes, which is not guaranteed to be ax[1, 0], so the
    # labels could land on (or be lost from) a different panel.
    ax[1, 0].set_xlabel(sFeature_analysis_2)
    ax[1, 0].set_ylabel('Count')

    # Plot distribution of <feature three> based on target variable status
    sns.countplot(data=df, x=sFeature_analysis_3, hue=sTarget_feature, ax=ax[1, 1])
    ax[1, 1].set_title(sFeature_analysis_3.upper() + sStatus_suffix)
    ax[1, 1].set_xticklabels([sFeature3_ticklabel1, sFeature3_ticklabel2])
    ax[1, 1].legend(title=sLegend_title, labels=lTarget_labels)

    plt.tight_layout()
    plt.show()

# HeatMaps

In [4]:
def generate_heatmap(df, df_desc):
    """
    Shows an annotated heatmap of the pairwise column correlations of a DataFrame.

    Args:
    df (DataFrame): The DataFrame whose `corr()` matrix is plotted.
    df_desc (str): Text appended to the plot title.
    """
    plt.figure(figsize=(15, 10))

    # Each cell is annotated with its coefficient, formatted to two decimals
    heatmap_options = {
        'cmap': 'coolwarm',
        'annot': True,
        'fmt': ".2f",
        'linewidths': .5,
    }
    correlations = df.corr()
    sns.heatmap(correlations, **heatmap_options)

    heading = "Correlation Heatmap for " + df_desc
    plt.title(heading)
    plt.show()

In [5]:
def plot_top_correlated_features(df, df_desc='DataFrame', top_n=15):
    """
    Generates a heatmap of the most strongly correlated features in the DataFrame.

    Feature pairs are ranked by absolute correlation; features are collected
    from the highest-ranked pairs downwards until `top_n` distinct features
    have been gathered, and the correlation matrix of those features is plotted.

    Args:
    df (DataFrame): The DataFrame to analyze.
    df_desc (str): Description of the DataFrame, used in the plot title.
    top_n (int): Maximum number of features to include. Default is 15.
    """
    # Imported locally so the function does not rely on a module-level
    # `np` that this file never imports.
    import numpy as np

    # Calculate correlation matrix
    corr = df.corr()

    # Keep only the strict upper triangle so each pair is counted once and
    # self-correlations are ignored. A plain `bool` dtype is used because the
    # `np.bool8` alias no longer exists in NumPy 2.x.
    upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    ranked_pairs = (corr.where(upper_mask)
                        .abs()
                        .unstack()
                        .dropna()
                        .sort_values(ascending=False, kind="mergesort"))

    # Walk the pairs from strongest to weakest, collecting distinct features.
    # (Reading `index.levels` instead would just return the level's labels,
    # ignoring the ranking entirely.)
    top_features = []
    for pair in ranked_pairs.index:
        for feature in pair:
            if feature not in top_features:
                top_features.append(feature)
        if len(top_features) >= top_n:
            break
    top_features = top_features[:top_n]

    # With fewer than two usable columns there are no pairs to rank;
    # fall back to whatever columns the correlation matrix has.
    if not top_features:
        top_features = list(corr.columns[:top_n])

    # Get the top correlated features' correlation matrix
    top_corr = corr.loc[top_features, top_features]

    # Use seaborn to generate a heatmap
    sns.set(style="white")
    plt.figure(figsize=(14, 10))
    plt.title(f'Top {top_n} Most Correlated Features in {df_desc}', fontsize=18)
    sns.heatmap(top_corr, annot=True, cmap='coolwarm', fmt=".2f", cbar_kws={'shrink': .5})
    plt.show()

# Data Distributions

In [6]:
def generate_distributions(df,
                       sFeature_analysis_1,
                       sFeature_analysis_2,
                       sFeature_analysis_3,
                       sFeature3_xlabel="Gender (1 = Male, 2 = Female)"):
    """
    Plots histograms for two continuous features and count plots for three
    categorical features.

    The first figure holds one 30-bin histogram per continuous feature. The
    second figure holds count plots of `sFeature_analysis_3` and of the
    "EDUCATION" and "MARRIAGE" columns, which `df` must therefore contain.

    Args:
    df (DataFrame): Data holding the columns to plot.
    sFeature_analysis_1 (str): Continuous column for the first histogram.
    sFeature_analysis_2 (str): Continuous column for the second histogram.
    sFeature_analysis_3 (str): Categorical column for the first count plot.
    sFeature3_xlabel (str): X-axis label for the `sFeature_analysis_3` count
        plot. Defaults to the gender description this label was previously
        fixed to; pass a different label when plotting another column.
    """
    # Plotting distributions for continuous features
    fig, ax = plt.subplots(1, 2, figsize=(15, 6))
    lContinuous_panels = [
        (sFeature_analysis_1, "skyblue"),
        (sFeature_analysis_2, "salmon"),
    ]
    for axis, (sFeature, sColour) in zip(ax, lContinuous_panels):
        sns.histplot(df[sFeature], bins=30, ax=axis, color=sColour)
        axis.set_title("Distribution of " + sFeature.upper())
        axis.set_xlabel(sFeature.upper())
        axis.set_ylabel("Count")

    plt.tight_layout()
    plt.show()

    # Plotting distributions for categorical features:
    # (column, panel title, x-axis label)
    lCategorical_panels = [
        (sFeature_analysis_3,
         "Distribution of " + sFeature_analysis_3.upper(),
         sFeature3_xlabel),
        ("EDUCATION", "Distribution of Education", "Education Level"),
        ("MARRIAGE", "Distribution of Marital Status", "Marital Status"),
    ]
    fig, ax = plt.subplots(1, 3, figsize=(18, 5))
    for axis, (sColumn, sTitle, sXlabel) in zip(ax, lCategorical_panels):
        sns.countplot(data=df, x=sColumn, ax=axis, palette="pastel")
        axis.set_title(sTitle)
        axis.set_xlabel(sXlabel)
        axis.set_ylabel("Count")

    plt.tight_layout()
    plt.show()

# Dataframe Display Functions

In [7]:
def styled_dataframe(df):
    """
    Renders a DataFrame as a styled HTML table inside a horizontally
    scrollable container and displays it.

    Args:
    df (DataFrame): The DataFrame to display.

    Returns:
    Whatever the `display` call returns.
    """
    # Table-level rules first, then centred header and body cells
    table_rules = [
        {
            'selector': 'table',
            'props': [('background-color', '#f4f4f4'),
                      ('color', '#000000'),
                      ('border-color', '#e0e0e0'),
                      ('border', '1px solid #e0e0e0'),
                      ('font-family', 'Arial, sans-serif'),
                      ('width', '100%'),
                      ('font-size', '12px'),
                      ('padding', '5px')],
        },
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': 'td', 'props': [('text-align', 'center')]},
    ]

    # Per-cell properties applied to every column
    cell_properties = {'min-width': '150px', 'padding': '5px'}

    styler = df.style.set_table_styles(table_rules)
    styler = styler.set_properties(subset=df.columns, **cell_properties)
    # Missing values are rendered as 'NA'
    styler = styler.format(None, na_rep='NA')

    # Wrap the generated table in a div so wide tables scroll sideways
    table_html = styler.to_html()
    wrapped_html = '<div style="width:100%; overflow-x:auto;">' + table_html + '</div>'

    return display(HTML(wrapped_html))

In [8]:
# Enhanced table styling using HTML and CSS.
# `xai_styles` is a string holding a single <style> element. Its rules render
# tables at 50% width with collapsed borders in Arial, give header cells a
# green background with white text, add a 1px border, 8px padding and left
# alignment to all cells, shade even rows and highlight the hovered row.
# NOTE(review): nothing in this section uses `xai_styles`; presumably it is
# prepended to HTML handed to IPython's HTML() elsewhere - confirm.
xai_styles = """
    <style>
        table {
            border-collapse: collapse;
            width: 50%;
            font-family: Arial, sans-serif;
        }
        th {
            background-color: #4CAF50;
            color: white;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        tr:nth-child(even) {
            background-color: #f2f2f2;
        }
        tr:hover {
            background-color: #ddd;
        }
    </style>
"""

In [9]:
from IPython.display import display, HTML

def display_with_scrollbar(df, height='300px'):
    """
    Show a dataframe inside a fixed-height, vertically scrollable container.

    Args:
    - df (pd.DataFrame): The dataframe to render.
    - height (str): CSS height of the container. Default is '300px'.
    """
    # Render the dataframe as an HTML table
    table_html = df.to_html()

    # Wrap the table in a div that scrolls vertically once the content
    # exceeds the requested height
    container_open = f'<div style="height: {height}; overflow-y: auto;">'
    scrollable_html = container_open + table_html + '</div>'

    display(HTML(scrollable_html))