In [1]:
# Standard libraries
import math
import warnings

# Data manipulation libraries
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Machine learning
from sklearn.metrics import roc_curve, precision_recall_curve


warnings.filterwarnings('ignore')

In [None]:
# Before working with the dataset, make the column names and some values consistent.
# A forward slash is used as the path separator so this relative path resolves on
# every OS (Windows accepts it too), whereas a backslash only works on Windows.

df = pd.read_csv('Datasets/Stroke_Dataset.csv')

def rename_columns(df):
    """Return a renamed copy of ``df`` using the notebook's Title_Case column names.

    Columns that do not appear in the mapping keep their current name, and the
    frame passed in is left unmodified (``DataFrame.rename`` returns a new frame).
    """
    # (raw name, display name) pairs
    name_pairs = [
        ("id", "Patient_ID"),
        ("gender", "Gender"),
        ("age", "Age"),
        ("hypertension", "Hypertension"),
        ("heart_disease", "Heart_Disease"),
        ("ever_married", "Ever_Married"),
        ("work_type", "Work_Type"),
        ("Residence_type", "Residence_Type"),
        ("avg_glucose_level", "Avg_Glucose_Level"),
        ("bmi", "BMI"),
        ("smoking_status", "Smoking_Status"),
        ("stroke", "Stroke"),
    ]
    return df.rename(columns=dict(name_pairs))

def replace_binary_values(df, column):
    """Recode the values 1 and 0 in ``column`` as "Yes" and "No".

    The column is overwritten on the frame that was passed in, and that same
    frame is returned so the call can be used on the right of an assignment.
    """
    yes_no_labels = {1: "Yes", 0: "No"}
    recoded = df[column].replace(yes_no_labels)
    df[column] = recoded
    return df

df = rename_columns(df)  # Rename columns

# Recode the 0/1 indicator columns as "Yes"/"No" labels
for binary_column in ("Hypertension", "Heart_Disease", "Stroke"):
    df = replace_binary_values(df, binary_column)

# Capitalise the smoking categories; any value not listed here is left as it is
df['Smoking_Status'] = df['Smoking_Status'].replace({
    'formerly smoked': 'Formerly Smoked',
    'never smoked': 'Never Smoked',
    'smokes': 'Smokes'
})

# NOTE: astype(int) drops the fractional part if Age was read as a float column
df['Age'] = df['Age'].astype(int)

# Save as a Parquet file. A forward slash keeps the relative path portable:
# a backslash is only treated as a directory separator on Windows.
df.to_parquet("Datasets/Stroke_Dataset.parquet", index=False)


In [None]:
# Loads the dataset and reports its basic shape (load errors are printed, not raised)
def Load_Dataset(filePath):
    """Read a Parquet file into a DataFrame and print its dimensions.

    Args:
        filePath: Location of the Parquet file, passed to ``pd.read_parquet``.

    Returns:
        The loaded DataFrame, or None when the file is missing or any other
        exception occurs while loading (an error message is printed instead).
    """
    try:
        loaded = pd.read_parquet(filePath)
        row_count, column_count = loaded.shape[0], loaded.shape[1]
        print(f"Dataset loaded successfully with {row_count} rows and {column_count} columns")
        print("DataFrame Shape: ", loaded.shape)
        return loaded
    except FileNotFoundError:
        # Missing file gets its own, more specific message
        print(f"Error: File not found at {filePath}")
        return None
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

In [None]:
def print_list(list):
    """Display the items of ``list`` as a single-row table.

    Column headers are the 1-based positions of the items ("1", "2", ...),
    and both cells and headers are centre-aligned.
    """
    # Headers are the item positions, starting at 1
    position_headers = [str(position) for position in range(1, len(list) + 1)]
    df_summary = pd.DataFrame([list], columns=position_headers)

    # Centre the cell text, then the header text
    header_rule = {'selector': 'th', 'props': [('text-align', 'center')]}
    styled = df_summary.style.set_properties(**{'text-align': 'center'})
    styled = styled.set_table_styles([header_rule])
    display(styled)

In [None]:
def df_info(dataframe):
    """Display a two-row summary of ``dataframe``, one column per original column.

    Row "Data_type" holds each column's dtype as a string; row "Non_Null"
    holds its count of non-null values.
    """
    per_column = {}
    for column_name in dataframe.columns:
        series = dataframe[column_name]
        per_column[column_name] = [str(series.dtype), series.notnull().sum()]

    df_summary = pd.DataFrame(per_column, index=["Data_type", "Non_Null"])

    # Centre-align both the cells and the header cells
    header_rule = {'selector': 'th', 'props': [('text-align', 'center')]}
    styled = df_summary.style.set_properties(**{'text-align': 'center'})
    styled = styled.set_table_styles([header_rule])
    display(styled)

In [None]:
# Shows the number of unique values per column in a well-formatted table

def df_unique_counts(dataframe):
    """Display a one-row table with the number of distinct values in each column."""
    # nunique() gives one value per column; transpose so the columns stay as columns
    unique_counts = dataframe.nunique()
    df_summary = unique_counts.to_frame().T

    header_rule = {'selector': 'th', 'props': [('text-align', 'center')]}
    styled = df_summary.style.set_properties(**{'text-align': 'center'})
    styled = styled.set_table_styles([header_rule])
    display(styled)

In [None]:
# Displays a feature-importance frame as a left-aligned table (meant to be shown after the plot)
def df_feature_importances(dataframe):
    """Display the transpose of ``dataframe`` with left-aligned cells and headers."""
    header_rule = {'selector': 'th', 'props': [('text-align', 'left')]}
    transposed = dataframe.T
    styled = transposed.style.set_properties(**{'text-align': 'left'})
    styled = styled.set_table_styles([header_rule])
    display(styled)

In [4]:
# Display model metrics in a table
def display_metrics(metrics, model_name="Model"):
    """Show a captioned one-row table of metric values formatted to two decimals.

    Args:
        metrics: Mapping of metric name to a numeric value; each entry becomes
            one column of the table.
        model_name: Name used in the table caption.
    """
    # One column per metric, a single row labelled with a blank string
    df_metrics = pd.DataFrame({name: [score] for name, score in metrics.items()})
    df_metrics.index = [' ']

    # Turn every value into a fixed two-decimal string
    two_decimals = "{:.2f}".format
    df_metrics = df_metrics.apply(lambda column: column.map(two_decimals))

    # Minimal styling: caption plus centred text
    caption = f"{model_name} Performance Metrics"
    styled_df = df_metrics.style.set_caption(caption)
    styled_df = styled_df.set_properties(**{'text-align': 'center'})

    display(styled_df)

#### Apply style to all plots

##### General styles and settings that are applied to every plot in this notebook for consistency

In [3]:
def visualization(fig_size=(10, 6)):
    """Apply the notebook-wide plot styling to matplotlib's global rcParams.

    Args:
        fig_size: Default figure size as (width, height) in inches.

    The seaborn style and context are applied first and the explicit rcParams
    afterwards. seaborn's presets write to several of the same rcParams
    (spines, grid, edge/tick colours, font sizes), so calling them last would
    silently undo the custom values set here.

    No figure is created or modified: this function only changes defaults for
    figures created later.
    """
    # 'petroff10' is not bundled with older matplotlib releases; skip it when
    # unavailable (the colour cycle is replaced below in any case).
    if 'petroff10' in plt.style.available:
        plt.style.use('petroff10')

    # seaborn presets go first so the explicit settings below take precedence
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1)

    # Color Palette
    color_palette = [
        "#cdb4db",
        "#ffc8dd",
        "#ffafcc",
        "#bde0fe",
        "#a2d2ff",
        "#fbc3bc",
        "#d4a5a5",
        "#9cadce",
        "#99c1b9",
        "#f7d6e0",
    ]

    light_bg = '#ffffff'
    light_panel = '#f7f7f7'

    plt.rcParams.update({
        # Default colour cycle for all plots
        'axes.prop_cycle': plt.cycler(color=color_palette),

        # Figure
        'figure.facecolor': light_bg,
        'figure.figsize': fig_size,
        'figure.dpi': 120,

        # Axes
        'axes.facecolor': light_bg,
        'axes.edgecolor': '#cccccc',
        'axes.labelcolor': '#333333',
        'axes.axisbelow': True,
        'axes.grid': True,
        'axes.titleweight': 'bold',

        # Remove top and right spines for a cleaner look
        'axes.spines.top': False,
        'axes.spines.right': False,

        # Grid
        'grid.color': '#e0e0e0',
        'grid.alpha': 0.5,
        'grid.linestyle': '--',

        # Ticks
        'xtick.color': '#555555',
        'ytick.color': '#555555',
        'xtick.labelsize': 10,
        'ytick.labelsize': 10,

        # Fonts
        'font.family': 'sans-serif',
        'font.size': 12,
        'axes.titlesize': 16,
        'axes.labelsize': 12,

        # Legend
        'legend.facecolor': light_panel,
        'legend.edgecolor': '#dddddd',
        'legend.fontsize': 10,
        'legend.framealpha': 0.8,
    })

    # No plt.tight_layout() here: with no figure open it would create an empty
    # one, which then shows up as a blank output at the next plt.show().

In [2]:
# I've created a single function to create subplots for bar & pie..

def generate_subplots(plot, df, column_list, columns=3, fig_size=(18, 5)):
    """Draw one value-count chart per column, arranged in a grid of subplots.

    Args:
        plot: 'bar' or 'pie'. Any other value prints a message and returns.
        df: DataFrame that holds the columns to plot.
        column_list: Column names to plot, one subplot each.
        columns: Number of subplots per grid row.
        fig_size: (width, height of one grid row) in inches; the total figure
            height is the row height times the number of rows.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if plot not in ('bar', 'pie'):
        print("Invalid Argument:", plot)
        return

    n_features = len(column_list)
    if n_features == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw
        print("No columns to plot")
        return

    visualization(fig_size)  # Applies the shared plot styling

    n_cols = columns
    n_rows = math.ceil(n_features / n_cols)

    # squeeze=False always returns a 2-D array of axes, so a 1x1 grid needs no
    # special case before flattening.
    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(fig_size[0], fig_size[1] * n_rows),
        squeeze=False,
    )
    axes = axes.ravel()

    for ax, feature in zip(axes, column_list):
        value_counts = df[feature].value_counts()

        # One 'tab10' colour per category. pyplot.get_cmap is used because
        # matplotlib.cm.get_cmap no longer exists in recent matplotlib releases.
        colors = plt.get_cmap('tab10', len(value_counts)).colors

        if plot == 'bar':
            # String categories place the bars at positions 0..n-1, which is
            # where the count labels below are drawn, even for numeric values.
            categories = value_counts.index.astype(str)
            ax.bar(categories, value_counts.values, color=colors)
            ax.set_title(f'Distribution of {feature}', fontsize=12)
            ax.set_xlabel(feature)
            ax.set_ylabel('Count')

            for position, count in enumerate(value_counts.values):
                ax.text(position, count, str(count), ha='center', va='bottom', fontsize=10)
        else:  # plot == 'pie'
            labels = [f'{val} ({count})' for val, count in zip(value_counts.index, value_counts.values)]

            ax.pie(value_counts, autopct='%1.1f%%', startangle=90, colors=colors)
            ax.set_title(f'Distribution of {feature}', fontsize=12)
            ax.legend(labels, loc='upper center', bbox_to_anchor=(0.9, 1.0), ncol=1, fontsize='small')

    # Remove the grid cells that did not receive a plot
    for unused_ax in axes[n_features:]:
        fig.delaxes(unused_ax)

    plt.tight_layout(pad=3.0)
    plt.show()

In [1]:
# Just a simple function, (to avoid repetitive code)

def gen_dist_plot_grp(dataframe, feature, bins=30, kde=True):
    """Show a histogram, a box plot and a violin plot of one column.

    The three axes are laid out on a 3x2 grid with ``plt.subplot2grid``: the
    histogram spans the top two rows across both columns, while the box plot
    and the violin plot share the bottom row.

    Args:
        dataframe: DataFrame containing ``feature``.
        feature: Name of the column to plot.
        bins: Passed to ``sns.histplot`` as ``bins``.
        kde: Passed to ``sns.histplot`` as ``kde``.
    """
    grid_shape = (3, 2)

    # Histogram: rows 0-1, full width
    hist_ax = plt.subplot2grid(grid_shape, (0, 0), colspan=2, rowspan=2)
    sns.histplot(dataframe[feature], bins=bins, kde=kde, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {feature}')
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel('Count')

    # Box plot: bottom-left cell
    box_ax = plt.subplot2grid(grid_shape, (2, 0))
    sns.boxplot(x=dataframe[feature], ax=box_ax)
    box_ax.set_title(f'Box Plot of {feature}')
    box_ax.set_xlabel(feature)

    # Violin plot: bottom-right cell
    violin_ax = plt.subplot2grid(grid_shape, (2, 1))
    sns.violinplot(x=dataframe[feature], ax=violin_ax)
    violin_ax.set_title(f'Violin Plot of {feature}')
    violin_ax.set_xlabel(feature)

    plt.tight_layout()
    plt.show()

# **Performance Analysis Plot Functions**

In [5]:
def plot_roc_curves(y_test, model_results_dict):
    """Overlay the ROC curve of every model on a single figure.

    Args:
        y_test: True labels, passed to ``roc_curve``.
        model_results_dict: Maps a model name to a dict with a
            'probabilities' entry (the scores passed to ``roc_curve``) and a
            'metrics' dict whose 'ROC_AUC' value is printed in the legend.
    """
    fig, ax = plt.subplots(figsize=(15, 10))

    # Reference diagonal (random classifier)
    ax.plot([0, 1], [0, 1], 'k--', lw=2)

    # One curve per model; the AUC shown is the stored metric, not recomputed
    for model_name, results in model_results_dict.items():
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, results['probabilities'])
        auc_value = results['metrics']['ROC_AUC']
        ax.plot(false_pos_rate, true_pos_rate, lw=2,
                label=f'{model_name} (AUC = {auc_value:.4f})')

    # Axis limits, labels and legend
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves Comparison', fontsize=14)
    ax.legend(loc="lower right")
    ax.grid(True, linestyle='--', alpha=0.7)

    fig.tight_layout()
    plt.show()

In [6]:
def plot_pr_curves(y_test, model_results_dict):
    """Overlay the precision-recall curve of every model on a single figure.

    A horizontal baseline is drawn at ``sum(y_test) / len(y_test)``.

    Args:
        y_test: True labels, passed to ``precision_recall_curve`` and used
            for the baseline.
        model_results_dict: Maps a model name to a dict with a
            'probabilities' entry (the scores passed to
            ``precision_recall_curve``) and a 'metrics' dict whose 'PR_AUC'
            value is printed in the legend.
    """
    fig, ax = plt.subplots(figsize=(15, 10))

    # One curve per model; the PR AUC shown is the stored metric, not recomputed
    for model_name, results in model_results_dict.items():
        precision, recall, _ = precision_recall_curve(y_test, results['probabilities'])
        pr_auc = results['metrics']['PR_AUC']
        ax.plot(recall, precision, lw=2,
                label=f'{model_name} (PR AUC = {pr_auc:.4f})')

    # Baseline at the mean of y_test
    no_skill = sum(y_test) / len(y_test)
    ax.plot([0, 1], [no_skill, no_skill], 'k--', lw=2,
            label=f'Baseline ({no_skill:.4f})')

    # Axis limits, labels and legend
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Recall', fontsize=12)
    ax.set_ylabel('Precision', fontsize=12)
    ax.set_title('Precision-Recall Curves Comparison', fontsize=14)
    ax.legend(loc="best")
    ax.grid(True, linestyle='--', alpha=0.7)

    fig.tight_layout()
    plt.show()

In [7]:
def plot_metrics_comparison(model_results_dict, metrics_to_plot=None):
    """Draw a grouped bar chart comparing the chosen metrics across models.

    Args:
        model_results_dict: Maps a model name to a dict whose 'metrics' entry
            maps metric names to numeric scores.
        metrics_to_plot: Metric names to show, one bar group each. Defaults to
            ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC_AUC'].

    Raises:
        KeyError: If a model's 'metrics' dict lacks one of the requested metrics.
    """
    if metrics_to_plot is None:
        metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC_AUC']

    if not model_results_dict:
        # Without models there are no bars, and the tick offset below would be wrong
        print("No model results to plot")
        return

    # Scores of each model, in the order of metrics_to_plot
    scores_by_model = {
        model_name: [results['metrics'][metric] for metric in metrics_to_plot]
        for model_name, results in model_results_dict.items()
    }
    n_models = len(scores_by_model)

    fig, ax = plt.subplots(figsize=(12, 8))

    # Metric groups sit one unit apart. Keep the usual 0.15 bar width, but
    # shrink it when there are so many models that a group would exceed 0.9
    # units and run into the next one.
    bar_width = min(0.15, 0.9 / n_models)
    group_start = np.arange(len(metrics_to_plot))

    for offset, (model_name, values) in enumerate(scores_by_model.items()):
        ax.bar(group_start + offset * bar_width, values, bar_width, label=model_name)

    # Labels, ticks centred under each group, legend
    ax.set_xlabel('Metrics', fontsize=12)
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('Model Performance Comparison', fontsize=14)
    ax.set_xticks(group_start + bar_width * (n_models - 1) / 2)
    ax.set_xticklabels(metrics_to_plot)
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.7, axis='y')

    # Value label just above each bar
    for rect in ax.patches:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., height + 0.01,
                f'{height:.2f}', ha='center', va='bottom', rotation=0)

    ax.set_ylim(0, 1.1)  # Metrics are typically between 0 and 1
    fig.tight_layout()
    plt.show()

In [None]:
def plot_confusion_matrices(model_results_dict, class_names=None, normalize=False):
    """Draw one confusion-matrix heatmap per model on a grid at most two columns wide.

    Args:
        model_results_dict: Maps a model name to a dict with a
            'confusion_matrix' entry (2-D array-like of counts) and a
            'metrics' dict whose 'Accuracy' value is printed above the heatmap.
        class_names: Tick labels for both axes. Defaults to
            ['Negative', 'Positive'].
        normalize: When True, each row is divided by its row total and the
            cells are annotated with two decimals instead of raw counts.
    """
    if class_names is None:
        class_names = ['Negative', 'Positive']

    n_models = len(model_results_dict)
    if n_models == 0:
        # A grid with zero columns cannot be laid out
        print("No model results to plot")
        return

    # Grid layout: up to two heatmaps per row
    n_cols = min(2, n_models)
    n_rows = (n_models + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array of axes, so axes[row, col] also
    # works for a single model or a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*6, n_rows*5), squeeze=False)

    for i, (model_name, results) in enumerate(model_results_dict.items()):
        ax = axes[i // n_cols, i % n_cols]

        cm = np.asarray(results['confusion_matrix'])

        if normalize:
            row_totals = cm.sum(axis=1, keepdims=True)
            # Rows without any samples stay at 0 instead of becoming NaN
            shown = np.divide(cm, row_totals,
                              out=np.zeros(cm.shape, dtype=float),
                              where=row_totals != 0)
            annotation_format = '.2f'
        else:
            shown = cm
            annotation_format = 'd'

        sns.heatmap(shown, annot=True, fmt=annotation_format, cmap='Blues', cbar=False,
                    xticklabels=class_names, yticklabels=class_names, ax=ax)

        # Labels and title
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')
        ax.set_title(f'{model_name}\nConfusion Matrix')

        # Stored accuracy, printed just above the heatmap
        accuracy = results['metrics']['Accuracy']
        ax.text(0.12, 1.02, f'Accuracy: {accuracy:.4f}',
                ha='center', va='center', transform=ax.transAxes)

    # Hide the grid cells that did not receive a heatmap
    for spare_ax in axes.ravel()[n_models:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()