In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import pygwalker as pyg

# data = pd.read_csv("Fraud.csv")  # Importing the dataset into a dataframe

def run_pyg(data):
    """Hand ``data`` to ``pyg.walk`` (pygwalker, imported as ``pyg``).

    The value returned by ``pyg.walk`` is discarded; this function
    returns ``None``.
    """
    pyg.walk(data)

# Summary Statistics

def summary_stats(data):
    """Print the shape of ``data`` and how its columns split by dtype.

    Columns with a NumPy numeric dtype are reported as numerical; every
    other column is reported as categorical.

    Args:
        data: DataFrame to summarise.

    Returns:
        Tuple ``(numerical_features, categorical_features)`` holding the
        column names of each group as lists.
    """
    print("\n--- Summary Statistics ---")
    row_count, col_count = data.shape
    print(
        f"Number of rows (instances): {row_count}",
        f"Number of columns (features): {col_count}",
        sep="\n",
    )

    number_dtypes = [np.number]
    numeric_cols = data.select_dtypes(include=number_dtypes).columns.tolist()
    other_cols = data.select_dtypes(exclude=number_dtypes).columns.tolist()

    print(
        f"Number of numerical features: {len(numeric_cols)}",
        f"Number of categorical features: {len(other_cols)}",
        f"Numerical features: {numeric_cols}",
        f"Categorical features: {other_cols}",
        sep="\n",
    )

    return numeric_cols, other_cols

# Correlation with Target ('isFraud')

def correlation_with_target(data, numerical_features, target_col):
    """Rank numerical features by absolute Pearson correlation with a target.

    Args:
        data: DataFrame holding the feature columns and ``target_col``.
        numerical_features: Column names to evaluate. ``target_col`` is
            ignored if it appears in this list.
        target_col: Name of the target column.

    Returns:
        Tuple ``(ranked_features, target_col)``. ``ranked_features`` lists
        feature names from strongest to weakest absolute correlation.
        Features whose correlation is undefined (the feature or the target
        is constant over the usable rows) are kept but placed last, in
        input order. Features with fewer than two rows where both the
        feature and the target are present are omitted.

    Raises:
        ValueError: If ``target_col`` is not a column of ``data``.
    """
    # Use the actual target name rather than a hard-coded 'isFraud'.
    print(f"\n--- Correlation with '{target_col}' ---")
    if target_col not in data.columns:
        raise ValueError(f" Target column '{target_col}' not found!")

    correlations = {}
    for col in numerical_features:
        if col == target_col:
            continue

        # Only rows where both the feature and the target are present count.
        valid_rows = data[[col, target_col]].dropna()

        # pearsonr needs at least two observations and raises otherwise.
        if len(valid_rows) < 2:
            continue

        feature_values = valid_rows[col]
        target_values = valid_rows[target_col]
        if feature_values.nunique() < 2 or target_values.nunique() < 2:
            # Zero variance on either side leaves the coefficient undefined;
            # record NaN directly instead of triggering a scipy warning.
            correlations[col] = float("nan")
            continue

        corr, _ = pearsonr(feature_values, target_values)
        correlations[col] = abs(corr)

    # NaN does not order against other floats, so sorting on it directly
    # gives an unreliable ranking. Rank the defined values and append the
    # undefined ones afterwards.
    defined = [f for f, value in correlations.items() if not math.isnan(value)]
    undefined = [f for f, value in correlations.items() if math.isnan(value)]
    correlation_values = sorted(defined, key=correlations.get, reverse=True)
    correlation_values.extend(undefined)

    print(f"Pearson Correlation Values with '{target_col}':")
    for feature in correlation_values:
        print(f"{feature}: {correlations[feature]:.4f}")

    return correlation_values, target_col

# Plotting Histograms
def plot_histograms(data):
    """Show the output of ``data.hist`` on a 12x10 figure.

    Bars are drawn with black edges. The figure is laid out with
    ``plt.tight_layout`` and displayed with ``plt.show``; nothing is
    returned.
    """
    print("\n--- Plotting Histograms ---")
    hist_options = {"figsize": (12, 10), "edgecolor": "black"}
    data.hist(**hist_options)
    plt.tight_layout()
    plt.show()

# Plotting Violin plot
def plot_violin(data, numerical_features, target):
    """Draw one violin plot per numerical feature, grouped by ``target``.

    Plots are arranged on a grid three columns wide; unused grid cells are
    hidden. The figure is displayed with ``plt.show`` and nothing is
    returned.

    Args:
        data: DataFrame containing the feature columns and ``target``.
        numerical_features: Column names to plot on the y axis. ``target``
            itself is skipped if it appears in this list.
        target: Column used for the x axis grouping.
    """
    print("\n--- Plotting Violin Plots ---")
    features = [f for f in numerical_features if f != target]
    if not features:
        # A zero-row grid is rejected by plt.subplots, so stop here.
        print("No numerical features to plot.")
        return

    ncols = 3
    nrows = math.ceil(len(features) / ncols)

    # squeeze=False always returns a 2-D array of axes, so flattening it
    # is safe whatever the grid shape.
    _, axes = plt.subplots(
        nrows, ncols, figsize=(5 * ncols, 4 * nrows), squeeze=False
    )
    axes = axes.ravel()

    for ax, feature in zip(axes, features):
        sns.violinplot(data=data, x=target, y=feature, ax=ax)
        ax.set_title(f'Violin Plot of {feature} by {target}')
        ax.set_xlabel(target)
        ax.set_ylabel(feature)

    # Hide the grid cells left over when the features do not fill the grid.
    for ax in axes[len(features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Plotting Strip plot
def plot_strip(data, numerical_features, target):
    """Draw one strip plot per numerical feature, grouped by ``target``.

    Plots are arranged on a grid three columns wide; unused grid cells are
    hidden. The figure is displayed with ``plt.show`` and nothing is
    returned.

    Args:
        data: DataFrame containing the feature columns and ``target``.
        numerical_features: Column names to plot on the y axis. ``target``
            itself is skipped if it appears in this list.
        target: Column used for the x axis grouping.
    """
    print("\n--- Plotting Stripplots ---")
    features = [f for f in numerical_features if f != target]
    if not features:
        # A zero-row grid is rejected by plt.subplots, so stop here.
        print("No numerical features to plot.")
        return

    ncols = 3
    nrows = math.ceil(len(features) / ncols)

    # squeeze=False always returns a 2-D array of axes, so flattening it
    # is safe whatever the grid shape.
    _, axes = plt.subplots(
        nrows, ncols, figsize=(5 * ncols, 4 * nrows), squeeze=False
    )
    axes = axes.ravel()

    for ax, feature in zip(axes, features):
        sns.stripplot(data=data, x=target, y=feature, alpha=0.7, size=4, ax=ax)
        ax.set_title(f'Strip Plot of {feature} by {target}')
        ax.set_xlabel(target)
        ax.set_ylabel(feature)

    # Hide the grid cells left over when the features do not fill the grid.
    for ax in axes[len(features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Plotting Correlation Heatmap

def plot_heatmap(data):
    """Show an annotated heatmap of pairwise correlations between columns.

    Integer columns (signed or unsigned) with exactly two distinct values
    are left out before the correlation matrix is computed. The matrix
    comes from ``DataFrame.corr(numeric_only=True)`` and is drawn on a
    colour scale fixed to [-1, 1]. If no column is left to correlate, a
    message is printed and nothing is drawn.

    Args:
        data: DataFrame whose columns are correlated.
    """
    print("\n--- Plotting Correlation ---")

    # Keep every column except two-valued integer ones. Kind "u" is checked
    # alongside "i" so unsigned flag columns (e.g. uint8) are dropped too.
    kept_columns = [
        col for col in data.columns
        if not (data[col].dtype.kind in "iu" and data[col].nunique() == 2)
    ]
    if not kept_columns:
        print("No numeric columns to correlate; skipping heatmap.")
        return

    corr_matrix = data[kept_columns].corr(numeric_only=True)
    if corr_matrix.empty:
        # Only non-numeric columns survived the filter, so there is
        # nothing to draw.
        print("No numeric columns to correlate; skipping heatmap.")
        return

    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
    plt.title("Heatmap of Correlation between features")
    plt.tight_layout()  # Ensures the plot fits well in the figure
    plt.show()