# Univariate, Bivariate, and Multivariate Analysis

This notebook performs:
1. **Univariate Analysis** - Analysis of individual variables
2. **Bivariate Analysis** - Analysis of relationships between two variables  
3. **Multivariate Analysis** - Analysis of relationships among multiple variables


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
from scipy import stats

# Notebook-wide configuration.
# NOTE: this silences *all* warnings for the rest of the notebook, including
# deprecation and convergence warnings from pandas/matplotlib/sklearn.
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
np.random.seed(42)

# Every plotting cell below saves into this directory. savefig() does not create
# missing parent directories, so make sure it exists before any figure is written
# (otherwise the first savefig fails with FileNotFoundError on a fresh checkout).
Path('../../outputs/figures').mkdir(parents=True, exist_ok=True)

# Load data
data_path = Path('../../data/fraud_data.csv')
df = pd.read_csv(data_path)
print(f"Data loaded: {df.shape}")


## 1. Univariate Analysis

Univariate analysis examines individual variables to understand their distribution, central tendencies, and variability.


In [None]:
# Univariate analysis for TransactionAmt
# Prints descriptive statistics, draws a 2x3 panel of distribution plots, and
# runs a normality test. Skipped when the column is absent or entirely missing.
if 'TransactionAmt' in df.columns and df['TransactionAmt'].notna().any():
    print("Univariate Analysis: TransactionAmt")
    print("="*80)

    # Analyse observed values only. pandas reductions skip NaN by themselves, but
    # the matplotlib/scipy calls below do not: a single NaN breaks the histogram
    # and box plot and makes normaltest return a NaN p-value.
    amt = df['TransactionAmt'].dropna()
    amt_mean = amt.mean()
    amt_median = amt.median()
    amt_mode = amt.mode()
    q1 = amt.quantile(0.25)
    q3 = amt.quantile(0.75)

    # Basic statistics
    print(f"Mean: {amt_mean:.2f}")
    print(f"Median: {amt_median:.2f}")
    print(f"Mode: {amt_mode.values[0] if len(amt_mode) > 0 else 'N/A'}")
    print(f"Std: {amt.std():.2f}")
    print(f"Variance: {amt.var():.2f}")
    print(f"Skewness: {amt.skew():.4f}")
    print(f"Kurtosis: {amt.kurtosis():.4f}")
    print(f"Min: {amt.min():.2f}")
    print(f"Max: {amt.max():.2f}")
    print(f"25th percentile: {q1:.2f}")
    print(f"75th percentile: {q3:.2f}")
    print(f"IQR: {q3 - q1:.2f}")

    # Comprehensive visualization
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))

    # Histogram with mean/median markers
    axes[0, 0].hist(amt, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
    axes[0, 0].set_title('Histogram', fontsize=12, fontweight='bold')
    axes[0, 0].set_xlabel('Transaction Amount')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].axvline(amt_mean, color='red', linestyle='--', label=f'Mean: {amt_mean:.2f}')
    axes[0, 0].axvline(amt_median, color='green', linestyle='--', label=f'Median: {amt_median:.2f}')
    axes[0, 0].legend()

    # Box plot
    axes[0, 1].boxplot(amt, vert=True)
    axes[0, 1].set_title('Box Plot', fontsize=12, fontweight='bold')
    axes[0, 1].set_ylabel('Transaction Amount')
    axes[0, 1].grid(True, alpha=0.3)

    # Q-Q Plot against a normal distribution
    stats.probplot(amt, dist="norm", plot=axes[0, 2])
    axes[0, 2].set_title('Q-Q Plot (Normal Distribution)', fontsize=12, fontweight='bold')
    axes[0, 2].grid(True, alpha=0.3)

    # Log transformation (log1p so that zero amounts stay finite)
    log_trans_amt = np.log1p(amt)
    axes[1, 0].hist(log_trans_amt, bins=50, edgecolor='black', alpha=0.7, color='orange')
    axes[1, 0].set_title('Log-Transformed Distribution', fontsize=12, fontweight='bold')
    axes[1, 0].set_xlabel('Log(Transaction Amount + 1)')
    axes[1, 0].set_ylabel('Frequency')

    # Density plot
    amt.plot.density(ax=axes[1, 1], color='purple')
    axes[1, 1].set_title('Density Plot', fontsize=12, fontweight='bold')
    axes[1, 1].set_xlabel('Transaction Amount')
    axes[1, 1].set_ylabel('Density')
    axes[1, 1].grid(True, alpha=0.3)

    # Violin plot (by fraud status if available)
    if 'isFraud' in df.columns:
        fraud_data = [
            df.loc[df['isFraud'] == 0, 'TransactionAmt'].dropna(),
            df.loc[df['isFraud'] == 1, 'TransactionAmt'].dropna(),
        ]
        axes[1, 2].violinplot(fraud_data, positions=[0, 1], showmeans=True)
        axes[1, 2].set_xticks([0, 1])
        axes[1, 2].set_xticklabels(['Legitimate', 'Fraud'])
        axes[1, 2].set_title('Transaction Amount by Fraud Status', fontsize=12, fontweight='bold')
        axes[1, 2].set_ylabel('Transaction Amount')
        axes[1, 2].grid(True, alpha=0.3)
    else:
        axes[1, 2].text(0.5, 0.5, 'Fraud status not available', ha='center', va='center', transform=axes[1, 2].transAxes)
        axes[1, 2].set_title('Transaction Amount by Fraud Status', fontsize=12, fontweight='bold')

    fig.tight_layout()
    fig.savefig('../../outputs/figures/univariate_transactionamt.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Normality test (D'Agostino-Pearson) on a reproducible sample of at most
    # 5000 observed values.
    sample_size = min(5000, len(amt))
    print(f"\nNormality Test (D'Agostino's test):")
    if sample_size < 20:
        # SciPy documents the kurtosis component of this test as valid only for n >= 20.
        print(f"  Skipped: only {sample_size} non-missing values (need at least 20)")
    else:
        sample_data = amt.sample(sample_size, random_state=42)
        _, p_value = stats.normaltest(sample_data)
        print(f"  p-value: {p_value:.6f}")
        if np.isnan(p_value):
            # `nan < 0.05` is False, so without this branch a failed test would
            # be reported as "appears normal".
            print(f"  Result: Inconclusive (test returned NaN)")
        elif p_value < 0.05:
            print(f"  Result: Distribution is NOT normal (p < 0.05)")
        else:
            print(f"  Result: Distribution appears normal (p >= 0.05)")


In [None]:
# Univariate analysis for multiple numerical features
# Builds one summary table (location, spread, shape, missingness) and a grid of
# histograms for up to six of the selected columns.
print("\nUnivariate Analysis: Multiple Numerical Features")
print("="*80)

# Select key numerical features.
# Accept any numeric dtype (int32, float32, nullable Int64, ...) rather than only
# int64/float64, so a memory-downcast frame does not silently lose columns.
# Booleans count as numeric in pandas and are excluded explicitly.
numerical_features = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2']
numerical_features = [
    f for f in numerical_features
    if f in df.columns
    and pd.api.types.is_numeric_dtype(df[f])
    and not pd.api.types.is_bool_dtype(df[f])
]

if len(numerical_features) > 0:
    numeric_df = df[numerical_features]
    missing_count = numeric_df.isnull().sum()

    # Create summary statistics (one row per feature)
    univariate_summary = pd.DataFrame({
        'Mean': numeric_df.mean(),
        'Median': numeric_df.median(),
        'Std': numeric_df.std(),
        'Min': numeric_df.min(),
        'Max': numeric_df.max(),
        'Skewness': numeric_df.skew(),
        'Kurtosis': numeric_df.kurtosis(),
        'Missing_Count': missing_count,
        'Missing_Percentage': (missing_count / len(df)) * 100
    })

    print("\nUnivariate Summary Statistics:")
    display(univariate_summary.round(4))

    # Visualize distributions for the first features (the grid has 2x3 = 6 slots)
    n_features = min(6, len(numerical_features))
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()

    for i, feature in enumerate(numerical_features[:n_features]):
        axes[i].hist(df[feature].dropna(), bins=30, edgecolor='black', alpha=0.7)
        axes[i].set_title(f'{feature} Distribution', fontsize=11, fontweight='bold')
        axes[i].set_xlabel(feature)
        axes[i].set_ylabel('Frequency')
        axes[i].grid(True, alpha=0.3)

    # Hide unused subplots
    for i in range(n_features, len(axes)):
        axes[i].axis('off')

    fig.tight_layout()
    fig.savefig('../../outputs/figures/univariate_multiple_features.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Univariate analysis for categorical features
# For the first three available categorical columns: print cardinality and the
# most frequent values, and draw a bar chart when the column has <= 20 levels.
print("\nUnivariate Analysis: Categorical Features")
print("="*80)

categorical_features = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'DeviceType']
categorical_features = [f for f in categorical_features if f in df.columns]

for feature in categorical_features[:3]:  # Analyze first 3 categorical features
    print(f"\n{feature}:")
    print("-" * 40)
    value_counts = df[feature].value_counts()
    n_unique = df[feature].nunique()
    print(f"Unique values: {n_unique}")

    # value_counts() drops NaN, so an all-missing column yields an empty Series
    # and indexing its first element would raise IndexError.
    if value_counts.empty:
        print("No non-missing values; skipping.")
        continue

    print(f"Most frequent value: {value_counts.index[0]} ({value_counts.values[0]} occurrences)")
    print(f"\nTop 10 values:")
    display(value_counts.head(10))

    # Visualize only low-cardinality columns; the chart shows at most 10 bars.
    if n_unique <= 20:
        fig, ax = plt.subplots(figsize=(12, 6))
        value_counts.head(10).plot(kind='bar', color='steelblue', ax=ax)
        ax.set_title(f'{feature} Distribution (Top 10)', fontsize=12, fontweight='bold')
        ax.set_xlabel(feature)
        ax.set_ylabel('Count')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        fig.savefig(f'../../outputs/figures/univariate_{feature}.png', dpi=300, bbox_inches='tight')
        plt.show()


## 2. Bivariate Analysis

Bivariate analysis examines the relationship between two variables to identify patterns, correlations, and associations.


In [None]:
# Bivariate: TransactionAmt vs isFraud
# Compares the amount distribution of legitimate (0) and fraudulent (1)
# transactions: grouped summary, correlation, Mann-Whitney U test, 2x2 plot panel.
if 'TransactionAmt' in df.columns and 'isFraud' in df.columns:
    print("Bivariate Analysis: TransactionAmt vs isFraud")
    print("="*80)

    # Split the amounts once; reused by the test and the histogram overlay.
    fraud_amt = df.loc[df['isFraud'] == 1, 'TransactionAmt'].dropna()
    legit_amt = df.loc[df['isFraud'] == 0, 'TransactionAmt'].dropna()

    # Statistical summary
    print("\nStatistical Summary by Fraud Status:")
    summary = df.groupby('isFraud')['TransactionAmt'].agg(['count', 'mean', 'median', 'std', 'min', 'max'])
    summary.columns = ['Count', 'Mean', 'Median', 'Std', 'Min', 'Max']
    display(summary)

    # Correlation (Pearson between the amount and the 0/1 label)
    corr = df[['TransactionAmt', 'isFraud']].corr().iloc[0, 1]
    print(f"\nCorrelation coefficient: {corr:.4f}")

    # Statistical test. Rank-based, so it does not assume normal amounts.
    # mannwhitneyu raises on an empty sample, so skip when a class is absent.
    if len(fraud_amt) > 0 and len(legit_amt) > 0:
        _, p_value = stats.mannwhitneyu(fraud_amt, legit_amt, alternative='two-sided')
        print(f"Mann-Whitney U test p-value: {p_value:.6f}")
        print(f"Significant difference: {'Yes' if p_value < 0.05 else 'No'}")
    else:
        print("Mann-Whitney U test skipped: one fraud class has no transactions")

    # Comprehensive visualization
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Box plot
    df.boxplot(column='TransactionAmt', by='isFraud', ax=axes[0, 0])
    axes[0, 0].set_title('Transaction Amount by Fraud Status', fontsize=12, fontweight='bold')
    axes[0, 0].set_xlabel('Fraud Status (0=Legitimate, 1=Fraud)')
    axes[0, 0].set_ylabel('Transaction Amount')
    axes[0, 0].set_yscale('log')
    # DataFrame.boxplot(by=...) adds its own figure-level title; clear it.
    fig.suptitle('')

    # Violin plot
    sns.violinplot(data=df, x='isFraud', y='TransactionAmt', ax=axes[0, 1])
    axes[0, 1].set_title('Distribution by Fraud Status', fontsize=12, fontweight='bold')
    axes[0, 1].set_xlabel('Fraud Status (0=Legitimate, 1=Fraud)')
    axes[0, 1].set_ylabel('Transaction Amount')
    axes[0, 1].set_yscale('log')

    # Histogram overlay.
    # Both classes share one set of bin edges over the visible range [0, p99].
    # Letting each call pick its own 50 bins over its full range (outliers
    # included) gives different, very wide bins that are not comparable once the
    # axis is clipped. Amounts above p99 are left out of the density estimate.
    upper = df['TransactionAmt'].quantile(0.99)
    bin_edges = np.linspace(0, upper, 51) if upper > 0 else 50
    axes[1, 0].hist(legit_amt, bins=bin_edges, alpha=0.6, label='Legitimate', color='blue', density=True)
    axes[1, 0].hist(fraud_amt, bins=bin_edges, alpha=0.6, label='Fraud', color='red', density=True)
    axes[1, 0].set_title('Transaction Amount Distribution by Fraud Status', fontsize=12, fontweight='bold')
    axes[1, 0].set_xlabel('Transaction Amount')
    axes[1, 0].set_ylabel('Density')
    if upper > 0:
        axes[1, 0].set_xlim(0, upper)
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # Scatter plot (sample for performance)
    sample_size = min(5000, len(df))
    sample_df = df.sample(sample_size, random_state=42)
    axes[1, 1].scatter(sample_df['isFraud'], sample_df['TransactionAmt'], alpha=0.3, s=10)
    axes[1, 1].set_title('Transaction Amount vs Fraud Status (Sample)', fontsize=12, fontweight='bold')
    axes[1, 1].set_xlabel('Fraud Status (0=Legitimate, 1=Fraud)')
    axes[1, 1].set_ylabel('Transaction Amount')
    axes[1, 1].set_yscale('log')
    axes[1, 1].grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('../../outputs/figures/bivariate_transaction_fraud.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Bivariate: Numerical feature pairs
# Correlation heatmap, optional sampled pairplot, and a ranked table of the
# most strongly correlated feature pairs.
print("\nBivariate Analysis: Numerical Feature Pairs")
print("="*80)

# Select key numerical features for pairwise analysis.
# Any numeric dtype is accepted (not just int64/float64); booleans are excluded.
pair_features = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5']
pair_features = [
    f for f in pair_features
    if f in df.columns
    and pd.api.types.is_numeric_dtype(df[f])
    and not pd.api.types.is_bool_dtype(df[f])
]

if len(pair_features) >= 2:
    # Correlation matrix
    corr_matrix = df[pair_features].corr()

    # Visualize correlation matrix
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0,
                square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=ax)
    ax.set_title('Bivariate Correlation Matrix', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig('../../outputs/figures/bivariate_correlation_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Scatter plot matrix (sample for performance)
    sample_size = min(2000, len(df))
    sample_df = df.sample(sample_size, random_state=42)

    # Create pairplot only for small feature sets (the grid grows quadratically).
    if len(pair_features) <= 4:
        # Colour by fraud status when the label exists; without this check a
        # frame lacking 'isFraud' raises KeyError on the column selection.
        has_fraud = 'isFraud' in sample_df.columns
        pair_columns = pair_features + (['isFraud'] if has_fraud else [])
        sns.pairplot(sample_df[pair_columns], hue='isFraud' if has_fraud else None, diag_kind='kde',
                     plot_kws={'alpha': 0.5, 's': 10})
        plt.savefig('../../outputs/figures/bivariate_pairplot.png', dpi=300, bbox_inches='tight')
        plt.show()

    print("\nTop Correlated Pairs:")
    print("-" * 40)
    # Walk the upper triangle so each unordered pair appears exactly once.
    cols = corr_matrix.columns
    corr_pairs = [
        {
            'Feature1': cols[i],
            'Feature2': cols[j],
            'Correlation': corr_matrix.iloc[i, j]
        }
        for i in range(len(cols))
        for j in range(i + 1, len(cols))
    ]
    # Rank by strength (absolute value) so strong negative pairs are not buried.
    corr_pairs_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
    display(corr_pairs_df.head(10))


In [None]:
# Bivariate: Categorical vs Numerical
# Summarises TransactionAmt within each ProductCD level (table + two plots) and
# within each card4 level (table only).
print("\nBivariate Analysis: Categorical vs Numerical")
print("="*80)

has_amount = 'TransactionAmt' in df.columns

if has_amount and 'ProductCD' in df.columns:
    # ProductCD vs TransactionAmt
    product_stats = (
        df.groupby('ProductCD')['TransactionAmt']
        .agg(['count', 'mean', 'median', 'std'])
        .rename(columns={'count': 'Count', 'mean': 'Mean', 'median': 'Median', 'std': 'Std'})
    )
    print("\nTransaction Amount Statistics by ProductCD:")
    display(product_stats)

    # Visualization: box plot on the left, bar chart of means on the right
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    box_ax, mean_ax = axes

    # Box plot
    df.boxplot(column='TransactionAmt', by='ProductCD', ax=box_ax)
    box_ax.set_title('Transaction Amount by ProductCD', fontsize=12, fontweight='bold')
    box_ax.set(xlabel='ProductCD', ylabel='Transaction Amount')
    box_ax.set_yscale('log')
    # Drop the figure-level title that DataFrame.boxplot(by=...) adds.
    fig.suptitle('')

    # Bar plot of means
    product_stats['Mean'].plot.bar(ax=mean_ax, color='steelblue')
    mean_ax.set_title('Mean Transaction Amount by ProductCD', fontsize=12, fontweight='bold')
    mean_ax.set(xlabel='ProductCD', ylabel='Mean Transaction Amount')
    mean_ax.tick_params(axis='x', rotation=45)
    mean_ax.grid(True, alpha=0.3, axis='y')

    fig.tight_layout()
    fig.savefig('../../outputs/figures/bivariate_product_transaction.png', dpi=300, bbox_inches='tight')
    plt.show()

if has_amount and 'card4' in df.columns:
    # Card4 vs TransactionAmt
    card_stats = (
        df.groupby('card4')['TransactionAmt']
        .agg(['count', 'mean', 'median'])
        .rename(columns={'count': 'Count', 'mean': 'Mean', 'median': 'Median'})
    )
    print("\nTransaction Amount Statistics by Card Type:")
    display(card_stats)


In [None]:
# Bivariate: Categorical vs Categorical
# Cross-tabulates ProductCD against card4, tests independence with a chi-square
# test, and plots the row-normalised composition as a stacked bar chart.
print("\nBivariate Analysis: Categorical vs Categorical")
print("="*80)

if 'ProductCD' in df.columns and 'card4' in df.columns:
    # Contingency table (rows: ProductCD, columns: card4; NaN rows are dropped)
    contingency_table = pd.crosstab(df['ProductCD'], df['card4'])
    print("\nContingency Table: ProductCD vs Card4")
    display(contingency_table)

    if contingency_table.size == 0:
        # No row has both values present; chi2_contingency raises on an empty table.
        print("\nNo rows with both ProductCD and card4 present; skipping test and plot.")
    else:
        # Chi-square test of independence
        chi2, p_value, dof, _ = stats.chi2_contingency(contingency_table)
        print(f"\nChi-square Test:")
        print(f"  Chi-square statistic: {chi2:.4f}")
        print(f"  p-value: {p_value:.6f}")
        print(f"  Degrees of freedom: {dof}")
        print(f"  Significant association: {'Yes' if p_value < 0.05 else 'No'}")

        # Visualization: share of each card type within every ProductCD (rows sum to 100)
        contingency_pct = contingency_table.div(contingency_table.sum(axis=1), axis=0) * 100

        # Draw on an explicit axes. Calling plt.figure() and then
        # DataFrame.plot(figsize=...) opens a second figure and leaves the
        # first one blank in the output.
        fig, ax = plt.subplots(figsize=(12, 6))
        contingency_pct.plot(kind='bar', stacked=True, ax=ax)
        ax.set_title('ProductCD vs Card4 (Percentage)', fontsize=12, fontweight='bold')
        ax.set_xlabel('ProductCD')
        ax.set_ylabel('Percentage (%)')
        ax.legend(title='Card Type', bbox_to_anchor=(1.05, 1), loc='upper left')
        ax.tick_params(axis='x', rotation=0)
        fig.tight_layout()
        fig.savefig('../../outputs/figures/bivariate_product_card.png', dpi=300, bbox_inches='tight')
        plt.show()


## 3. Multivariate Analysis

Multivariate analysis examines relationships among multiple variables simultaneously to identify complex patterns and interactions.


In [None]:
# Multivariate: Correlation matrix
# Heatmap of pairwise correlations across the key features, followed by a
# ranking of the features by the strength of their correlation with isFraud.
print("Multivariate Analysis: Correlation Matrix")
print("="*80)

# Any numeric dtype is accepted (not just int64/float64); booleans are excluded.
key_features = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 
                'addr1', 'addr2', 'dist1', 'dist2', 'isFraud']
key_features = [
    f for f in key_features
    if f in df.columns
    and pd.api.types.is_numeric_dtype(df[f])
    and not pd.api.types.is_bool_dtype(df[f])
]

if len(key_features) > 1:
    # Calculate correlation matrix
    corr_matrix = df[key_features].corr()

    # Visualize correlation matrix
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0,
                square=True, linewidths=1, cbar_kws={"shrink": 0.8}, 
                xticklabels=True, yticklabels=True, ax=ax)
    ax.set_title('Multivariate Correlation Matrix', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig('../../outputs/figures/multivariate_correlation.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Features most correlated with fraud
    if 'isFraud' in corr_matrix.columns:
        # Rank by absolute value: a strong negative correlation is as informative
        # as a strong positive one, and a signed sort pushes it to the bottom.
        # The values shown keep their sign.
        fraud_corr = corr_matrix['isFraud'].sort_values(key=abs, ascending=False)
        print("\nFeatures Most Correlated with Fraud:")
        print("-" * 40)
        display(fraud_corr)

        # Visualize correlation with fraud (the label's self-correlation is dropped).
        # barh draws the first row at the bottom, so reverse to put the strongest
        # feature at the top of the chart.
        fig, ax = plt.subplots(figsize=(10, 6))
        fraud_corr.drop('isFraud').iloc[::-1].plot(kind='barh', color='steelblue', ax=ax)
        ax.set_title('Correlation with Fraud (isFraud)', fontsize=12, fontweight='bold')
        ax.set_xlabel('Correlation Coefficient')
        ax.set_ylabel('Features')
        ax.grid(True, alpha=0.3, axis='x')
        fig.tight_layout()
        fig.savefig('../../outputs/figures/multivariate_fraud_correlation.png', dpi=300, bbox_inches='tight')
        plt.show()

    print("\nMultivariate correlation analysis complete!")


In [None]:
# Multivariate: Principal Component Analysis (PCA)
# Median-imputes and standardises the numeric features, fits up to five
# principal components, and reports explained variance, a PC1/PC2 scatter
# coloured by fraud status, and the component loadings.
print("\nMultivariate Analysis: Principal Component Analysis (PCA)")
print("="*80)

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Select numerical features for PCA.
# Any numeric dtype is accepted (not just int64/float64); booleans are excluded.
# Columns with no observed value are dropped: their median is NaN, so imputation
# would leave NaNs behind and PCA would raise.
pca_features = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 
                'addr1', 'addr2', 'dist1', 'dist2']
pca_features = [
    f for f in pca_features
    if f in df.columns
    and pd.api.types.is_numeric_dtype(df[f])
    and not pd.api.types.is_bool_dtype(df[f])
    and df[f].notna().any()
]

if len(pca_features) >= 3:
    # Prepare data: per-column median imputation, then plain floats for sklearn
    X_pca = df[pca_features].fillna(df[pca_features].median()).astype(float)

    # Standardize features (PCA is scale-sensitive)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_pca)

    # Perform PCA
    pca = PCA(n_components=min(5, len(pca_features)))
    pca_result = pca.fit_transform(X_scaled)

    # Explained variance
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)

    print(f"\nExplained Variance by Component:")
    for i, (var, cum_var) in enumerate(zip(explained_variance, cumulative_variance)):
        print(f"  PC{i+1}: {var:.4f} ({var*100:.2f}%) - Cumulative: {cum_var:.4f} ({cum_var*100:.2f}%)")

    # Visualize explained variance
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    component_ids = list(range(1, len(explained_variance) + 1))

    # Scree plot.
    # The format string carries only marker/line style; the colour comes from
    # `color=`. Giving a colour in both places (e.g. 'ro-' plus color='red')
    # makes matplotlib warn about a redundant colour definition.
    axes[0].bar(component_ids, explained_variance, color='steelblue', alpha=0.7)
    axes[0].plot(component_ids, explained_variance, 'o-', color='red')
    axes[0].set_xticks(component_ids)
    axes[0].set_xlabel('Principal Component')
    axes[0].set_ylabel('Explained Variance Ratio')
    axes[0].set_title('Scree Plot', fontsize=12, fontweight='bold')
    axes[0].grid(True, alpha=0.3, axis='y')

    # Cumulative variance with 80% / 90% reference lines
    axes[1].plot(component_ids, cumulative_variance, 'o-', color='green')
    axes[1].axhline(y=0.8, color='r', linestyle='--', label='80% Variance')
    axes[1].axhline(y=0.9, color='orange', linestyle='--', label='90% Variance')
    axes[1].set_xticks(component_ids)
    axes[1].set_xlabel('Principal Component')
    axes[1].set_ylabel('Cumulative Explained Variance')
    axes[1].set_title('Cumulative Explained Variance', fontsize=12, fontweight='bold')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('../../outputs/figures/multivariate_pca.png', dpi=300, bbox_inches='tight')
    plt.show()

    # PCA components visualization (first 2 components).
    # pca_result rows are in the same order as df, so df['isFraud'] lines up.
    if 'isFraud' in df.columns:
        fig, ax = plt.subplots(figsize=(10, 8))
        scatter = ax.scatter(pca_result[:, 0], pca_result[:, 1], 
                             c=df['isFraud'], alpha=0.5, cmap='coolwarm', s=10)
        fig.colorbar(scatter, ax=ax, label='Fraud Status')
        ax.set_xlabel(f'PC1 ({explained_variance[0]*100:.2f}% variance)')
        ax.set_ylabel(f'PC2 ({explained_variance[1]*100:.2f}% variance)')
        ax.set_title('PCA: First Two Principal Components', fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        fig.savefig('../../outputs/figures/multivariate_pca_scatter.png', dpi=300, bbox_inches='tight')
        plt.show()

    # Feature contributions to principal components (rows: features, columns: PCs)
    components_df = pd.DataFrame(
        pca.components_.T,
        columns=[f'PC{i+1}' for i in range(pca.n_components_)],
        index=pca_features
    )
    print("\nFeature Contributions to Principal Components:")
    display(components_df.round(4))


In [None]:
# Multivariate: Cluster Analysis (K-Means)
# Clusters a reproducible sample of up to 5000 rows on standardised numeric
# features, plots the elbow curve for k = 2..7, then profiles a k=3 solution
# and, when the label exists, compares clusters with fraud status.
print("\nMultivariate Analysis: Cluster Analysis (K-Means)")
print("="*80)

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Prepare data (sample for performance)
sample_size = min(5000, len(df))
sample_df = df.sample(sample_size, random_state=42).copy()

# Select features for clustering.
# Any numeric dtype is accepted (not just int64/float64); booleans are excluded.
# Columns with no observed value in the sample are dropped: their median is NaN,
# so imputation would leave NaNs behind and KMeans would raise.
cluster_features = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5']
cluster_features = [
    f for f in cluster_features
    if f in sample_df.columns
    and pd.api.types.is_numeric_dtype(sample_df[f])
    and not pd.api.types.is_bool_dtype(sample_df[f])
    and sample_df[f].notna().any()
]

if len(cluster_features) >= 2:
    # Median-impute within the sample, then plain floats for sklearn
    X_cluster = sample_df[cluster_features].fillna(sample_df[cluster_features].median()).astype(float)

    # Standardize (K-Means is distance-based, so scales must be comparable)
    scaler = StandardScaler()
    X_cluster_scaled = scaler.fit_transform(X_cluster)

    # Determine optimal number of clusters using elbow method
    inertias = []
    K_range = range(2, 8)
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_cluster_scaled)
        inertias.append(kmeans.inertia_)

    # Visualize elbow method.
    # The format string carries only marker/line style; the colour comes from
    # `color=` ('bo-' plus color='steelblue' is a conflicting colour definition).
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(list(K_range), inertias, 'o-', color='steelblue')
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel('Inertia')
    ax.set_title('Elbow Method for Optimal k', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig('../../outputs/figures/multivariate_elbow_method.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Perform clustering. k is fixed at 3 rather than derived from the elbow
    # curve above; adjust it after inspecting that plot.
    k = 3
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_cluster_scaled)

    # Add cluster labels to sample dataframe
    sample_df['Cluster'] = clusters

    # Analyze clusters (feature means in original units)
    print(f"\nCluster Analysis (k={k}):")
    cluster_summary = sample_df.groupby('Cluster')[cluster_features].mean()
    display(cluster_summary)

    # Compare clusters with fraud status if available
    has_fraud = 'isFraud' in sample_df.columns
    if has_fraud:
        cluster_fraud = sample_df.groupby('Cluster')['isFraud'].agg(['count', 'mean'])
        cluster_fraud.columns = ['Count', 'Fraud_Rate']
        print("\nFraud Rate by Cluster:")
        display(cluster_fraud)

    # Visualize clusters on the first two standardised features. The cluster
    # panel does not need the fraud label, so it is always drawn; the fraud
    # panel is added next to it only when the label is available.
    n_panels = 2 if has_fraud else 1
    fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 6), squeeze=False)
    axes = axes[0]

    # Clusters
    scatter1 = axes[0].scatter(X_cluster_scaled[:, 0], X_cluster_scaled[:, 1], 
                               c=clusters, cmap='viridis', alpha=0.6, s=20)
    axes[0].scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                    c='red', marker='x', s=200, linewidths=3, label='Centroids')
    axes[0].set_xlabel(f'{cluster_features[0]} (standardized)')
    axes[0].set_ylabel(f'{cluster_features[1]} (standardized)')
    axes[0].set_title('K-Means Clustering', fontsize=12, fontweight='bold')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    fig.colorbar(scatter1, ax=axes[0], label='Cluster')

    # Fraud status
    if has_fraud:
        scatter2 = axes[1].scatter(X_cluster_scaled[:, 0], X_cluster_scaled[:, 1],
                                   c=sample_df['isFraud'], cmap='coolwarm', alpha=0.6, s=20)
        axes[1].set_xlabel(f'{cluster_features[0]} (standardized)')
        axes[1].set_ylabel(f'{cluster_features[1]} (standardized)')
        axes[1].set_title('Fraud Status', fontsize=12, fontweight='bold')
        axes[1].grid(True, alpha=0.3)
        fig.colorbar(scatter2, ax=axes[1], label='Fraud Status')

    fig.tight_layout()
    fig.savefig('../../outputs/figures/multivariate_clustering.png', dpi=300, bbox_inches='tight')
    plt.show()


## Summary and Conclusions


In [None]:
# Summary and Conclusions
# Prints a fixed recap of the three analysis stages; nothing here is computed
# from the data.
banner = "="*80

summary_lines = [
    banner,
    "UNIVARIATE, BIVARIATE, AND MULTIVARIATE ANALYSIS SUMMARY",
    banner,
    "\n1. Univariate Analysis:",
    "   - Analyzed individual variable distributions, central tendencies, and variability",
    "   - Identified key statistical measures (mean, median, std, skewness, kurtosis)",
    "   - Examined normality of distributions",
    "   - Analyzed both numerical and categorical features",
    "\n2. Bivariate Analysis:",
    "   - Examined relationships between pairs of variables",
    "   - Identified correlations between numerical features",
    "   - Analyzed associations between categorical and numerical features",
    "   - Performed statistical tests to validate relationships",
    "\n3. Multivariate Analysis:",
    "   - Explored complex relationships among multiple variables",
    "   - Performed Principal Component Analysis (PCA) to identify key dimensions",
    "   - Conducted cluster analysis to identify patterns in data",
    "   - Identified features most correlated with fraud",
    "\n" + banner,
    "Analysis Complete! Check outputs/figures/ for visualizations.",
    banner,
]

# One print per entry, so the emitted text matches line for line.
for line in summary_lines:
    print(line)
