In [85]:
"""
═══════════════════════════════════════════════════════════════════════════════════════════════════════
Enhanced Synthetic Data Validation Suite v2.0
Urals Loading ML Forecasting

Statistical tests for synthetic vs real data comparison
Date: November 2025
═══════════════════════════════════════════════════════════════════════════════════════════════════════

OVERVIEW:
---------
This module implements a comprehensive statistical validation framework for comparing
synthetic data against real observable data. Essential for validating ML training data
in commodity forecasting applications.

TESTS IMPLEMENTED:
------------------
1. Descriptive Statistics (mean, std, skewness, kurtosis, percentiles)
2. Kolmogorov-Smirnov Two-Sample Test (distribution comparison)
3. Anderson-Darling Test (tail-sensitive normality)
4. Jarque-Bera Test (combined skew/kurtosis normality)
5. D'Agostino-Pearson Test (normality)
6. Mann-Whitney U Test (non-parametric comparison)
7. Levene's Test (variance equality)
8. Correlation Structure Validation
9. Tail Risk Analysis (VaR percentiles)

"""

"\n═══════════════════════════════════════════════════════════════════════════════════════════════════════\nEnhanced Synthetic Data Validation Suite v2.0\nUrals Loading ML Forecasting\n\nStatistical tests for synthetic vs real data comparison\nDate: November 2025\n═══════════════════════════════════════════════════════════════════════════════════════════════════════\n\nOVERVIEW:\n---------\nThis module implements a comprehensive statistical validation framework for comparing\nsynthetic data against real observable data. Essential for validating ML training data\nin commodity forecasting applications.\n\nTESTS IMPLEMENTED:\n------------------\n1. Descriptive Statistics (mean, std, skewness, kurtosis, percentiles)\n2. Kolmogorov-Smirnov Two-Sample Test (distribution comparison)\n3. Anderson-Darling Test (tail-sensitive normality)\n4. Jarque-Bera Test (combined skew/kurtosis normality)\n5. D'Agostino-Pearson Test (normality)\n6. Mann-Whitney U Test (non-parametric comparison)\n7. Levene's

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

In [87]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

class Config:
    """
    Central thresholds shared by the statistical validation tests.

    ALPHA is the significance level that the hypothesis-test helpers compare
    p-values against. The CORR_ERROR_* values are strict upper bounds on the
    absolute correlation error used to grade a variable pair as EXCELLENT,
    GOOD or ACCEPTABLE; an error at or above CORR_ERROR_ACCEPTABLE is POOR.
    """

    # --- Hypothesis testing -------------------------------------------------
    ALPHA = 0.05         # significance level for p-value based pass/fail
    # NOTE(review): KS_THRESHOLD is not referenced by the code visible in this
    # section; the KS helper compares against ALPHA and literal cut-offs.
    KS_THRESHOLD = 0.05
    # Anderson-Darling critical value at 5%.
    # NOTE(review): anderson_darling_test reads the critical value from the
    # SciPy result rather than from this constant.
    AD_CRITICAL = 0.787

    # --- Correlation-preservation grades (absolute error, tightest first) ---
    CORR_ERROR_EXCELLENT, CORR_ERROR_GOOD, CORR_ERROR_ACCEPTABLE = 0.03, 0.05, 0.10

In [88]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 1: DESCRIPTIVE STATISTICS
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def compute_descriptive_stats(series: pd.Series, name: str) -> Dict:
    """
    Compute comprehensive descriptive statistics for a series.

    Missing values are dropped once, up front, so that every statistic is
    computed on the same sample. Without this, the pandas reductions would
    skip NaN while ``scipy.stats.skew`` / ``scipy.stats.kurtosis`` would
    propagate it and return NaN.

    Parameters:
    -----------
    series : pd.Series
        Data series to analyze
    name : str
        Variable name for labeling

    Returns:
    --------
    dict : Dictionary containing all descriptive statistics. ``N`` is the
        number of non-missing observations; ``Kurtosis`` is excess (Fisher)
        kurtosis; ``CV_pct`` is NaN when the mean is exactly zero.
    """
    clean = series.dropna()

    # Compute each reused quantity once instead of on every dictionary entry.
    mean = clean.mean()
    std = clean.std()
    minimum = clean.min()
    maximum = clean.max()
    q1 = clean.quantile(0.25)
    q3 = clean.quantile(0.75)

    return {
        'Variable': name,
        'N': len(clean),
        'Mean': mean,
        'Std': std,
        'Min': minimum,
        'Q1': q1,
        'Median': clean.median(),
        'Q3': q3,
        'Max': maximum,
        'Range': maximum - minimum,
        'IQR': q3 - q1,
        # Coefficient of variation is undefined for a zero mean.
        'CV_pct': (std / mean * 100) if mean != 0 else np.nan,
        'Skewness': stats.skew(clean),
        'Kurtosis': stats.kurtosis(clean),  # Excess kurtosis
        'P1': clean.quantile(0.01),
        'P5': clean.quantile(0.05),
        'P95': clean.quantile(0.95),
        'P99': clean.quantile(0.99)
    }


def compare_descriptive_stats(real: pd.Series, synthetic: pd.Series, var_name: str) -> Dict:
    """
    Put the descriptive statistics of a real and a synthetic series side by side.

    Both series are summarised with ``compute_descriptive_stats``. The mean and
    standard deviation are additionally compared as an absolute percentage
    difference relative to the real value; skewness and kurtosis are reported
    as-is for both samples.

    Returns:
    --------
    dict : ``Variable`` plus Real_/Synth_ values for Mean, Std, Skewness and
        Kurtosis, and ``Mean_Error_Pct`` / ``Std_Error_Pct``.
    """
    real_summary = compute_descriptive_stats(real, f"Real_{var_name}")
    synth_summary = compute_descriptive_stats(synthetic, f"Synth_{var_name}")

    def _gap_pct(metric):
        # Absolute gap expressed as a percentage of the real-data value.
        baseline = real_summary[metric]
        return abs(baseline - synth_summary[metric]) / abs(baseline) * 100

    comparison = {'Variable': var_name}

    comparison['Real_Mean'] = real_summary['Mean']
    comparison['Synth_Mean'] = synth_summary['Mean']
    comparison['Mean_Error_Pct'] = _gap_pct('Mean')

    comparison['Real_Std'] = real_summary['Std']
    comparison['Synth_Std'] = synth_summary['Std']
    comparison['Std_Error_Pct'] = _gap_pct('Std')

    comparison['Real_Skewness'] = real_summary['Skewness']
    comparison['Synth_Skewness'] = synth_summary['Skewness']
    comparison['Real_Kurtosis'] = real_summary['Kurtosis']
    comparison['Synth_Kurtosis'] = synth_summary['Kurtosis']

    return comparison

In [89]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 2: DISTRIBUTION SHAPE ANALYSIS (SKEWNESS & KURTOSIS)
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def analyze_distribution_shape(series: pd.Series) -> Dict:
    """
    Analyze distribution shape through skewness and kurtosis.

    Missing values are dropped before the moments are computed, matching the
    other test helpers in this notebook.

    Skewness interpretation:
        |skew| < 0.5:  "Symmetric"
        |skew| < 1.0:  "Moderate skew"
        |skew| >= 1.0: "Highly skewed"

    Kurtosis interpretation (excess kurtosis, normal = 0):
        |kurt| < 1:  "Normal tails" (roughly mesokurtic)
        kurt >= 1:   "Fat tails (risk)" (leptokurtic) - more extreme events
        kurt <= -1:  "Thin tails" (platykurtic) - fewer extreme events

    A moment that cannot be computed (NaN, e.g. for an empty sample) is
    labelled "Undefined" instead of falling through to the last branch.

    Returns:
    --------
    dict : 'Skewness', 'Skewness_Interpretation', 'Kurtosis',
        'Kurtosis_Interpretation'
    """
    clean = series.dropna()
    skewness = stats.skew(clean)
    kurtosis = stats.kurtosis(clean)

    # Interpret skewness
    if np.isnan(skewness):
        skew_interp = "Undefined"
    elif abs(skewness) < 0.5:
        skew_interp = "Symmetric"
    elif abs(skewness) < 1.0:
        skew_interp = "Moderate skew"
    else:
        skew_interp = "Highly skewed"

    # Interpret kurtosis
    if np.isnan(kurtosis):
        kurt_interp = "Undefined"
    elif abs(kurtosis) < 1:
        kurt_interp = "Normal tails"
    elif kurtosis >= 1:
        # '>=' so that an excess kurtosis of exactly 1 is not reported as thin tails.
        kurt_interp = "Fat tails (risk)"
    else:
        kurt_interp = "Thin tails"

    return {
        'Skewness': skewness,
        'Skewness_Interpretation': skew_interp,
        'Kurtosis': kurtosis,
        'Kurtosis_Interpretation': kurt_interp
    }

In [90]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 3: KOLMOGOROV-SMIRNOV TWO-SAMPLE TEST
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def ks_two_sample_test(real: pd.Series, synthetic: pd.Series) -> Dict:
    """
    Two-sample Kolmogorov-Smirnov comparison of a real and a synthetic series.

    H0: Both samples come from the same distribution
    H1: Samples come from different distributions

    The statistic is the largest vertical distance between the two empirical
    CDFs. Missing values are removed from both series before testing.

    Parameters:
    -----------
    real : pd.Series
        Real data series
    synthetic : pd.Series
        Synthetic data series

    Returns:
    --------
    dict : 'Test', 'Statistic', 'p_value', 'Alpha', 'Passed' (p-value above
        Config.ALPHA), 'Quality' (EXCELLENT above 0.10, GOOD above 0.05,
        otherwise POOR) and a plain-language 'Interpretation'.
    """
    real_sample = real.dropna()
    synth_sample = synthetic.dropna()
    ks_stat, p_value = stats.ks_2samp(real_sample, synth_sample)

    passed = p_value > Config.ALPHA

    # Grade the p-value from the most to the least favourable band.
    quality = "POOR"
    for floor, grade in ((0.10, "EXCELLENT"), (0.05, "GOOD")):
        if p_value > floor:
            quality = grade
            break

    verdict = 'statistically similar' if passed else 'significantly different'

    report = {}
    report['Test'] = 'Kolmogorov-Smirnov'
    report['Statistic'] = ks_stat
    report['p_value'] = p_value
    report['Alpha'] = Config.ALPHA
    report['Passed'] = passed
    report['Quality'] = quality
    report['Interpretation'] = f"Distributions are {verdict}"
    return report


In [91]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 4: ANDERSON-DARLING TEST (TAIL-SENSITIVE)
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def anderson_darling_test(series: pd.Series) -> Dict:
    """
    Anderson-Darling test for normality.

    More sensitive to distribution tails than KS test.
    Critical for VaR/CVaR risk modeling where tails matter.

    H0: Data follows normal distribution
    H1: Data does not follow normal distribution

    Missing values are dropped first, as in the other test helpers; a single
    NaN would otherwise turn the statistic into NaN and silently fail the test.

    Parameters:
    -----------
    series : pd.Series
        Data series to test

    Returns:
    --------
    dict : 'Test', 'Statistic', 'Critical_Value_5pct', 'Passed' (statistic
        below the 5% critical value) and 'Interpretation'.
    """
    clean = series.dropna()

    # Standardize the data
    # NOTE: a zero-variance sample yields NaN here and therefore a failed test.
    standardized = (clean - clean.mean()) / clean.std()

    result = stats.anderson(standardized, dist='norm')

    # For dist='norm' SciPy reports critical values at 15%, 10%, 5%, 2.5%, 1%,
    # so index 2 is the 5% level.
    critical_5pct = result.critical_values[2]
    passed = result.statistic < critical_5pct

    return {
        'Test': 'Anderson-Darling',
        'Statistic': result.statistic,
        'Critical_Value_5pct': critical_5pct,
        'Passed': passed,
        'Interpretation': f"Data is {'approximately normal' if passed else 'non-normal'}"
    }

In [92]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 5: JARQUE-BERA TEST
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def jarque_bera_test(series: pd.Series) -> Dict:
    """
    Jarque-Bera normality test on the non-missing values of a series.

    The statistic combines sample skewness S and excess kurtosis K:
    JB = (n/6) * [S² + (K²/4)]

    H0: Data follows normal distribution (S=0, K=0)
    H1: Data does not follow normal distribution

    Returns:
    --------
    dict : 'Test', 'Statistic', 'p_value', 'Passed' (p-value above
        Config.ALPHA) and 'Interpretation'.
    """
    sample = series.dropna()
    jb_stat, p_value = stats.jarque_bera(sample)

    passed = p_value > Config.ALPHA
    verdict = 'approximately normal' if passed else 'non-normal'

    report = {}
    report['Test'] = 'Jarque-Bera'
    report['Statistic'] = jb_stat
    report['p_value'] = p_value
    report['Passed'] = passed
    report['Interpretation'] = f"Data is {verdict}"
    return report

In [93]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 6: D'AGOSTINO-PEARSON TEST
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def dagostino_pearson_test(series: pd.Series) -> Dict:
    """
    D'Agostino-Pearson test for normality.

    Tests whether skewness and kurtosis significantly deviate from normal.
    Combines tests for skewness and kurtosis.

    H0: Data follows normal distribution
    H1: Data does not follow normal distribution

    Returns:
    --------
    dict : 'Test', 'Statistic', 'p_value', 'Passed' (p-value above
        Config.ALPHA) and 'Interpretation'. If SciPy cannot run the test, the
        same keys are returned with NaN statistic/p-value and Passed=False,
        plus an 'Error' key holding the exception message, so callers can
        index the result the same way on both paths.
    """
    try:
        stat, p_value = stats.normaltest(series.dropna())
    except Exception as e:
        # Deliberately best-effort: report the failure instead of aborting the
        # whole validation run, but keep the result schema consistent.
        return {
            'Test': 'DAgostino-Pearson',
            'Statistic': np.nan,
            'p_value': np.nan,
            'Passed': False,
            'Interpretation': "Normality could not be assessed",
            'Error': str(e)
        }

    passed = p_value > Config.ALPHA

    return {
        'Test': 'DAgostino-Pearson',
        'Statistic': stat,
        'p_value': p_value,
        'Passed': passed,
        'Interpretation': f"Data is {'approximately normal' if passed else 'non-normal'}"
    }

In [94]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 7: MANN-WHITNEY U TEST (NON-PARAMETRIC)
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def mann_whitney_test(real: pd.Series, synthetic: pd.Series) -> Dict:
    """
    Two-sided Mann-Whitney U test between a real and a synthetic series.

    Rank-based, so it makes no normality assumption and stays usable on
    skewed data. Missing values are removed from both series first.

    H0: Both samples have the same distribution
    H1: Samples have different distributions

    Returns:
    --------
    dict : 'Test', 'Statistic', 'p_value', 'Passed' (p-value above
        Config.ALPHA) and 'Interpretation'.
    """
    real_sample = real.dropna()
    synth_sample = synthetic.dropna()
    u_stat, p_value = stats.mannwhitneyu(real_sample, synth_sample, alternative='two-sided')

    passed = p_value > Config.ALPHA
    verdict = 'statistically similar' if passed else 'significantly different'

    report = {}
    report['Test'] = 'Mann-Whitney U'
    report['Statistic'] = u_stat
    report['p_value'] = p_value
    report['Passed'] = passed
    report['Interpretation'] = f"Distributions are {verdict}"
    return report

In [95]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 8: LEVENE'S TEST FOR VARIANCE EQUALITY
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def levene_test(real: pd.Series, synthetic: pd.Series) -> Dict:
    """
    Levene's test for equal variances between a real and a synthetic series.

    H0: Both samples have equal variances
    H1: Samples have different variances

    The test itself runs on the non-missing values; the reported variances
    come from pandas' ``var()`` on the series as passed in.

    Returns:
    --------
    dict : 'Test', 'Statistic', 'p_value', 'Real_Variance',
        'Synthetic_Variance', 'Passed' (p-value above Config.ALPHA) and
        'Interpretation'.
    """
    real_sample = real.dropna()
    synth_sample = synthetic.dropna()
    lev_stat, p_value = stats.levene(real_sample, synth_sample)

    passed = p_value > Config.ALPHA
    verdict = 'equal' if passed else 'significantly different'

    report = {}
    report['Test'] = 'Levene'
    report['Statistic'] = lev_stat
    report['p_value'] = p_value
    report['Real_Variance'] = real.var()
    report['Synthetic_Variance'] = synthetic.var()
    report['Passed'] = passed
    report['Interpretation'] = f"Variances are {verdict}"
    return report

In [96]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 9: CORRELATION STRUCTURE VALIDATION
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def validate_correlation_structure(
    real_df: pd.DataFrame, 
    synthetic_df: pd.DataFrame,
    var_names: List[str]
) -> Dict:
    """
    Validate that correlation structure is preserved between real and synthetic data.

    Critical for:
    - ML model training (relationships must be preserved)
    - Hedging strategies (correlation drives hedge ratios)
    - Portfolio optimization (covariance matrix)

    Columns are matched BY POSITION: the i-th column of ``real_df`` is compared
    with the i-th column of ``synthetic_df`` and labelled ``var_names[i]``.
    The two frames may therefore use different column names for the same
    variable.

    Parameters:
    -----------
    real_df : pd.DataFrame
        Real data, one column per variable
    synthetic_df : pd.DataFrame
        Synthetic data, columns in the same order as ``real_df``
    var_names : List[str]
        Display names for the columns, in column order

    Returns:
    --------
    dict : both correlation matrices, the absolute 'Error_Matrix', the list
        of per-pair comparisons, summary error statistics (NaN when there is
        no pair to compare) and the number of pairs per quality grade.
    """
    real_corr = real_df.corr()
    synth_corr = synthetic_df.corr()

    # Compute error matrix.
    # A plain `real_corr - synth_corr` aligns on labels, so columns that are
    # named differently in the two frames would produce a NaN-padded union
    # matrix. Subtract the underlying arrays instead, consistent with the
    # positional pairwise comparison below.
    if real_corr.shape == synth_corr.shape:
        error_matrix = pd.DataFrame(
            np.abs(real_corr.to_numpy() - synth_corr.to_numpy()),
            index=real_corr.index,
            columns=real_corr.columns
        )
    else:
        # Shapes differ: fall back to label alignment rather than guessing.
        error_matrix = (real_corr - synth_corr).abs()

    # Extract pairwise comparisons (upper triangle only)
    pairs = []
    n = len(var_names)
    for i in range(n):
        for j in range(i + 1, n):
            var1, var2 = var_names[i], var_names[j]
            real_c = real_corr.iloc[i, j]
            synth_c = synth_corr.iloc[i, j]
            error = abs(real_c - synth_c)
            # Relative error is unbounded when the real correlation is zero.
            error_pct = (error / abs(real_c) * 100) if real_c != 0 else np.inf

            # Quality rating
            if error < Config.CORR_ERROR_EXCELLENT:
                quality = "EXCELLENT"
            elif error < Config.CORR_ERROR_GOOD:
                quality = "GOOD"
            elif error < Config.CORR_ERROR_ACCEPTABLE:
                quality = "ACCEPTABLE"
            else:
                quality = "POOR"

            pairs.append({
                'Pair': f"{var1}-{var2}",
                'Real_Corr': real_c,
                'Synth_Corr': synth_c,
                'Error': error,
                'Error_Pct': error_pct,
                'Quality': quality
            })

    # Summary statistics; with fewer than two variables there are no pairs and
    # np.max([]) would raise, so report NaN instead.
    errors = [p['Error'] for p in pairs]
    grade_counts = {grade: 0 for grade in ('EXCELLENT', 'GOOD', 'ACCEPTABLE', 'POOR')}
    for p in pairs:
        grade_counts[p['Quality']] += 1

    return {
        'Real_Correlation_Matrix': real_corr,
        'Synthetic_Correlation_Matrix': synth_corr,
        'Error_Matrix': error_matrix,
        'Pairwise_Comparison': pairs,
        'Mean_Absolute_Error': np.mean(errors) if errors else np.nan,
        'Max_Absolute_Error': np.max(errors) if errors else np.nan,
        'Std_of_Errors': np.std(errors) if errors else np.nan,
        'Pairs_Excellent': grade_counts['EXCELLENT'],
        'Pairs_Good': grade_counts['GOOD'],
        'Pairs_Acceptable': grade_counts['ACCEPTABLE'],
        'Pairs_Poor': grade_counts['POOR']
    }

In [97]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# SECTION 10: TAIL RISK ANALYSIS (VaR/CVaR)
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def analyze_tail_risk(real: pd.Series, synthetic: pd.Series) -> Dict:
    """
    Analyze tail risk preservation by comparing quantiles of real and synthetic data.

    Compares key percentiles that matter for risk management:
    - 1st percentile (99% VaR)
    - 5th percentile (95% VaR)
    - 95th percentile
    - 99th percentile
    together with the 10th, 25th, 50th, 75th and 90th percentiles.

    Parameters:
    -----------
    real : pd.Series
        Real data series
    synthetic : pd.Series
        Synthetic data series

    Returns:
    --------
    dict :
        'Percentile_Comparison' - one entry per percentile with the real and
            synthetic quantile and 'Diff_Pct', the absolute difference as a
            percentage of the real quantile. When the real quantile is zero
            the percentage is 0 only if the synthetic quantile is also zero,
            otherwise ``np.inf`` (a mismatch is never reported as a perfect
            match).
        'IQR_Coverage' / 'Tail_Range_Coverage' - synthetic range as a
            percentage of the real range (25-75% and 1-99%), NaN when the real
            range is zero.
        'VaR_99_*' / 'VaR_95_*' - the 1st and 5th percentiles of each series.
    """
    percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]

    # Compute every quantile once and reuse it below.
    real_q = {p: real.quantile(p) for p in percentiles}
    synth_q = {p: synthetic.quantile(p) for p in percentiles}

    comparisons = []
    for p in percentiles:
        gap = abs(real_q[p] - synth_q[p])
        if real_q[p] != 0:
            diff_pct = gap / abs(real_q[p]) * 100
        elif gap == 0:
            diff_pct = 0.0
        elif gap > 0:
            # Relative difference against a zero reference is unbounded.
            diff_pct = np.inf
        else:
            # gap is NaN (synthetic quantile undefined).
            diff_pct = np.nan

        comparisons.append({
            'Percentile': f"{p*100:.0f}%",
            'Real': real_q[p],
            'Synthetic': synth_q[p],
            'Diff_Pct': diff_pct
        })

    # Calculate tail coverage ratios
    real_iqr = real_q[0.75] - real_q[0.25]
    synth_iqr = synth_q[0.75] - synth_q[0.25]

    real_tail_range = real_q[0.99] - real_q[0.01]
    synth_tail_range = synth_q[0.99] - synth_q[0.01]

    return {
        'Percentile_Comparison': comparisons,
        'IQR_Coverage': synth_iqr / real_iqr * 100 if real_iqr != 0 else np.nan,
        'Tail_Range_Coverage': synth_tail_range / real_tail_range * 100 if real_tail_range != 0 else np.nan,
        'VaR_99_Real': real_q[0.01],
        'VaR_99_Synthetic': synth_q[0.01],
        'VaR_95_Real': real_q[0.05],
        'VaR_95_Synthetic': synth_q[0.05]
    }

In [98]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# COMPREHENSIVE VALIDATION SUITE
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

class SyntheticDataValidator:
    """
    Comprehensive validation suite for synthetic data.

    Runs the per-variable statistical tests defined in this notebook on each
    mapped pair of real/synthetic columns and summarises the outcome as a
    table and as a 0-10 score.

    Usage:
    ------
    validator = SyntheticDataValidator(real_df, synthetic_df, column_mapping)
    results = validator.run_all_tests()
    summary = validator.get_summary_table()
    score = validator.calculate_overall_score()
    """
    
    def __init__(
        self, 
        real_df: pd.DataFrame, 
        synthetic_df: pd.DataFrame,
        column_mapping: Dict[str, Tuple[str, str]]
    ):
        """
        Initialize validator.
        
        Parameters:
        -----------
        real_df : pd.DataFrame
            Real data
        synthetic_df : pd.DataFrame
            Synthetic data
        column_mapping : dict
            Maps variable names to (real_col, synth_col) tuples
        """
        self.real_df = real_df
        self.synthetic_df = synthetic_df
        self.column_mapping = column_mapping
        # Filled by run_all_tests(); empty until then.
        self.results = {}
    
    def run_all_tests(self) -> Dict:
        """
        Run all validation tests for every mapped variable.

        Returns:
        --------
        dict : variable name -> dict of test results. The same object is also
            stored on ``self.results``.

        Raises:
        -------
        KeyError : if a mapped column is missing from its DataFrame. All
            mappings are checked before any test runs.
        """
        # Fail fast, and say which mapping entry is wrong, before any work is done.
        for var_name, (real_col, synth_col) in self.column_mapping.items():
            if real_col not in self.real_df.columns:
                raise KeyError(
                    f"Column '{real_col}' mapped for variable '{var_name}' not found in real data"
                )
            if synth_col not in self.synthetic_df.columns:
                raise KeyError(
                    f"Column '{synth_col}' mapped for variable '{var_name}' not found in synthetic data"
                )

        results = {}
        
        for var_name, (real_col, synth_col) in self.column_mapping.items():
            real_series = self.real_df[real_col].dropna()
            synth_series = self.synthetic_df[synth_col].dropna()
            
            results[var_name] = {
                'Descriptive': compare_descriptive_stats(real_series, synth_series, var_name),
                'Distribution_Shape_Real': analyze_distribution_shape(real_series),
                'Distribution_Shape_Synth': analyze_distribution_shape(synth_series),
                'KS_Test': ks_two_sample_test(real_series, synth_series),
                'Mann_Whitney': mann_whitney_test(real_series, synth_series),
                'Levene': levene_test(real_series, synth_series),
                'Jarque_Bera_Real': jarque_bera_test(real_series),
                'Jarque_Bera_Synth': jarque_bera_test(synth_series),
                'Anderson_Darling_Real': anderson_darling_test(real_series),
                'Anderson_Darling_Synth': anderson_darling_test(synth_series),
                'DAgostino_Real': dagostino_pearson_test(real_series),
                'DAgostino_Synth': dagostino_pearson_test(synth_series),
                'Tail_Risk': analyze_tail_risk(real_series, synth_series)
            }
        
        self.results = results
        return results
    
    def get_summary_table(self) -> pd.DataFrame:
        """
        Generate summary table for presentation.

        One row per variable in ``self.results`` with PASS/FAIL flags and
        p-values for the KS, Mann-Whitney and Levene tests, the Jarque-Bera
        flag for the synthetic series, and the mean/std percentage errors.
        Returns an empty DataFrame if run_all_tests() has not been called.
        """
        def _flag(test_result: Dict) -> str:
            return 'PASS' if test_result['Passed'] else 'FAIL'

        rows = []
        for var_name, tests in self.results.items():
            rows.append({
                'Variable': var_name,
                'KS_Test': _flag(tests['KS_Test']),
                'KS_pvalue': tests['KS_Test']['p_value'],
                'Mann_Whitney': _flag(tests['Mann_Whitney']),
                'MW_pvalue': tests['Mann_Whitney']['p_value'],
                'Levene': _flag(tests['Levene']),
                'Levene_pvalue': tests['Levene']['p_value'],
                'JB_Synth': _flag(tests['Jarque_Bera_Synth']),
                'Mean_Error_Pct': tests['Descriptive']['Mean_Error_Pct'],
                'Std_Error_Pct': tests['Descriptive']['Std_Error_Pct']
            })
        
        return pd.DataFrame(rows)

    @staticmethod
    def _score_variable(tests: Dict) -> float:
        """Score one variable's test results on a 0-10 scale (8.5 raw points max)."""
        points = 0.0

        # KS test (weight: 2); partial credit for a near miss (p > 0.01)
        if tests['KS_Test']['Passed']:
            points += 2
        elif tests['KS_Test']['p_value'] > 0.01:
            points += 1

        # Mann-Whitney (weight: 1.5)
        if tests['Mann_Whitney']['Passed']:
            points += 1.5

        # Levene (weight: 1.5)
        if tests['Levene']['Passed']:
            points += 1.5

        # Mean error (weight: 2); a NaN error fails every comparison and earns nothing
        mean_err = tests['Descriptive']['Mean_Error_Pct']
        if mean_err < 1:
            points += 2
        elif mean_err < 5:
            points += 1.5
        elif mean_err < 10:
            points += 1

        # Std error (weight: 1.5)
        std_err = tests['Descriptive']['Std_Error_Pct']
        if std_err < 5:
            points += 1.5
        elif std_err < 10:
            points += 1

        # Normalize to 10 (2 + 1.5 + 1.5 + 2 + 1.5 = 8.5 raw points)
        return points / 8.5 * 10
    
    def calculate_overall_score(self) -> float:
        """
        Calculate overall validation score (0-10).

        The score is the mean of the per-variable scores. Returns NaN when
        there are no results yet (run_all_tests() not called or empty mapping).
        """
        if not self.results:
            return float('nan')

        scores = [self._score_variable(tests) for tests in self.results.values()]
        return np.mean(scores)

In [99]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# MAIN EXECUTION
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    # Example run: validate the synthetic Urals data set against the real one
    # and print the summary table, the overall score and the correlation check.
    print("=" * 80)
    print("GS QUANTITATIVE STRATEGIES - SYNTHETIC DATA VALIDATION SUITE")
    print("=" * 80)

    # --- Inputs ---------------------------------------------------------------
    real_data = pd.read_csv('Data_ML.csv')
    synthetic_data = pd.read_csv('urals_synthetic_data.csv')

    # 'Urals loading' is read as text with comma separators in the real file;
    # strip the commas so it can be cast to float.
    real_data['Urals loading'] = real_data['Urals loading'].str.replace(',', '').astype(float)

    # variable name -> (column in real data, column in synthetic data)
    column_mapping = {
        'URALS': ('Urals loading', 'Urals loading'),
        'BRENT': ('LCOc1', 'LCOc1'),
        'MOEX': ('.IMOEX', '.IMOEX'),
        'NWEMURL': ('NWEMURLCRKMc1', 'NWEMURLCRMc1')
    }

    # --- Per-variable tests -----------------------------------------------------
    validator = SyntheticDataValidator(real_data, synthetic_data, column_mapping)
    results = validator.run_all_tests()

    summary = validator.get_summary_table()
    print("\nSUMMARY TABLE:")
    print(summary.to_string(index=False))

    score = validator.calculate_overall_score()
    print(f"\nOVERALL VALIDATION SCORE: {score:.1f}/10")

    # --- Correlation structure ----------------------------------------------------
    # Column lists follow the mapping's insertion order, so real and synthetic
    # columns stay aligned with the variable names.
    real_cols = [real_col for real_col, _ in column_mapping.values()]
    synth_cols = [synth_col for _, synth_col in column_mapping.values()]

    corr_results = validate_correlation_structure(
        real_data[real_cols],
        synthetic_data[synth_cols],
        list(column_mapping)
    )

    print("\nCORRELATION PRESERVATION:")
    print(f"Mean Absolute Error: {corr_results['Mean_Absolute_Error']:.4f}")
    print(f"Max Absolute Error: {corr_results['Max_Absolute_Error']:.4f}")
    print(f"Pairs EXCELLENT: {corr_results['Pairs_Excellent']}/{len(corr_results['Pairwise_Comparison'])}")


GS QUANTITATIVE STRATEGIES - SYNTHETIC DATA VALIDATION SUITE

SUMMARY TABLE:
Variable KS_Test  KS_pvalue Mann_Whitney  MW_pvalue Levene  Levene_pvalue JB_Synth  Mean_Error_Pct  Std_Error_Pct
   URALS    PASS   0.466606         PASS   0.777607   PASS       0.127501     PASS        0.501464       8.912962
   BRENT    PASS   0.950188         PASS   0.975810   PASS       0.856867     PASS        0.601098       0.125665
    MOEX    PASS   0.066274         PASS   0.391148   PASS       0.748713     PASS        0.479987       0.597316
 NWEMURL    FAIL   0.048194         PASS   0.534538   PASS       0.074544     PASS       10.027315       2.529534

OVERALL VALIDATION SCORE: 9.0/10

CORRELATION PRESERVATION:
Mean Absolute Error: 0.0301
Max Absolute Error: 0.0612
Pairs EXCELLENT: 4/6


In [100]:
"""
═══════════════════════════════════════════════════════════════════════════════════════════════════════
VISUALIZATION MODULE
═══════════════════════════════════════════════════════════════════════════════════════════════════════

USAGE:
------
python GS_Synthetic_Validation_Charts.py

OUTPUT:
-------
6 publication-quality PNG charts at 300 DPI:
1. GS_Chart_1_Distributions.png     - Distribution overlay with KS test results
2. GS_Chart_2_QQ_Plots.png          - Q-Q plots for normality assessment
3. GS_Chart_3_Tests_Heatmap.png     - Statistical tests results matrix
4. GS_Chart_4_Moments.png           - Moments comparison (mean, std, skew, kurt)
5. GS_Chart_5_Correlation_Heatmap.png - Correlation structure preservation
6. GS_Chart_6_Tail_Risk.png         - VaR percentile comparison
"""

'\n═══════════════════════════════════════════════════════════════════════════════════════════════════════\nVISUALIZATION MODULE\n═══════════════════════════════════════════════════════════════════════════════════════════════════════\n\nUSAGE:\n------\npython GS_Synthetic_Validation_Charts.py\n\nOUTPUT:\n-------\n4 publication-quality PNG charts at 300 DPI:\n1. GS_Chart_1_Distributions.png     - Distribution overlay with KS test results\n2. GS_Chart_2_QQ_Plots.png          - Q-Q plots for normality assessment\n3. GS_Chart_3_Tests_Heatmap.png     - Statistical tests results matrix\n4. GS_Chart_4_Moments.png           - Moments comparison (mean, std, skew, kurt)\n5. GS_Chart_5_Correlation_Heatmap.png - Correlation structure preservation\n6. GS_Chart_6_Tail_Risk.png         - VaR percentile comparison\n'

In [101]:
from scipy.stats import gaussian_kde
import warnings
warnings.filterwarnings('ignore')

In [102]:
class GSChartingStandards:
    """
    Shared look-and-feel constants for the validation charts.

    Holds the colour palette, font sizes and export settings used by the
    chart functions, plus ``configure()`` which pushes matching defaults into
    seaborn and matplotlib.
    """

    # Semantic colour roles (hex). 'highlight' is the same blue as 'real'.
    COLORS = {
        'real': '#1f77b4',        # real data (blue)
        'synthetic': '#ff7f0e',   # synthetic data (orange)
        'pass': '#2ca02c',        # test passed (green)
        'fail': '#d62728',        # test failed (red)
        'caution': '#ff9800',     # caution zone (amber)
        'neutral': '#7f7f7f',     # neutral (gray)
        'highlight': '#1f77b4'    # highlight (blue)
    }

    # Font sizes in points, by text role.
    FONT_SIZE = dict(title=14, label=11, tick=10, legend=10, annotation=9)

    # Export settings
    FIGURE_DPI = 300  # Publication quality
    FIGURE_FORMAT = 'png'

    @staticmethod
    def configure():
        """Apply GS charting standards to matplotlib session"""
        sns.set_style("darkgrid")
        sns.set_palette("husl")

        sizes = GSChartingStandards.FONT_SIZE

        # Text: family, sizes and weights
        text_rc = {
            'font.family': 'sans-serif',
            'font.sans-serif': ['Arial', 'Helvetica'],
            'font.size': sizes['label'],
            'axes.labelsize': sizes['label'],
            'axes.titlesize': sizes['title'],
            'axes.labelweight': 'bold',
            'axes.titleweight': 'bold',
            'xtick.labelsize': sizes['tick'],
            'ytick.labelsize': sizes['tick'],
            'legend.fontsize': sizes['legend'],
        }

        # Strokes: axes frame, grid and line widths
        stroke_rc = {
            'axes.linewidth': 1.2,
            'axes.edgecolor': '#333333',
            'grid.linewidth': 0.8,
            'grid.alpha': 0.3,
            'lines.linewidth': 2.0,
            'patch.linewidth': 0.5,
        }

        # Surfaces and export defaults
        surface_rc = {
            'figure.facecolor': 'white',
            'axes.facecolor': '#f8f9fa',
            'savefig.facecolor': 'white',
            'savefig.edgecolor': 'white',
            'savefig.dpi': GSChartingStandards.FIGURE_DPI,
        }

        # Set matplotlib rcParams for consistency
        plt.rcParams.update({**text_rc, **stroke_rc, **surface_rc})


In [103]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# DATA LOADING & PREPARATION
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def load_data():
    """
    Read the real and synthetic data sets from CSV for the charting functions.

    The real file's 'Urals loading' column is read as text containing commas;
    the commas are stripped and the column is cast to float. The synthetic
    frame is returned as read.

    Returns:
    --------
    tuple : (real_data, synthetic_data) as pandas DataFrames
    """
    print("Loading data...")

    real_data = pd.read_csv('Data_ML.csv')
    synthetic_data = pd.read_csv('urals_synthetic_data.csv')

    # Strip the comma separators first, then convert the cleaned text to float.
    urals_without_commas = real_data['Urals loading'].str.replace(',', '')
    real_data['Urals loading'] = urals_without_commas.astype(float)

    print("✓ Data loaded successfully\n")
    return real_data, synthetic_data

In [104]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# CHART 1: DISTRIBUTION OVERLAYS WITH KS TEST
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def create_distribution_overlays(real_data, synthetic_data):
    """
    Draw Chart 1: real vs synthetic distributions on a 2x2 grid, one panel per variable.

    Each panel overlays density-normalised histograms and Gaussian KDE curves
    for the real and synthetic series, and its title shows the two-sample
    Kolmogorov-Smirnov p-value with a PASS/FAIL mark at the 0.05 level.
    The figure is saved as 'GS_Chart_1_Distributions.<format>' and closed.

    Parameters:
    -----------
    real_data : DataFrame with the real columns named in the mapping below
    synthetic_data : DataFrame with the synthetic columns named in the mapping below
    """

    print("Creating Chart 1: Distribution Overlays with KS Test...")

    palette = GSChartingStandards.COLORS
    font = GSChartingStandards.FONT_SIZE
    n_bins = 40

    # variable name -> (column in real data, column in synthetic data)
    columns_map = {
        'URALS': ('Urals loading', 'Urals loading'),
        'BRENT': ('LCOc1', 'LCOc1'),
        'MOEX': ('.IMOEX', '.IMOEX'),
        'NWEMURL': ('NWEMURLCRKMc1', 'NWEMURLCRMc1')
    }

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Distribution Comparison: Real vs Synthetic Data\nKolmogorov-Smirnov Test Results',
                 fontsize=font['title'], fontweight='bold', y=0.995)

    # axes.flat walks the grid row by row, matching the mapping's order.
    for panel, (var_name, (real_col, synth_col)) in zip(axes.flat, columns_map.items()):
        real_vals = real_data[real_col].dropna()
        synth_vals = synthetic_data[synth_col].dropna()

        # Kolmogorov-Smirnov test (only the p-value is displayed)
        _, ks_pval = stats.ks_2samp(real_vals, synth_vals)

        # Semi-transparent histograms: real first, synthetic on top
        for values, role, legend_label in (
            (real_vals, 'real', 'Real Data'),
            (synth_vals, 'synthetic', 'Synthetic Data'),
        ):
            panel.hist(values, bins=n_bins, alpha=0.5, color=palette[role],
                       label=legend_label, density=True, edgecolor='black', linewidth=0.5)

        # KDE curves evaluated on a grid spanning both samples
        kde_real = gaussian_kde(real_vals)
        kde_synth = gaussian_kde(synth_vals)
        lower = min(real_vals.min(), synth_vals.min())
        upper = max(real_vals.max(), synth_vals.max())
        x_range = np.linspace(lower, upper, 200)
        panel.plot(x_range, kde_real(x_range), color=palette['real'],
                   linewidth=2.5, label='Real KDE', linestyle='-')
        panel.plot(x_range, kde_synth(x_range), color=palette['synthetic'],
                   linewidth=2.5, label='Synthetic KDE', linestyle='--')

        # Title colour and badge reflect the KS outcome
        ks_ok = ks_pval > 0.05
        result_color = palette['pass'] if ks_ok else palette['fail']
        badge = '✓ PASS' if ks_ok else '✗ FAIL'
        result_text = f"KS p={ks_pval:.3f} {badge}"

        panel.set_title(f'{var_name}\n{result_text}',
                        fontsize=font['label'],
                        fontweight='bold', color=result_color)
        panel.set_xlabel('Value', fontsize=font['label'])
        panel.set_ylabel('Density', fontsize=font['label'])
        panel.legend(loc='upper right', fontsize=font['legend'])
        panel.grid(True, alpha=0.3, linestyle='--', linewidth=0.5)

    plt.tight_layout()
    filename = f'GS_Chart_1_Distributions.{GSChartingStandards.FIGURE_FORMAT}'
    plt.savefig(filename, dpi=GSChartingStandards.FIGURE_DPI, bbox_inches='tight')
    print(f"✓ Saved: {filename}\n")
    plt.close()

In [105]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# CHART 2: Q-Q PLOTS FOR NORMALITY
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def create_qq_plots(synthetic_data):
    """
    Render a 2x2 grid of normal Q-Q plots for the synthetic series and save it.

    Each panel plots one synthetic column against a normal distribution via
    ``stats.probplot`` and reports the Jarque-Bera p-value in its title. The
    title is coloured with the 'pass' colour when p > 0.05 and the 'caution'
    colour otherwise.

    Parameters
    ----------
    synthetic_data : mapping of column name -> series (e.g. a DataFrame)
        Must provide the columns 'Urals loading', 'LCOc1', '.IMOEX' and
        'NWEMURLCRMc1'. Missing values are dropped per column before plotting.

    Returns
    -------
    None
        The figure is written to ``GS_Chart_2_QQ_Plots.<FIGURE_FORMAT>`` in the
        current working directory and then closed.
    """

    print("Creating Chart 2: Q-Q Plots for Normality Assessment...")

    palette = GSChartingStandards.COLORS
    font_sizes = GSChartingStandards.FONT_SIZE

    # Display label -> column name in the synthetic data set.
    series_by_label = {
        'URALS': 'Urals loading',
        'BRENT': 'LCOc1',
        'MOEX': '.IMOEX',
        'NWEMURL': 'NWEMURLCRMc1'
    }

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Q-Q Plots: Assessment of Normality\nSynthetic Data Alignment with Normal Distribution',
                 fontsize=font_sizes['title'], fontweight='bold', y=0.995)

    # axes.flat walks the grid row by row, matching the dict order.
    for panel, (label, column) in zip(axes.flat, series_by_label.items()):
        sample = synthetic_data[column].dropna()

        # probplot draws two lines on the axes: the sample points, then the fit.
        stats.probplot(sample, dist="norm", plot=panel)
        drawn = panel.get_lines()
        points = drawn[0]
        fit_line = drawn[1]

        points.set_color(palette['synthetic'])
        points.set_markersize(6)
        points.set_alpha(0.7)
        fit_line.set_color(palette['real'])
        fit_line.set_linewidth(2.5)

        # Normality check on the same sample; only the p-value is displayed.
        _, jb_pval = stats.jarque_bera(sample)
        if jb_pval > 0.05:
            title_color = palette['pass']
        else:
            title_color = palette['caution']

        panel.set_title(f'{label} (Synthetic)\nJarque-Bera p={jb_pval:.4f}',
                        fontsize=font_sizes['label'],
                        fontweight='bold', color=title_color)
        panel.set_xlabel('Theoretical Quantiles', fontsize=font_sizes['label'])
        panel.set_ylabel('Sample Quantiles', fontsize=font_sizes['label'])
        panel.grid(True, alpha=0.3, linestyle='--', linewidth=0.5)

    plt.tight_layout()
    filename = f'GS_Chart_2_QQ_Plots.{GSChartingStandards.FIGURE_FORMAT}'
    plt.savefig(filename, dpi=GSChartingStandards.FIGURE_DPI, bbox_inches='tight')
    print(f"✓ Saved: {filename}\n")
    plt.close()

In [106]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# CHART 3: STATISTICAL TESTS HEATMAP
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def create_tests_heatmap(real_data, synthetic_data, alpha=0.05):
    """
    Create a pass/fail heatmap of statistical test results and save it.

    For each variable, four tests are run and reduced to a binary outcome
    (1 = p-value > ``alpha``, 0 otherwise):

    * KS Test       - two-sample Kolmogorov-Smirnov, real vs synthetic
    * Mann-Whitney  - two-sided Mann-Whitney U, real vs synthetic
    * Levene        - Levene's test, real vs synthetic
    * Jarque-Bera   - Jarque-Bera on the synthetic sample only

    Parameters
    ----------
    real_data, synthetic_data : DataFrame-like
        Indexed by column name. The real frame must contain 'Urals loading',
        'LCOc1', '.IMOEX' and 'NWEMURLCRKMc1'; the synthetic frame uses
        'NWEMURLCRMc1' for the last series. Missing values are dropped per
        column before testing.
    alpha : float, default 0.05
        Significance level. A test is marked PASS only when its p-value is
        strictly greater than ``alpha``.

    Returns
    -------
    None
        The figure is written to ``GS_Chart_3_Tests_Heatmap.<FIGURE_FORMAT>``
        in the current working directory and then closed.
    """

    print("Creating Chart 3: Statistical Tests Heatmap...")

    # Display label -> (column in real data, column in synthetic data).
    columns_map = {
        'URALS': ('Urals loading', 'Urals loading'),
        'BRENT': ('LCOc1', 'LCOc1'),
        'MOEX': ('.IMOEX', '.IMOEX'),
        'NWEMURL': ('NWEMURLCRKMc1', 'NWEMURLCRMc1')
    }

    # Compute all test results
    test_results = []
    for var_name, (real_col, synth_col) in columns_map.items():
        real_vals = real_data[real_col].dropna()
        synth_vals = synthetic_data[synth_col].dropna()

        _, ks_pval = stats.ks_2samp(real_vals, synth_vals)
        _, u_pval = stats.mannwhitneyu(real_vals, synth_vals, alternative='two-sided')
        _, lev_pval = stats.levene(real_vals, synth_vals)
        _, jb_pval = stats.jarque_bera(synth_vals)

        p_values = {
            'KS Test': ks_pval,
            'Mann-Whitney': u_pval,
            'Levene': lev_pval,
            'Jarque-Bera': jb_pval
        }

        # A NaN p-value compares False and is therefore recorded as a fail.
        row = {'Variable': var_name}
        for test_name, p_value in p_values.items():
            row[test_name] = 1 if p_value > alpha else 0
        test_results.append(row)

    test_df = pd.DataFrame(test_results).set_index('Variable')

    fig, ax = plt.subplots(figsize=(10, 6))

    # Fixed vmin/vmax keep the colours stable even if every cell has the same value.
    im = ax.imshow(test_df.values, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)

    # Set ticks and labels
    ax.set_xticks(np.arange(len(test_df.columns)))
    ax.set_yticks(np.arange(len(test_df.index)))
    ax.set_xticklabels(test_df.columns, fontsize=GSChartingStandards.FONT_SIZE['label'])
    ax.set_yticklabels(test_df.index, fontsize=GSChartingStandards.FONT_SIZE['label'])

    # Rotate x labels
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Mark each cell with a tick or a cross.
    for i in range(len(test_df.index)):
        for j in range(len(test_df.columns)):
            ax.text(j, i, '✓' if test_df.values[i, j] == 1 else '✗',
                    ha="center", va="center", color="black",
                    fontsize=14, fontweight='bold')

    # The legend text mirrors the actual decision rule (p == alpha is a fail).
    ax.set_title(f'Statistical Tests Results Matrix\nGreen=PASS (p>{alpha}), Red=FAIL (p<={alpha})',
                 fontsize=GSChartingStandards.FONT_SIZE['title'],
                 fontweight='bold', pad=20)

    # The matrix is binary, so only the two end points of the colorbar are meaningful.
    cbar = plt.colorbar(im, ax=ax, orientation='vertical', pad=0.02)
    cbar.set_ticks([0, 1])
    cbar.set_ticklabels(['FAIL', 'PASS'])
    cbar.set_label('Test Result', fontsize=GSChartingStandards.FONT_SIZE['label'])

    plt.tight_layout()
    filename = f'GS_Chart_3_Tests_Heatmap.{GSChartingStandards.FIGURE_FORMAT}'
    plt.savefig(filename, dpi=GSChartingStandards.FIGURE_DPI, bbox_inches='tight')
    print(f"✓ Saved: {filename}\n")
    plt.close()

In [107]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# CHART 4: MOMENTS COMPARISON
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def create_moments_comparison(real_data, synthetic_data):
    """
    Compare statistical moments of real and synthetic data as grouped bars.

    One panel is drawn per moment (Mean, Std Dev, Skewness, Kurtosis); within a
    panel each variable gets a real/synthetic bar pair labelled with its value.
    Std Dev uses the series' own ``.std()``; skewness and kurtosis use
    ``stats.skew`` and ``stats.kurtosis`` with their default arguments
    (NOTE(review): the scipy default for kurtosis is the excess/Fisher
    definition - confirm that is what the 'Kurtosis' label should mean).

    Parameters
    ----------
    real_data, synthetic_data : DataFrame-like
        Indexed by column name. The real frame must contain 'Urals loading',
        'LCOc1', '.IMOEX' and 'NWEMURLCRKMc1'; the synthetic frame uses
        'NWEMURLCRMc1' for the last series. Missing values are dropped per
        column before the moments are computed.

    Returns
    -------
    None
        The figure is written to ``GS_Chart_4_Moments.<FIGURE_FORMAT>`` in the
        current working directory and then closed.
    """

    print("Creating Chart 4: Moments Comparison...")

    # Display label -> (column in real data, column in synthetic data).
    columns_map = {
        'URALS': ('Urals loading', 'Urals loading'),
        'BRENT': ('LCOc1', 'LCOc1'),
        'MOEX': ('.IMOEX', '.IMOEX'),
        'NWEMURL': ('NWEMURLCRKMc1', 'NWEMURLCRMc1')
    }

    # Clean every series once instead of once per moment panel.
    var_names = list(columns_map)
    real_series = [real_data[real_col].dropna() for real_col, _ in columns_map.values()]
    synth_series = [synthetic_data[synth_col].dropna() for _, synth_col in columns_map.values()]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Statistical Moments Comparison: Real vs Synthetic\nMean, Std Dev, Skewness, Kurtosis',
                 fontsize=GSChartingStandards.FONT_SIZE['title'], fontweight='bold', y=0.995)

    moments_list = ['Mean', 'Std Dev', 'Skewness', 'Kurtosis']
    moment_funcs = [
        lambda x: x.mean(),
        lambda x: x.std(),
        lambda x: stats.skew(x),
        lambda x: stats.kurtosis(x)
    ]

    x = np.arange(len(var_names))
    width = 0.35

    for idx, (moment_name, moment_func) in enumerate(zip(moments_list, moment_funcs)):
        ax = axes[idx // 2, idx % 2]

        real_moments = [moment_func(series) for series in real_series]
        synth_moments = [moment_func(series) for series in synth_series]

        # Bar plot
        bars1 = ax.bar(x - width/2, real_moments, width, label='Real',
                       color=GSChartingStandards.COLORS['real'],
                       alpha=0.8, edgecolor='black', linewidth=1)
        bars2 = ax.bar(x + width/2, synth_moments, width, label='Synthetic',
                       color=GSChartingStandards.COLORS['synthetic'],
                       alpha=0.8, edgecolor='black', linewidth=1)

        # Add value labels on bars. Negative bars extend below zero, so their
        # label hangs under the bar end instead of being drawn inside the bar.
        for bars in [bars1, bars2]:
            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height,
                        f'{height:.2f}',
                        ha='center', va='bottom' if height >= 0 else 'top',
                        fontsize=GSChartingStandards.FONT_SIZE['annotation'])

        ax.set_xlabel('Variable', fontsize=GSChartingStandards.FONT_SIZE['label'])
        ax.set_ylabel(moment_name, fontsize=GSChartingStandards.FONT_SIZE['label'])
        ax.set_title(moment_name, fontsize=GSChartingStandards.FONT_SIZE['label'], fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(var_names)
        ax.legend(fontsize=GSChartingStandards.FONT_SIZE['legend'])
        ax.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)

    plt.tight_layout()
    filename = f'GS_Chart_4_Moments.{GSChartingStandards.FIGURE_FORMAT}'
    plt.savefig(filename, dpi=GSChartingStandards.FIGURE_DPI, bbox_inches='tight')
    print(f"✓ Saved: {filename}\n")
    plt.close()

In [108]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# CHART 5: CORRELATION STRUCTURE HEATMAP
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def create_correlation_heatmap(real_data, synthetic_data):
    """
    Plot real and synthetic correlation matrices side by side with their error.

    Three annotated heatmaps are drawn in one row: the correlation matrix of
    the real data, the correlation matrix of the synthetic data, and the
    element-wise absolute difference between the two. Correlations come from
    ``DataFrame.corr()`` with its default settings.

    Parameters
    ----------
    real_data : DataFrame
        Must contain 'Urals loading', 'LCOc1', '.IMOEX' and 'NWEMURLCRKMc1'.
    synthetic_data : DataFrame
        Must contain 'Urals loading', 'LCOc1', '.IMOEX' and 'NWEMURLCRMc1'.

    Returns
    -------
    None
        The figure is written to
        ``GS_Chart_5_Correlation_Heatmap.<FIGURE_FORMAT>`` in the current
        working directory and then closed.
    """

    print("Creating Chart 5: Correlation Structure Heatmap...")

    label_font = GSChartingStandards.FONT_SIZE['label']
    display_names = ['URALS', 'BRENT', 'MOEX', 'NWEMURL']

    real_corr = real_data[['Urals loading', 'LCOc1', '.IMOEX', 'NWEMURLCRKMc1']].corr()
    synth_corr = synthetic_data[['Urals loading', 'LCOc1', '.IMOEX', 'NWEMURLCRMc1']].corr()

    # Give both matrices the same short labels so they align for subtraction.
    for matrix in (real_corr, synth_corr):
        matrix.index = display_names
        matrix.columns = display_names

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    fig.suptitle('Correlation Structure: Real vs Synthetic with Error Matrix',
                 fontsize=GSChartingStandards.FONT_SIZE['title'], fontweight='bold', y=0.98)

    # Panels 1 and 2 share the same diverging colour scale on [-1, 1].
    panels = [
        (real_corr, axes[0], 'Real Data'),
        (synth_corr, axes[1], 'Synthetic Data'),
    ]
    for matrix, panel, panel_title in panels:
        sns.heatmap(matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0,
                    vmin=-1, vmax=1, ax=panel, cbar_kws={'label': 'Correlation'})
        panel.set_title(panel_title, fontsize=label_font, fontweight='bold')

    # Panel 3: absolute error; the colour scale saturates at 0.1.
    corr_error = real_corr.sub(synth_corr).abs()
    error_panel = axes[2]
    sns.heatmap(corr_error, annot=True, fmt='.4f', cmap='Reds',
                vmin=0, vmax=0.1, ax=error_panel, cbar_kws={'label': 'Absolute Error'})
    error_panel.set_title('Correlation Error |Real - Synthetic|',
                          fontsize=label_font, fontweight='bold')

    plt.tight_layout()
    filename = f'GS_Chart_5_Correlation_Heatmap.{GSChartingStandards.FIGURE_FORMAT}'
    plt.savefig(filename, dpi=GSChartingStandards.FIGURE_DPI, bbox_inches='tight')
    print(f"✓ Saved: {filename}\n")
    plt.close()

In [109]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# CHART 6: TAIL RISK ANALYSIS
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def create_tail_risk_analysis(real_data, synthetic_data):
    """
    Compare tail quantiles (1%, 5%, 95%, 99%) of real and synthetic data.

    One panel is drawn per variable (URALS, BRENT and MOEX only). Each panel
    shows real/synthetic bar pairs for the four quantiles, and above every pair
    the relative error ``|real - synthetic| / |real|`` as a percentage.

    Parameters
    ----------
    real_data, synthetic_data : DataFrame-like
        Indexed by column name; both must contain 'Urals loading', 'LCOc1' and
        '.IMOEX'. Missing values are dropped per column before the quantiles
        are computed.

    Returns
    -------
    None
        The figure is written to ``GS_Chart_6_Tail_Risk.<FIGURE_FORMAT>`` in
        the current working directory and then closed.
    """

    print("Creating Chart 6: Tail Risk Analysis (VaR)...")

    # Display label -> (column in real data, column in synthetic data).
    columns_map = {
        'URALS': ('Urals loading', 'Urals loading'),
        'BRENT': ('LCOc1', 'LCOc1'),
        'MOEX': ('.IMOEX', '.IMOEX')
    }

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle('Tail Risk Analysis: VaR Percentile Comparison\n1%, 5%, 95%, 99% Quantiles',
                 fontsize=GSChartingStandards.FONT_SIZE['title'], fontweight='bold', y=0.98)

    percentiles = [0.01, 0.05, 0.95, 0.99]
    percentile_labels = ['1% (99% VaR)', '5% (95% VaR)', '95%', '99%']

    x = np.arange(len(percentile_labels))
    width = 0.35

    for idx, (var_name, (real_col, synth_col)) in enumerate(columns_map.items()):
        ax = axes[idx]

        real_vals = real_data[real_col].dropna()
        synth_vals = synthetic_data[synth_col].dropna()

        real_quantiles = [real_vals.quantile(p) for p in percentiles]
        synth_quantiles = [synth_vals.quantile(p) for p in percentiles]

        # Bar plot
        ax.bar(x - width/2, real_quantiles, width, label='Real',
               color=GSChartingStandards.COLORS['real'],
               alpha=0.8, edgecolor='black', linewidth=1)
        ax.bar(x + width/2, synth_quantiles, width, label='Synthetic',
               color=GSChartingStandards.COLORS['synthetic'],
               alpha=0.8, edgecolor='black', linewidth=1)

        # Add error percentages
        for i, (r, s) in enumerate(zip(real_quantiles, synth_quantiles)):
            if r != 0:
                error_label = f'{abs(r - s) / abs(r) * 100:.1f}%'
            elif s == 0:
                # Both quantiles are zero: the synthetic value matches exactly.
                error_label = f'{0:.1f}%'
            else:
                # Relative error is undefined against a zero reference; do not
                # report it as a perfect match.
                error_label = 'n/a'

            # Anchor on the highest point of the bar pair. Bars start at zero,
            # so for two negative quantiles that point is the baseline. A fixed
            # offset in points keeps the gap constant regardless of sign/scale.
            ax.annotate(error_label, xy=(i, max(r, s, 0)), xytext=(0, 3),
                        textcoords='offset points',
                        ha='center', va='bottom',
                        fontsize=GSChartingStandards.FONT_SIZE['annotation'],
                        fontweight='bold', color='darkred')

        ax.set_xlabel('Percentile', fontsize=GSChartingStandards.FONT_SIZE['label'])
        ax.set_ylabel(f'{var_name} Value', fontsize=GSChartingStandards.FONT_SIZE['label'])
        ax.set_title(f'{var_name}', fontsize=GSChartingStandards.FONT_SIZE['label'], fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(percentile_labels, fontsize=GSChartingStandards.FONT_SIZE['tick'])
        ax.legend(fontsize=GSChartingStandards.FONT_SIZE['legend'])
        ax.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)

    plt.tight_layout()
    filename = f'GS_Chart_6_Tail_Risk.{GSChartingStandards.FIGURE_FORMAT}'
    plt.savefig(filename, dpi=GSChartingStandards.FIGURE_DPI, bbox_inches='tight')
    print(f"✓ Saved: {filename}\n")
    plt.close()

In [110]:
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════
# MAIN EXECUTION
# ═══════════════════════════════════════════════════════════════════════════════════════════════════════

def main():
    """
    Execute the full visualization suite.

    Configures the charting standards, loads the real and synthetic data sets
    via ``load_data()``, renders the six validation charts in order, and prints
    a summary of the files written. The summary is derived from
    ``GSChartingStandards.FIGURE_FORMAT`` so that it always names the same
    files the chart functions saved.
    """

    print("\n" + "="*80)
    print("QUANTITATIVE STRATEGIES - SYNTHETIC DATA VALIDATION CHARTS")
    print("="*80 + "\n")

    # Configure charting standards
    GSChartingStandards.configure()

    # Load data
    real_data, synthetic_data = load_data()

    # Create all charts
    create_distribution_overlays(real_data, synthetic_data)
    create_qq_plots(synthetic_data)
    create_tests_heatmap(real_data, synthetic_data)
    create_moments_comparison(real_data, synthetic_data)
    create_correlation_heatmap(real_data, synthetic_data)
    create_tail_risk_analysis(real_data, synthetic_data)

    # File stems in creation order; the extension follows the configured
    # figure format instead of being hard-coded.
    figure_format = GSChartingStandards.FIGURE_FORMAT
    chart_stems = [
        'GS_Chart_1_Distributions',
        'GS_Chart_2_QQ_Plots',
        'GS_Chart_3_Tests_Heatmap',
        'GS_Chart_4_Moments',
        'GS_Chart_5_Correlation_Heatmap',
        'GS_Chart_6_Tail_Risk',
    ]

    print("="*80)
    print(f"SUCCESS: All {len(chart_stems)} professional charts created")
    print("="*80)
    print("\nOutput files:")
    for number, stem in enumerate(chart_stems, start=1):
        print(f"  {number}. {stem}.{figure_format}")
    print(f"\nFormat: {str(figure_format).upper()} at {GSChartingStandards.FIGURE_DPI} DPI (publication quality)")


# Entry point: run the full chart suite only when this code is executed as the
# main module (the recorded cell output shows this also fires in the notebook),
# not when it is imported.
if __name__ == "__main__":
    main()



QUANTITATIVE STRATEGIES - SYNTHETIC DATA VALIDATION CHARTS

Loading data...
✓ Data loaded successfully

Creating Chart 1: Distribution Overlays with KS Test...
✓ Saved: GS_Chart_1_Distributions.png

Creating Chart 2: Q-Q Plots for Normality Assessment...
✓ Saved: GS_Chart_2_QQ_Plots.png

Creating Chart 3: Statistical Tests Heatmap...
✓ Saved: GS_Chart_3_Tests_Heatmap.png

Creating Chart 4: Moments Comparison...
✓ Saved: GS_Chart_4_Moments.png

Creating Chart 5: Correlation Structure Heatmap...
✓ Saved: GS_Chart_5_Correlation_Heatmap.png

Creating Chart 6: Tail Risk Analysis (VaR)...
✓ Saved: GS_Chart_6_Tail_Risk.png

SUCCESS: All 6 professional charts created

Output files:
  1. GS_Chart_1_Distributions.png
  2. GS_Chart_2_QQ_Plots.png
  3. GS_Chart_3_Tests_Heatmap.png
  4. GS_Chart_4_Moments.png
  5. GS_Chart_5_Correlation_Heatmap.png
  6. GS_Chart_6_Tail_Risk.png

Format: PNG at 300 DPI (publication quality)
