# Prepare the chemical data

In [2092]:
# magic commands to make sure external modules are reloaded every complete run
%load_ext autoreload
%autoreload 2

import pandas as pd
from zci.data_process.dataframe_ops import get_block
from sklearn.preprocessing import StandardScaler

# read the merged dataframe
master = pd.read_excel("../data/processed/complete_env_taxa_chemical.xlsx", 
                      sheet_name="all_data_merged", 
                      header=[0, 1, 2], 
                      index_col=0)

master.head()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


block,sample_info,sample_info,sample_info,sample_info,environmental,environmental,environmental,environmental,environmental,environmental,...,taxa,taxa,taxa,taxa,taxa,taxa,taxa,2008_results,2008_results,2008_results
subblock,raw,raw,raw,raw,raw,raw,raw,raw,raw,raw,...,raw,raw,raw,raw,raw,raw,raw,DR_clusters,DR_clusters,corridor_clusters
var,Latitude,Longitude,Waterbody,Year,LOI (%),MPS (Phi),Measured Depth (m),Temperature (oC),Velocity at bottom (m/sec),Water DO Bottom (mg/L),...,Hydropsychidae,Hydrozoa,Nematoda,Oligochaeta,Other Trichoptera,Sphaeriidae,Turbellaria,DR_cluster,if_RF,corridor_cluster
StationID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
A10,42.90455,-82.4676,SCR,2004,3.436193,1.734259,1.8,19.17,,10.21,...,3.203427e-16,3.203427e-16,1.727234,6.576942,3.203427e-16,3.203427e-16,3.203427e-16,0,0,1
A23,42.56658,-82.57595,LSC,2004,3.226637,1.969984,3.0,19.1,,10.09,...,3.203427e-16,3.203427e-16,2.427993,5.872905,3.203427e-16,1.022139,0.5997595,0,0,1
A27,42.56007,-82.42132,LSC,2004,2.802642,1.319519,1.0,18.4,,10.3,...,3.203427e-16,3.203427e-16,1.802219,6.40228,1.580169,1.317615,0.9964067,0,0,1
A28,42.54577,-82.42073,LSC,2004,3.201399,1.398687,0.5,18.9,,12.8,...,3.203427e-16,3.203427e-16,2.7718,5.867874,2.049287,2.863547,3.203427e-16,0,0,1
A29,42.5144,-82.43462,LSC,2004,6.180718,1.065748,0.5,19.3,,9.7,...,3.203427e-16,3.203427e-16,4.616441,5.181664,3.203427e-16,3.203427e-16,3.203427e-16,0,0,1


In [2093]:
# transform the stressor features: standardization (z-score)
scaler = StandardScaler()
stressor_standardized = pd.DataFrame(scaler.fit_transform(stressor), 
                       columns=stressor.columns, 
                       index=stressor.index)
stressor_standardized.head()


var,%OC,1234-TCB,1245-TCB,Al,As,Bi,Ca,Cd,Co,Cr,...,OCS,Pb,QCB,Sb,V,Zn,mirex,"p,p'-DDD","p,p'-DDE",total PCB
StationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10,0.891094,2.27375,-0.109891,-0.534484,-0.024307,0.102204,-0.243792,-0.228167,-0.816185,-0.283098,...,-0.304576,-0.130198,-0.372118,0.680575,-0.405459,-0.160339,-0.481049,-0.142024,-0.279366,-0.273062
A23,0.739427,1.524231,-0.148941,-0.212959,0.406751,-0.039232,0.890165,0.046278,-0.142737,-0.109473,...,-0.391815,-0.197983,-0.03025,-0.67058,-0.215132,-0.105197,-0.028991,-0.106894,-0.051642,-0.138919
A27,0.432558,0.803277,-0.090275,1.824332,0.592565,-1.730117,0.849493,-0.432147,1.042867,0.742822,...,-0.251922,-0.191915,-0.37679,-1.342046,1.429484,0.253704,0.718048,0.035209,-0.045607,-0.118412
A28,0.721161,-0.068323,-0.25539,1.630346,0.479723,-1.728875,0.171884,-0.692553,0.807736,0.544536,...,-0.347174,-0.210492,-0.329924,-1.342046,1.107393,0.167874,-0.481049,0.040881,0.091281,-0.284692
A29,2.87746,0.220356,-0.14989,4.080804,1.326794,-1.720237,0.755132,-0.602748,2.794566,2.683189,...,-0.13556,-0.003297,-0.266558,-1.33308,3.291268,1.201183,1.819931,-0.162225,-0.003537,-0.165168


# Train a weight-driven PCA model and evaluate its performance

In [2094]:
from zci.sediment_pollution_assessment.weighted_pca import WeightedPCA_Scores
from zci.sediment_pollution_assessment.chemical_weights import build_weights_for_columns, VARIABLE_TYPE_BY_NAME # function to build weight mapping for variables

# take the stressor data block
stressor = get_block(master, block="chemical", subblock= "raw")

# define weights for different variable types
chem_cols = stressor.columns.tolist()

# specify custom weights for certain types (others will use defaults)
high_weight = 1

# Build the final weight map for the variables for later use in weighted PCA
custom_weight_map = build_weights_for_columns(chem_cols,
                                              weights_by_name={ "As": high_weight,
                                                               } # set a subtle weight for Cd specifically
                                             )

# initialize a weighted PCA grader with the custom weights
weighted_PCA_grader = WeightedPCA_Scores(
    custom_weight_map,
    weight_threshold=high_weight,
    group_thresholds=(0.2, 0.8)
)

# compute the weighted PCA scores
scores_with_labels = weighted_PCA_grader.fit_transform(stressor_standardized)
scores_with_labels.head()


=== Selected Principal Components ===
PC       Explained Var   High-Weighted Variable Loadings
--------------------------------------------------------------------------------
PC1      0.3494          %OC: 0.245, 1234-TCB: 0.047, 1245-TCB: 0.058, Al: 0.245, As: 0.151, Bi: -0.139, Ca: 0.140, Cd: 0.156, Co: 0.262, Cr: 0.262, Cu: 0.219, Fe: 0.256, HCB: 0.012, Heptachlor Epoxide: -0.043, Hg: 0.117, K: 0.243, Mg: 0.145, Mn: 0.254, Na: 0.221, Ni: 0.300, OCS: 0.046, Pb: 0.167, QCB: 0.032, Sb: -0.184, V: 0.255, Zn: 0.243, mirex: 0.072, p,p'-DDD: 0.103, p,p'-DDE: 0.120, total PCB: 0.141
PC2      0.1657          %OC: -0.101, 1234-TCB: 0.001, 1245-TCB: -0.122, Al: -0.172, As: -0.048, Bi: -0.089, Ca: -0.199, Cd: 0.193, Co: -0.156, Cr: 0.186, Cu: 0.224, Fe: -0.123, HCB: -0.072, Heptachlor Epoxide: 0.075, Hg: 0.247, K: -0.190, Mg: -0.212, Mn: -0.169, Na: -0.174, Ni: 0.019, OCS: -0.132, Pb: 0.328, QCB: -0.113, Sb: 0.000, V: -0.160, Zn: 0.169, mirex: 0.283, p,p'-DDD: 0.325, p,p'-DDE: 0.226, total PCB

Unnamed: 0_level_0,pollution_score,pollution_quality
StationID,Unnamed: 1_level_1,Unnamed: 2_level_1
A10,-5.244765,medium
A23,0.812907,medium
A27,1.880309,medium
A28,-2.874817,medium
A29,12.653827,degraded


In [2095]:
# Get the loadings and sort them by variable type categories
loadings = weighted_PCA_grader.pca_results.loadings

# Define the sorting order based on the variable type hierarchy
variable_type_order = {
    "Trace Metal (pollutant)": 1,
    "Hydrocarbon pollutant": 2, 
    "organochlorine pesticide": 3,
    "Sum of all PCBs": 4,
    "Binding agent": 5,
    "Earth element (nontoxic)": 6
}

# Create a sorting key based on variable type and then alphabetically within type
def get_sort_key(variable_name):
    var_type = VARIABLE_TYPE_BY_NAME.get(variable_name, "Unknown")
    type_priority = variable_type_order.get(var_type, 99)
    return (type_priority, variable_name)

# Sort the loadings index based on the sorting key
sorted_variables = sorted(loadings.index, key=get_sort_key)

# Reindex the loadings DataFrame with sorted variables
loadings_sorted = loadings.reindex(sorted_variables)

# Display the sorted loadings
print("Sorted Loadings by Variable Type:")
loadings_sorted

Sorted Loadings by Variable Type:


Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30
var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
As,0.150545,-0.047617,-0.147616,-0.105092,0.069276,-0.316993,-0.11287,0.539898,0.176796,0.5624,...,-0.077502,-0.020531,-0.189688,0.004889,-0.098492,0.011943,0.06596,-0.007532,-0.020825,-0.026935
Bi,-0.139147,-0.089034,0.182475,0.422443,-0.104737,0.13512,0.185862,-0.150193,0.056757,0.305297,...,-0.365898,0.077416,0.011447,0.137035,-0.003435,0.007387,0.093467,0.048037,0.011099,-0.011012
Cd,0.155737,0.193026,0.215688,0.262644,-0.038244,0.111832,-0.023718,0.046718,0.211121,0.06207,...,0.031262,0.101244,0.050621,-0.280087,-0.281635,-0.168824,-0.107697,0.034775,0.179774,0.02113
Co,0.262189,-0.156191,-0.07824,0.000316,0.124677,0.005837,0.153451,-0.113286,0.116396,0.136257,...,0.1932,-0.361942,0.007972,-0.114549,0.412573,-0.130974,-0.289032,-0.347706,0.147559,-0.217377
Cr,0.261748,0.186086,-0.022927,-0.039987,-0.135828,0.075233,0.096965,-0.102741,0.075945,-0.012217,...,-0.055315,0.3872,-0.206757,0.559289,-0.06657,0.002255,-0.0285,-0.350758,-0.057317,-0.341376
Cu,0.218705,0.224452,-0.014441,-0.097329,-0.293162,0.007928,0.076101,-0.002939,-0.082627,-0.015925,...,-0.089741,0.198148,-0.041821,-0.232724,0.156738,-0.299456,-0.237665,0.442098,0.323365,-0.264326
Hg,0.116713,0.246796,0.241324,0.181535,0.227977,-0.159047,-0.074208,-0.055306,0.141864,0.178216,...,0.208922,-0.006563,0.090709,0.272115,0.057932,-0.062778,0.087712,0.066531,0.035797,-0.009789
Mn,0.253713,-0.169164,-0.060606,0.10323,-0.03863,-0.018274,-0.067225,0.142697,-0.244294,0.083193,...,0.55533,0.321963,0.170227,-0.047035,-0.027702,-0.013058,-0.025893,-0.09658,0.008926,0.033962
Ni,0.300317,0.018704,-0.039707,-0.007893,-0.047804,0.05149,0.106761,-0.081416,0.084697,0.024619,...,0.037852,-0.035163,-0.239592,0.218552,0.161412,-0.017345,-0.36494,0.32225,-0.38843,0.571001
Pb,0.167189,0.327962,0.0661,-0.058357,-0.1825,-0.021307,0.005944,0.030418,-0.150445,0.087041,...,-0.316145,-0.257231,0.091145,-0.160528,-0.238563,-0.046549,-0.164611,-0.365339,-0.091091,0.10125


## Completeness Check of the weight-driven PCA results

In [2096]:
from zci.sediment_pollution_assessment.ordination_metrices import evaluate_weighted_pca_representativeness
from sklearn.decomposition import PCA

representative_results = evaluate_weighted_pca_representativeness(weighted_PCA_grader)
print(f"Similarity set: {representative_results['similarity_set']}")
print(f"Representativeness: {representative_results['representativeness']}")

Similarity set: [np.float64(1.0), np.float64(1.0), np.float64(0.9999999999999998), np.float64(1.0), np.float64(0.9999999999999999)]
Representativeness: 1.0


## Discriminatory Power Check of the weight-driven PCA results

In [2097]:
from zci.sediment_pollution_assessment.ordination_metrices import evaluate_weighted_pca_discrimination

discrimination_metrics_results = evaluate_weighted_pca_discrimination(weighted_PCA_grader)
discrimination_metrics_results

{'t_test_p_values_hw': [{'variable': '%OC',
   'D_mean': np.float64(0.0),
   't_statistic': np.float64(0.0),
   'p_value': np.float64(0.5),
   'significant_at_0.01': np.False_},
  {'variable': '1234-TCB',
   'D_mean': np.float64(0.0),
   't_statistic': np.float64(0.0),
   'p_value': np.float64(0.5),
   'significant_at_0.01': np.False_},
  {'variable': '1245-TCB',
   'D_mean': np.float64(0.0),
   't_statistic': np.float64(0.0),
   'p_value': np.float64(0.5),
   'significant_at_0.01': np.False_},
  {'variable': 'Al',
   'D_mean': np.float64(0.0),
   't_statistic': np.float64(0.0),
   'p_value': np.float64(0.5),
   'significant_at_0.01': np.False_},
  {'variable': 'As',
   'D_mean': np.float64(0.0),
   't_statistic': np.float64(0.0),
   'p_value': np.float64(0.5),
   'significant_at_0.01': np.False_},
  {'variable': 'Bi',
   'D_mean': np.float64(0.0),
   't_statistic': np.float64(0.0),
   'p_value': np.float64(0.5),
   'significant_at_0.01': np.False_},
  {'variable': 'Ca',
   'D_mean': n

In [2098]:
from zci.sediment_pollution_assessment.ordination_metrices import groupby_aggregation
groupby_aggregation(stressor, scores_with_labels['pollution_quality'], custom_weight_map,
                    weight_threshold=high_weight).loc['As':, ]

Unnamed: 0_level_0,quality_label,reference,medium,degraded
var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
As,mean,2.048671,1.796037,2.411429
As,std,1.434551,1.343758,1.148135
Bi,mean,15.282545,17.875748,18.23114
Bi,std,6.764816,9.563528,13.903441
Ca,mean,19259.238095,31487.951613,42127.142857
Ca,std,9683.507861,9713.526034,11356.148177
Cd,mean,0.093096,0.345793,0.773495
Cd,std,0.105745,0.152814,0.637142
Co,mean,3.178143,4.025887,6.139857
Co,std,1.098406,1.607576,2.183136


## Collect all the metrics together

In [2099]:
import numpy as np

# Extract metrics from results
wd_pcs_similarity = representative_results['similarity_set']
wd_hw_var_t_test = discrimination_metrics_results['t_test_p_values_hw']
wd_lw_var_t_test = discrimination_metrics_results['t_test_p_values_lw']

# Calculate derived metrics
# ratio_wd_lw_var_t_test_qualified = sum(var_tests['p_value'] >= 0.1 for var_tests in wd_lw_var_t_test)/len(wd_lw_var_t_test)
ratio_wd_hw_var_t_test_qualified = sum(var_tests['p_value'] < 0.1 for var_tests in wd_hw_var_t_test)/len(wd_hw_var_t_test)

# Extract PERMANOVA results (direct keys)
wd_permanova_pseudo_f = discrimination_metrics_results['permanova_pseudo_f']
wd_permanova_p_value = discrimination_metrics_results['permanova_p_value']

# Create a comprehensive metrics summary table
metrics_summary = pd.DataFrame({
    'Metric Category': [
        'Repre',
        'Repre',
        'Repre',
        'Discr - HW Variables',
        'Discr - HW Variables',
        'Discr - LW Variables', 
        'Discr - LW Variables',
        'PERMANOVA Test',
        'PERMANOVA Test'
    ],
    'Metric Name': [
        'PC Similarity (mean)',
        'PC Similarity (std)',
        'Repre Ratio',
        'HW Vars Significant (< 0.1)',
        'HW Vars Significant (%)',
        'LW Vars Non-Significant (≥ 0.1)',
        'LW Vars Non-Significant (%)',
        'Pseudo-F Statistic',
        'P-value (one-directional)'
    ],
    'Value': [
        f"{np.mean(np.abs(wd_pcs_similarity)):.4f}",
        f"{np.std(np.abs(wd_pcs_similarity)):.4f}",
        f"{representative_results['representativeness']:.4f}",
        f"{sum(var_tests['p_value'] < 0.1 for var_tests in wd_hw_var_t_test)}/{len(wd_hw_var_t_test)}",
        f"{ratio_wd_hw_var_t_test_qualified:.2%}",
        f"{sum(var_tests['p_value'] >= 0.1 for var_tests in wd_lw_var_t_test)}/{len(wd_lw_var_t_test)}",
        f"{ratio_wd_lw_var_t_test_qualified:.2%}",
        f"{wd_permanova_pseudo_f:.4f}",
        f"{wd_permanova_p_value:.4f}"
    ],
    'Interpretation': [
        'Higher = better similarity to baseline PCA',
        'Lower = more consistent similarity',
        'Higher = weighted PCA captures more variance',
        'Higher = better discrimination of high-weight vars',
        'Target: > 80% for good discrimination',
        'Higher = low-weight vars appropriately non-significant',
        'Target: > 70% for appropriate weighting',
        'Higher = better group separation',
        'Lower = more significant group differences'
    ]
})

print("=== WEIGHTED PCA EVALUATION METRICS SUMMARY ===")
print(metrics_summary.to_string(index=False))

=== WEIGHTED PCA EVALUATION METRICS SUMMARY ===
     Metric Category                     Metric Name  Value                                         Interpretation
               Repre            PC Similarity (mean) 1.0000             Higher = better similarity to baseline PCA
               Repre             PC Similarity (std) 0.0000                     Lower = more consistent similarity
               Repre                     Repre Ratio 1.0000           Higher = weighted PCA captures more variance
Discr - HW Variables     HW Vars Significant (< 0.1)   0/30     Higher = better discrimination of high-weight vars
Discr - HW Variables         HW Vars Significant (%)  0.00%                  Target: > 80% for good discrimination
Discr - LW Variables LW Vars Non-Significant (≥ 0.1)    0/0 Higher = low-weight vars appropriately non-significant
Discr - LW Variables     LW Vars Non-Significant (%) 51.72%                Target: > 70% for appropriate weighting
      PERMANOVA Test            

In [2100]:
# Generate LaTeX table format
def generate_latex_table(df, caption="Weighted PCA Evaluation Metrics", label="tab:weighted_pca_metrics"):
    """Generate a properly formatted LaTeX table"""
    
    latex_table = f"""
\\begin{{table}}[htbp]
\\centering
\\caption{{{caption}}}
\\label{{{label}}}
\\begin{{tabular}}{{p{{3cm}} p{{4cm}} p{{2cm}} p{{6cm}}}}
\\toprule
\\textbf{{Category}} & \\textbf{{Metric}} & \\textbf{{Value}} & \\textbf{{Interpretation}} \\\\
\\midrule
"""
    
    for _, row in df.iterrows():
        category = row['Metric Category'].replace('&', '\\&')
        metric = row['Metric Name'].replace('&', '\\&').replace('%', '\\%')
        value = str(row['Value']).replace('&', '\\&').replace('%', '\\%')
        interpretation = row['Interpretation'].replace('&', '\\&').replace('%', '\\%').replace('>', '$>$').replace('<', '$<$')
        
        latex_table += f"{category} & {metric} & {value} & {interpretation} \\\\\n"
    
    latex_table += """\\bottomrule
\\end{tabular}
\\end{table}
"""
    
    return latex_table

# Generate the LaTeX table
latex_output = generate_latex_table(metrics_summary)
print("=== LaTeX TABLE FORMAT ===")
print(latex_output)

# Also create a simplified summary for quick reference
print("\n=== QUICK SUMMARY ===")
quick_summary = pd.DataFrame({
    'Aspect': ['Representativeness', 'High-Weight Discrimination', 'Low-Weight Appropriateness', 'Group Separation'],
    'Key Metric': [
        f"Ratio: {representative_results['representativeness']:.3f}",
        f"Significant: {ratio_wd_hw_var_t_test_qualified:.1%}",
        f"Non-significant: {ratio_wd_lw_var_t_test_qualified:.1%}",
        f"PERMANOVA p: {wd_permanova_p_value:.4f}"
    ],
    'Status': [
        'Good' if representative_results['representativeness'] > 0.8 else 'Needs Improvement',
        'Good' if ratio_wd_hw_var_t_test_qualified > 0.8 else 'Needs Improvement',
        'Good' if ratio_wd_lw_var_t_test_qualified > 0.7 else 'Needs Improvement',
        'Significant' if wd_permanova_p_value < 0.05 else 'Not Significant'
    ]
})

print(quick_summary.to_string(index=False))

=== LaTeX TABLE FORMAT ===

\begin{table}[htbp]
\centering
\caption{Weighted PCA Evaluation Metrics}
\label{tab:weighted_pca_metrics}
\begin{tabular}{p{3cm} p{4cm} p{2cm} p{6cm}}
\toprule
\textbf{Category} & \textbf{Metric} & \textbf{Value} & \textbf{Interpretation} \\
\midrule
Repre & PC Similarity (mean) & 1.0000 & Higher = better similarity to baseline PCA \\
Repre & PC Similarity (std) & 0.0000 & Lower = more consistent similarity \\
Repre & Repre Ratio & 1.0000 & Higher = weighted PCA captures more variance \\
Discr - HW Variables & HW Vars Significant (< 0.1) & 0/30 & Higher = better discrimination of high-weight vars \\
Discr - HW Variables & HW Vars Significant (\%) & 0.00\% & Target: $>$ 80\% for good discrimination \\
Discr - LW Variables & LW Vars Non-Significant (≥ 0.1) & 0/0 & Higher = low-weight vars appropriately non-significant \\
Discr - LW Variables & LW Vars Non-Significant (\%) & 51.72\% & Target: $>$ 70\% for appropriate weighting \\
PERMANOVA Test & Pseudo-F Stati

In [2101]:
# Detailed breakdown of individual variable test results
print("=== HIGH-WEIGHT VARIABLES T-TEST RESULTS ===")
hw_vars_df = pd.DataFrame(wd_hw_var_t_test)
hw_vars_df['significant'] = hw_vars_df['p_value'] < 0.01
print(hw_vars_df.to_string(index=False))

print("\n=== LOW-WEIGHT VARIABLES T-TEST RESULTS ===")  
lw_vars_df = pd.DataFrame(wd_lw_var_t_test)
lw_vars_df['non_significant'] = lw_vars_df['p_value'] >= 0.05
print(lw_vars_df.to_string(index=False))

print(f"\n=== PC SIMILARITY VALUES ===")
similarity_df = pd.DataFrame({
    'PC_Pair': [f'PC{i+1}' for i in range(len(wd_pcs_similarity))],
    'Similarity': wd_pcs_similarity
})
print(similarity_df.to_string(index=False))
print(f"Mean Similarity: {np.mean(wd_pcs_similarity):.4f}")
print(f"Std Similarity: {np.std(wd_pcs_similarity):.4f}")

=== HIGH-WEIGHT VARIABLES T-TEST RESULTS ===
          variable  D_mean  t_statistic  p_value  significant_at_0.01  significant
               %OC     0.0          0.0      0.5                False        False
          1234-TCB     0.0          0.0      0.5                False        False
          1245-TCB     0.0          0.0      0.5                False        False
                Al     0.0          0.0      0.5                False        False
                As     0.0          0.0      0.5                False        False
                Bi     0.0          0.0      0.5                False        False
                Ca     0.0          0.0      0.5                False        False
                Cd     0.0          0.0      0.5                False        False
                Co     0.0          0.0      0.5                False        False
                Cr     0.0          0.0      0.5                False        False
                Cu     0.0          0.0   

KeyError: 'p_value'