Modified Python version of Cara's 2024 R script attached as the .rmd file

# Ratios 1 & 2

## Import and Clean Data

In [1]:
# # I use the anaconda prompt with my Universal_EEG_Analyzer
# # conda activate Universal_EEG_Analyzer
# # pip install ...
# # Here is code to find location of kernel
# import sys
# print(sys.executable)

In [2]:
# Libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import pingouin as pg
import scikit_posthocs as sp
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
from cliffs_delta import cliffs_delta

In [3]:
# Load the dataset
file_path = "C:/Users/Jalynn/OneDrive/Documents/GitHub/Universal_EEG_Analyzer/StudyTwoGermaneLoadAnalysis/GammaResults2/All_Ratios_Summary.csv"
df = pd.read_csv(file_path)
gamma = "C:/Users/Jalynn/OneDrive/Documents/GitHub/Universal_EEG_Analyzer/StudyTwoGermaneLoadAnalysis/GammaResults2/Combined_LowHigh_Gamma.csv"

In [4]:
# Added for testing with ptps not completely finished
data_cols = [col for col in df.columns if col != 'PIDPhase']
df = df.dropna(subset=data_cols, how='all')

# Split PXX_phase into PXX and phase columns
df[['PID', 'Phase']] = df['PIDPhase'].astype(str).str.extract(r'P(\d+)_([A-Za-z]+)')
df = df.drop(columns=['PIDPhase'])

# Ensure condition is a factor in R/category in Python
df['Condition'] = df['Phase'].astype('category')

# Reset the index
df = df.reset_index(drop=True)
print(df)

    LowGermane_ratio1  LowGermane_ratio2  HighGermane_ratio1  \
0            0.261785           0.097010            0.287086   
1            0.283345           0.077160            0.250629   
2            0.257052           0.065430            0.266024   
3            0.287995           0.092861            0.530054   
4            0.145543           0.061338            0.333142   
..                ...                ...                 ...   
61           0.252608           0.085566            0.370564   
62           0.233573           0.085688            0.794715   
63           0.354569           0.395514            0.267122   
64           0.342092           0.106157            0.469203   
65           0.157268           0.070899            0.322174   

    HighGermane_ratio2 PID  Phase Condition  
0             0.098088  02  train     train  
1             0.083527  03  train     train  
2             0.080363  04  train     train  
3             0.095746  05  train     train  
4

## Is my data normally distributed?

In [5]:
# Shapiro-Wilk Text
# Basically a t-test
shapiro_stat, shapiro_p = stats.shapiro(df['HighGermane_ratio2'].dropna())
print(f"--- Shapiro-Wilk Test for LowGermane_ratio1 ---")
print(f"Statistic: {shapiro_stat:.4f}, p-value: {shapiro_p}\n")

--- Shapiro-Wilk Test for LowGermane_ratio1 ---
Statistic: 0.7405, p-value: 1.7917577719337372e-09



High Level Overview:  
* Used Shapiro-Wilk test to confirm my data is not normally distributed (low p-value)

## Ratios between conditions

In [6]:
print("\n--- Ratios Between Conditions (Paired Wilcoxon Test + Cliff's delta) ---")

for phase in df['Phase'].unique():
    print(f"\n--- Analyzing Phase: {phase.upper()} ---")
    df_phase = df[df['Phase'] == phase]

    # --- Test 1: ratio1 (Low vs High) ---
    data_paired_r1 = df_phase[['LowGermane_ratio1', 'HighGermane_ratio1']].dropna()
    x1 = data_paired_r1['LowGermane_ratio1']
    y1 = data_paired_r1['HighGermane_ratio1']

    paired_test_r1 = pg.wilcoxon(x1, y1, alternative='two-sided')

    delta_r1, res1 = cliffs_delta(x1.values, y1.values)
    cles_r1 = (delta_r1 + 1) / 2 if not np.isnan(delta_r1) else np.nan
    direction_r1 = ("LowGermane > HighGermane" if delta_r1 > 0
                    else "LowGermane < HighGermane" if delta_r1 < 0
                    else "No tendency")

    print("\nTest: LowGermane_ratio1 vs. HighGermane_ratio1")
    print(paired_test_r1)
    print(f"Cliff's delta (δ): {delta_r1:.4f}")
    print(f"Equivalent CLES: {cles_r1:.3f}  ({cles_r1*100:.1f}% chance a random LowGermane > random HighGermane)")
    print(f"Direction: {direction_r1}")

    # --- Test 2: ratio2 (Low vs High) ---
    data_paired_r2 = df_phase[['LowGermane_ratio2', 'HighGermane_ratio2']].dropna()
    x2 = data_paired_r2['LowGermane_ratio2']
    y2 = data_paired_r2['HighGermane_ratio2']

    paired_test_r2 = pg.wilcoxon(x2, y2, alternative='two-sided')

    delta_r2, res2 = cliffs_delta(x2.values, y2.values)
    cles_r2 = (delta_r2 + 1) / 2 if not np.isnan(delta_r2) else np.nan
    direction_r2 = ("LowGermane > HighGermane" if delta_r2 > 0
                    else "LowGermane < HighGermane" if delta_r2 < 0
                    else "No tendency")

    print("\nTest: LowGermane_ratio2 vs. HighGermane_ratio2")
    print(paired_test_r2)
    print(f"Cliff's delta (δ): {delta_r2:.4f}")
    print(f"Equivalent CLES: {cles_r2:.3f}  ({cles_r2*100:.1f}% chance a random LowGermane > random HighGermane)")
    print(f"Direction: {direction_r2}")

# Optional: thresholds for interpreting |δ|
print("\nInterpretation thresholds (absolute δ): small ~0.147, medium ~0.33, large ~0.474")



--- Ratios Between Conditions (Paired Wilcoxon Test + Cliff's delta) ---

--- Analyzing Phase: TRAIN ---

Test: LowGermane_ratio1 vs. HighGermane_ratio1
          W-val alternative     p-val       RBC      CLES
Wilcoxon  128.0   two-sided  0.005484 -0.543672  0.317723
Cliff's delta (δ): -0.3646
Equivalent CLES: 0.318  (31.8% chance a random LowGermane > random HighGermane)
Direction: LowGermane < HighGermane

Test: LowGermane_ratio2 vs. HighGermane_ratio2
          W-val alternative    p-val       RBC      CLES
Wilcoxon  144.0   two-sided  0.01369 -0.486631  0.408632
Cliff's delta (δ): -0.1827
Equivalent CLES: 0.409  (40.9% chance a random LowGermane > random HighGermane)
Direction: LowGermane < HighGermane

--- Analyzing Phase: TEST ---

Test: LowGermane_ratio1 vs. HighGermane_ratio1
          W-val alternative     p-val       RBC      CLES
Wilcoxon  248.0   two-sided  0.572177 -0.115865  0.438935
Cliff's delta (δ): -0.1221
Equivalent CLES: 0.439  (43.9% chance a random LowGermane > 

Between Conditions Key Findings: statistical significance between low and high loads during the training. This difference disappears in the test phase

High Level Overview:
* Used Wilcoxon, non-parametric tests, instead of t-test
* Used Mann-Whitney U to compare train vs. test (within conditions) and found no significant difference
* Used Paired Wilcoxon to compare low vs. high and found a significant difference in the train phase that disappears in the test phase

## Ratios within conditions

In [7]:
phases = df['Phase'].unique()
if len(phases) == 2:
    print(f"X is: {phases[0]}")
    print(f"Y is: {phases[1]}")
    group1_data = df[df['Phase'] == phases[0]]
    group2_data = df[df['Phase'] == phases[1]]
    
    ratios_to_test = [
        "LowGermane_ratio1", "LowGermane_ratio2", "HighGermane_ratio1", "HighGermane_ratio2"
    ]

    for ratio in ratios_to_test:
        x = group1_data[ratio].dropna()
        y = group2_data[ratio].dropna()
        
        w_stat, w_p = wilcoxon(x, y, alternative='two-sided')
        
        delta, res = cliffs_delta(x, y)
        
        print(f"\n{ratio} (by Phase: {phases[0]} vs {phases[1]}):")
        print(f"  Wilcoxon Statistic: {w_stat:.4f}, p-value: {w_p:.4f}")
        print(f"  Cliff's delta (δ): {delta:.4f}")
        
else:
    print(f"Warning: 'Phase' column has {len(phases)} groups. MWU test is for 2 groups.")

X is: train
Y is: test

LowGermane_ratio1 (by Phase: train vs test):
  Wilcoxon Statistic: 251.0000, p-value: 0.6088
  Cliff's delta (δ): -0.0909

LowGermane_ratio2 (by Phase: train vs test):
  Wilcoxon Statistic: 220.0000, p-value: 0.2878
  Cliff's delta (δ): -0.1680

HighGermane_ratio1 (by Phase: train vs test):
  Wilcoxon Statistic: 254.0000, p-value: 0.6464
  Cliff's delta (δ): -0.0193

HighGermane_ratio2 (by Phase: train vs test):
  Wilcoxon Statistic: 197.0000, p-value: 0.1396
  Cliff's delta (δ): 0.0523


Shapiro-Wilk Test Key finding: My p-value is always extremely close to zero. I cannot use a t-test since my data is not normally distributed.

Mann-Whitney U test Key finding: When comparing the train phase to the test phase, there is no significant differences found.

# Visualize Ratios  
82 to 132 Visualize Within Conditions
140 to 205 Visualize Between Conditions

# Gamma: Between Conditions
207 to 244 Shapiro Test on Gamma Freq
246 to 311 Wilcox Test on Gamma Freq
313 to 315 Output Gamma Long to CSV

In [8]:
try:
    df = pd.read_csv(gamma)
    df.columns = df.columns.str.strip()
    df = df.rename(columns={"Train/Test": "Phase"})
    df['Phase'] = df['Phase'].str.lower()
    print("--- Successfully loaded wide data from CSV ---")
    
except FileNotFoundError:
    print(f"Error: The file was not found at {gamma}")
    df = pd.DataFrame()

# --- Paired "Between-Conditions" (Cond1 vs Cond2) ---
if not df.empty:
    paired_results_list = []
    
    eeg_measures = ['Fz', 'Cz', 'Pz']
    phases = df['Phase'].unique() # Should be ['test', 'train']
    
    print(f"\n--- Running Paired Tests (Cond1 vs Cond2) for phases: {phases} ---")

    for phase in phases:
        df_phase = df[df['Phase'] == phase]
        
        for measure in eeg_measures:
            col_1 = f'Cond1_{measure}'
            col_2 = f'Cond2_{measure}'
            
            if col_1 not in df_phase.columns or col_2 not in df_phase.columns:
                print(f"Skipping {measure} in {phase}: Columns {col_1} or {col_2} not found.")
                continue
                
            data_paired = df_phase[['PID', col_1, col_2]].dropna()
            
            x = data_paired[col_1]
            y = data_paired[col_2]
            
            if len(x) > 1:
                try:
                    test_stat = wilcoxon(x, y) 
                    delta, res = cliffs_delta(x.values, y.values)
                    
                    paired_results_list.append({
                        'Measure': measure,
                        'Phase': phase,
                        'Comparison': 'Cond1 vs Cond2',
                        'W-val': test_stat.statistic,
                        'p-val': test_stat.pvalue,
                        'Cliff_delta': delta,
                        'Res': res
                    })
                except Exception as e:
                    print(f"Error running test for {measure} in {phase}: {e}")
            else:
                print(f"Not enough paired data for {measure} in {phase}")

    # Display Paired Results Table
    paired_results_df = pd.DataFrame(paired_results_list)
    
    print("\n" + "="*50)
    print("      Paired 'Between-Conditions' Results")
    print("="*50)
    print(paired_results_df)

else:
    print("\nCould not run analysis because data was not loaded.")

--- Successfully loaded wide data from CSV ---

--- Running Paired Tests (Cond1 vs Cond2) for phases: ['test' 'train'] ---

      Paired 'Between-Conditions' Results
  Measure  Phase      Comparison  W-val     p-val  Cliff_delta         Res
0      Fz   test  Cond1 vs Cond2  242.0  0.502123     0.067034  negligible
1      Cz   test  Cond1 vs Cond2  279.0  0.985937    -0.041322  negligible
2      Pz   test  Cond1 vs Cond2  207.0  0.194807     0.162534       small
3      Fz  train  Cond1 vs Cond2  149.0  0.017816     0.331497      medium
4      Cz  train  Cond1 vs Cond2  279.0  0.985937     0.030303  negligible
5      Pz  train  Cond1 vs Cond2  250.0  0.596490     0.050505  negligible


# Correlate with Behavior  
317 to 342 List all summaries so far
344 to 397 Corresponding EEG with Error / Performance
399 to 442 Etc.
444 to 598 Etc.
600 to 652 Etc.
657 to 725 Etc.