In [None]:
import pandas as pd

In [1]:
import os
import glob
import pandas as pd

# 1) Adjust this path to where your .fxout files live
fxout_path = r'protein_structures/virus/foldxsummary/*.fxout'

# 2) Gather all .fxout files.
# glob.glob() returns matches in arbitrary order, so sort them to keep the
# row order of the resulting DataFrame / CSV reproducible between runs.
fxout_files = sorted(glob.glob(fxout_path))
if not fxout_files:
    # Not fatal, but make it obvious why the summary below is empty.
    print(f"Warning: no files matched {fxout_path!r}; the summary will be empty.")

# Pre-define columns to match your FoldX output
column_names = [
    "pdb_file",            # e.g. ./sbxp_066123222_Repair.pdb
    "Total",               # column 2
    "BackHbond",           # column 3
    "SideHbond",           # column 4
    "Energy_VdW",          # column 5
    "Electro",             # column 6
    "Energy_SolvP",        # column 7
    "Energy_SolvH",        # column 8
    "Energy_vdwclash",     # column 9
    "Entropy_sidec",       # column 10
    "Entropy_mainc",       # column 11
    "water_bonds",         # column 12
    "helix_dipole",        # column 13
    "loop_entropy",        # column 14
    "energy_torsion",      # column 15
    "backbone_vdwclash",   # column 16
    "cis_bond",            # column 17
    "disulfide",           # column 18
    "kn_electrostatic",    # column 19
    "partial_covalent",    # column 20
    "Energy_Ionisation",   # column 21
    "Entropy_complex",     # column 22
    "extra_col1",          # column 23 - rename if you know what it is
    "extra_col2"           # column 24 - often #residues
]


def parse_fxout_line(line, n_columns=len(column_names)):
    """Parse one line of a .fxout summary file into a DataFrame row.

    Parameters
    ----------
    line : str
        A raw line read from a .fxout file.
    n_columns : int, optional
        Number of whitespace-separated fields a data line must contain
        (defaults to the length of ``column_names``).

    Returns
    -------
    list or None
        ``[pdb_path, float, float, ...]`` for a data line, or ``None`` when
        the line is a comment (starts with ``#``) or is blank.

    Raises
    ------
    ValueError
        If the line does not have exactly ``n_columns`` fields, or if any
        field after the first cannot be converted to ``float``.
    """
    # Skip comments (#) or empty lines
    if line.startswith('#') or not line.strip():
        return None

    # Split columns on whitespace
    fields = line.split()

    # Reject ragged rows here, where the offending line is known, instead of
    # letting them surface (or be silently padded) at DataFrame construction.
    if len(fields) != n_columns:
        raise ValueError(
            f"expected {n_columns} columns but found {len(fields)}"
        )

    # The first item is the PDB file path; the rest should be numeric
    return [fields[0]] + [float(token) for token in fields[1:]]


all_data = []

# 3) Parse each fxout file
for fxfile in fxout_files:
    with open(fxfile, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            try:
                row = parse_fxout_line(line)
            except ValueError as exc:
                # Re-raise with file/line context so a bad record is easy to find.
                raise ValueError(f"{fxfile}:{line_no}: {exc}") from exc
            if row is not None:
                all_data.append(row)

# 4) Create a DataFrame
df = pd.DataFrame(all_data, columns=column_names)

# 5) Inspect or save the DataFrame
print(df)

# Optionally, save to CSV
df.to_csv("foldx_summaries.csv", index=False)


                      pdb_file    Total  BackHbond  SideHbond  Energy_VdW  \
0  ./drxp_053766978_Repair.pdb  141.940   -96.8219   -27.2589    -90.2648   
1  ./mbxp_014389106_Repair.pdb  245.430   -91.5259   -36.7521   -104.7190   
2  ./mmxp_036168168_Repair.pdb  204.543  -106.1850   -38.4350   -113.3330   
3  ./moxp_036137889_Repair.pdb  152.022   -92.3881   -25.5217    -84.2316   
4  ./raxp_016009779_Repair.pdb  163.011   -89.3175   -27.4682    -85.3765   
5  ./rfxp_032990674_Repair.pdb  183.648  -142.7340   -43.5087   -124.0790   
6  ./sbxp_066123222_Repair.pdb  157.546   -91.4634   -31.9292    -83.0605   

    Electro  Energy_SolvP  Energy_SolvH  Energy_vdwclash  Entropy_sidec  ...  \
0 -1.174160       144.921     -102.4350          1.94544        53.9740  ...   
1 -0.471549       169.719     -121.9760         10.83560        66.7245  ...   
2 -1.100590       188.189     -127.2330          1.66980        70.0756  ...   
3 -1.383220       134.699      -96.6730          4.62083       

In [2]:
import numpy as np
from scipy import stats


# FoldX "Total" values copied by hand from the summary table printed above:
# mbxp and mmxp are the two structures under test; the remaining five
# (drxp, moxp, raxp, rfxp, sbxp) form the comparison group.
mb = 245.430
mm = 204.543
others = np.array([141.940, 152.022, 163.011, 183.648, 157.546])

# Summary statistics of the comparison group.
n_others = len(others)
mean_others = np.mean(others)
std_others = np.std(others, ddof=1)  # ddof=1 -> sample standard deviation

def one_sample_ttest(value, mean, sd, n):
    """
    One-sample, two-tailed t-test computed from summary statistics.

    Computes t = (value - mean) / (sd / sqrt(n)) and looks it up in a
    Student's t distribution with n - 1 degrees of freedom.

    Parameters
    ----------
    value : float
        The single value to compare against the sample mean.
    mean : float
        Mean of the n samples.
    sd : float
        Standard deviation of the n samples; must be > 0.
    n : int
        Number of samples; must be >= 2 so that n - 1 >= 1.

    Returns
    -------
    (t_stat, p_val) : tuple of float
        The t-statistic and its two-tailed p-value.

    Raises
    ------
    ValueError
        If n < 2 or sd is not a positive number.

    Notes
    -----
    The denominator is the standard error of the mean, i.e. 'value' is
    treated as a fixed reference with no sampling variability of its own.
    NOTE(review): if 'value' is itself a single observation being tested for
    membership in the same population, a prediction-style denominator
    (sd * sqrt(1 + 1/n)) may be more appropriate -- confirm the intent.
    """
    if n < 2:
        raise ValueError("n must be at least 2 (degrees of freedom = n - 1)")
    if not sd > 0:  # also rejects NaN
        raise ValueError("sd must be a positive number")

    t_stat = (value - mean) / (sd / np.sqrt(n))
    dof = n - 1
    # Use the survival function rather than 1 - cdf: for large |t| the cdf
    # rounds to exactly 1.0 and the subtraction would return p = 0.
    p_val = 2 * stats.t.sf(abs(t_stat), df=dof)
    return t_stat, p_val

# Test each of the two structures of interest against the comparison group.
t_mb, p_mb = one_sample_ttest(value=mb, mean=mean_others, sd=std_others, n=n_others)
t_mm, p_mm = one_sample_ttest(value=mm, mean=mean_others, sd=std_others, n=n_others)

# Report t to 3 decimals and p to 6 decimals, one line per structure.
print(f"mbxp vs. others: t = {t_mb:.3f}, p = {p_mb:.6f}")
print(f"mmxp vs. others: t = {t_mm:.3f}, p = {p_mm:.6f}")


mbxp vs. others: t = 12.361, p = 0.000246
mmxp vs. others: t = 6.471, p = 0.002939
