In [3]:
import pandas as pd
import scipy.stats as stats


def format_p(p):
    """Format a p-value for display.

    Returns "<0.0001" when ``p`` is below 0.0001 (a plain ``:.4f`` would
    print such values as "0.0000"), otherwise ``p`` rounded to four decimals.
    """
    return "<0.0001" if p < 0.0001 else f"{p:.4f}"


# Load the dataset; the columns read below are PH, Age, Sex and Ethnicity.
df = pd.read_csv('Cleaned Dataset.csv')

# Split into ph and nonph
ph = df[df['PH'] == 1]
nonph = df[df['PH'] == 0]

# Get n for each group
n_ph = len(ph)
n_nonph = len(nonph)

# Age
age_ph_mean = ph['Age'].mean()
age_ph_std = ph['Age'].std()

age_nonph_mean = nonph['Age'].mean()
age_nonph_std = nonph['Age'].std()

# Independent t-test for age (equal_var=False selects Welch's t-test).
# nan_policy='omit' keeps the test consistent with .mean()/.std() above, which
# skip missing values; without it a single NaN age would make p_age NaN.
t_stat, p_age = stats.ttest_ind(
    ph['Age'], nonph['Age'], equal_var=False, nan_policy='omit'
)

# Sex
sex_counts_ph = ph['Sex'].value_counts()
sex_counts_nonph = nonph['Sex'].value_counts()

# Chi-squared test for sex
sex_table = pd.crosstab(df['PH'], df['Sex'])
chi2_sex, p_sex, _, _ = stats.chi2_contingency(sex_table)

# Ethnicity
eth_counts_ph = ph['Ethnicity'].value_counts()
eth_counts_nonph = nonph['Ethnicity'].value_counts()

# Chi-squared test for ethnicity
# NOTE(review): this test spans every Ethnicity category in the crosstab, while
# the table row below reports only the 'British' count — confirm that the
# overall test is the comparison intended for that row.
eth_table = pd.crosstab(df['PH'], df['Ethnicity'])
chi2_eth, p_eth, _, _ = stats.chi2_contingency(eth_table)

# Table: one row per characteristic, one column per group, plus the p-value.
baseline_table = pd.DataFrame({
    'Characteristic': [
        'Age (years), mean (SD)',
        'Female, n (%)',
        'British, n (%)'
    ],
    f'PH (n = {n_ph})': [
        f"{age_ph_mean:.1f} ({age_ph_std:.1f})",
        f"{sex_counts_ph.get('Female', 0)} ({(sex_counts_ph.get('Female', 0)/n_ph*100):.1f}%)",
        f"{eth_counts_ph.get('British', 0)} ({(eth_counts_ph.get('British', 0)/n_ph*100):.1f}%)"
    ],
    f'Non-PH (n = {n_nonph})': [
        f"{age_nonph_mean:.1f} ({age_nonph_std:.1f})",
        f"{sex_counts_nonph.get('Female', 0)} ({(sex_counts_nonph.get('Female', 0)/n_nonph*100):.1f}%)",
        f"{eth_counts_nonph.get('British', 0)} ({(eth_counts_nonph.get('British', 0)/n_nonph*100):.1f}%)"
    ],
    # format_p avoids reporting a very small p-value as "0.0000".
    'p-value': [
        format_p(p_age),
        format_p(p_sex),
        format_p(p_eth)
    ]
})

# Display
baseline_table

Unnamed: 0,Characteristic,PH (n = 2571),Non-PH (n = 150283),p-value
0,"Age (years), mean (SD)",79.0 (6.3),77.0 (7.1),0.0
1,"Female, n (%)",1174 (45.7%),70934 (47.2%),0.1265
2,"British, n (%)",2251 (87.6%),132058 (87.9%),0.6445


In [4]:
# --- DISEASE COUNT (Categorical: 2, 3, 4, 5, ≥6) ---
def format_p(p):
    """Format a p-value for display.

    Returns "<0.0001" when ``p`` is below 0.0001 (a plain ``:.4f`` would
    print such values as "0.0000"), otherwise ``p`` rounded to four decimals.
    """
    return "<0.0001" if p < 0.0001 else f"{p:.4f}"


# pd.cut uses right-closed intervals by default, so counts of 2, 3, 4 and 5 each
# get their own bin and anything above 5.5 is labelled '≥6'. Values that are not
# above 1.5 (and missing values) receive no category.
bins = [1.5, 2.5, 3.5, 4.5, 5.5, float('inf')]
labels = ['2', '3', '4', '5', '≥6']
df['Disease Count Category'] = pd.cut(df['Disease Count'], bins=bins, labels=labels)

# Column headers shared by every row of the table.
ph_col = f'PH (n = {n_ph})'
nonph_col = f'Non-PH (n = {n_nonph})'

# Create crosstab (rows: disease-count category, columns: PH value)
disease_ct = pd.crosstab(df['Disease Count Category'], df['PH'])
# Percentages use each group's crosstab column total as the denominator.
# NOTE(review): that total equals the header n only if every row falls into one
# of the bins above — confirm no Disease Count is missing or below 2.
disease_percent = disease_ct.div(disease_ct.sum(axis=0), axis=1) * 100
disease_formatted = disease_ct.astype(str) + " (" + disease_percent.round(1).astype(str) + "%)"
# Relabel by PH value rather than by position so the headers cannot be swapped
# if the crosstab column order ever differs; then drop the 'PH' columns name.
disease_formatted = (
    disease_formatted
    .rename(columns={0: nonph_col, 1: ph_col})
    .rename_axis(None, axis=1)
)
disease_rows = disease_formatted.reset_index()

# Chi-squared test for disease count (one overall test across all categories)
chi2_dc, p_dc, _, _ = stats.chi2_contingency(disease_ct.T)

# --- BASELINE TABLE START ---
# Collect every row as a record and build the frame once, instead of growing it
# with pd.concat inside a loop.
summary_records = [
    {
        'Characteristic': 'Age (years), mean (SD)',
        ph_col: f"{age_ph_mean:.1f} ({age_ph_std:.1f})",
        nonph_col: f"{age_nonph_mean:.1f} ({age_nonph_std:.1f})",
        'p-value': format_p(p_age),
    },
    {
        'Characteristic': 'Female, n (%)',
        ph_col: f"{sex_counts_ph.get('Female', 0)} ({(sex_counts_ph.get('Female', 0)/n_ph*100):.1f}%)",
        nonph_col: f"{sex_counts_nonph.get('Female', 0)} ({(sex_counts_nonph.get('Female', 0)/n_nonph*100):.1f}%)",
        'p-value': format_p(p_sex),
    },
    {
        'Characteristic': 'British, n (%)',
        ph_col: f"{eth_counts_ph.get('British', 0)} ({(eth_counts_ph.get('British', 0)/n_ph*100):.1f}%)",
        nonph_col: f"{eth_counts_nonph.get('British', 0)} ({(eth_counts_nonph.get('British', 0)/n_nonph*100):.1f}%)",
        'p-value': format_p(p_eth),
    },
]

# Disease count rows: every category row carries the same overall p-value.
p_dc_text = format_p(p_dc)
disease_records = [
    {
        'Characteristic': f"Disease Count = {rec['Disease Count Category']}",
        ph_col: rec[ph_col],
        nonph_col: rec[nonph_col],
        'p-value': p_dc_text,
    }
    for rec in disease_rows.to_dict('records')
]

baseline_table = pd.DataFrame(
    summary_records + disease_records,
    columns=['Characteristic', ph_col, nonph_col, 'p-value'],
)

# Display
baseline_table

Unnamed: 0,Characteristic,PH (n = 2571),Non-PH (n = 150283),p-value
0,"Age (years), mean (SD)",79.0 (6.3),77.0 (7.1),0.0
1,"Female, n (%)",1174 (45.7%),70934 (47.2%),0.1265
2,"British, n (%)",2251 (87.6%),132058 (87.9%),0.6445
3,Disease Count = 2,184 (7.2%),58759 (39.1%),0.0
4,Disease Count = 3,303 (11.8%),36853 (24.5%),0.0
5,Disease Count = 4,344 (13.4%),23131 (15.4%),0.0
6,Disease Count = 5,383 (14.9%),13776 (9.2%),0.0
7,Disease Count = ≥6,1357 (52.8%),17764 (11.8%),0.0


In [None]:
import pandas as pd
import scipy.stats as stats

# display small p-values correctly
def format_p(p):
    """Return ``p`` as a four-decimal string, or "<0.0001" when it is below 0.0001."""
    if p < 0.0001:
        return "<0.0001"
    return f"{p:.4f}"

# Load the dataset; the columns read below are PH, Age, Sex, Ethnicity and
# Disease Count.
df = pd.read_csv('Cleaned Dataset.csv')

# Split by PH status
ph = df[df['PH'] == 1]
nonph = df[df['PH'] == 0]
n_ph = len(ph)
n_nonph = len(nonph)

# Column headers shared by every row of the table.
ph_col = f'PH (n = {n_ph})'
nonph_col = f'Non-PH (n = {n_nonph})'

# age
age_ph_mean = ph['Age'].mean()
age_ph_std = ph['Age'].std()
age_nonph_mean = nonph['Age'].mean()
age_nonph_std = nonph['Age'].std()
# equal_var=False selects Welch's t-test. nan_policy='omit' keeps the test
# consistent with .mean()/.std() above, which skip missing values; without it a
# single NaN age would make p_age NaN.
t_stat, p_age = stats.ttest_ind(
    ph['Age'], nonph['Age'], equal_var=False, nan_policy='omit'
)

# sex
sex_counts_ph = ph['Sex'].value_counts()
sex_counts_nonph = nonph['Sex'].value_counts()
sex_table = pd.crosstab(df['PH'], df['Sex'])
chi2_sex, p_sex, _, _ = stats.chi2_contingency(sex_table)

# ethnicity
# NOTE(review): this test spans every Ethnicity category in the crosstab, while
# the table row below reports only the 'British' count — confirm that the
# overall test is the comparison intended for that row.
eth_counts_ph = ph['Ethnicity'].value_counts()
eth_counts_nonph = nonph['Ethnicity'].value_counts()
eth_table = pd.crosstab(df['PH'], df['Ethnicity'])
chi2_eth, p_eth, _, _ = stats.chi2_contingency(eth_table)

# --- disease count - categorical
# pd.cut uses right-closed intervals by default, so counts of 2, 3, 4 and 5 each
# get their own bin and anything above 5.5 is labelled '≥6'. Values that are not
# above 1.5 (and missing values) receive no category.
bins = [1.5, 2.5, 3.5, 4.5, 5.5, float('inf')]
labels = ['2', '3', '4', '5', '≥6']
df['Disease Count Category'] = pd.cut(df['Disease Count'], bins=bins, labels=labels)

# Crosstab for PH vs Disease Count Category
disease_ct = pd.crosstab(df['Disease Count Category'], df['PH'])
# Percentages use each group's crosstab column total as the denominator.
# NOTE(review): that total equals the header n only if every row falls into one
# of the bins above — confirm no Disease Count is missing or below 2.
disease_percent = disease_ct.div(disease_ct.sum(axis=0), axis=1) * 100
disease_formatted = disease_ct.astype(str) + " (" + disease_percent.round(1).astype(str) + "%)"
# Relabel by PH value rather than by position so the headers cannot be swapped
# if the crosstab column order ever differs; then drop the 'PH' columns name.
disease_formatted = (
    disease_formatted
    .rename(columns={0: nonph_col, 1: ph_col})
    .rename_axis(None, axis=1)
)
disease_rows = disease_formatted.reset_index()

# Chi-squared test for disease count (one overall test across all categories)
chi2_dc, p_dc, _, _ = stats.chi2_contingency(disease_ct.T)

# Table: collect every row as a record and build the frame once, instead of
# growing it with pd.concat inside a loop.
summary_records = [
    {
        'Characteristic': 'Age (years), mean (SD)',
        ph_col: f"{age_ph_mean:.1f} ({age_ph_std:.1f})",
        nonph_col: f"{age_nonph_mean:.1f} ({age_nonph_std:.1f})",
        'p-value': format_p(p_age),
    },
    {
        'Characteristic': 'Female, n (%)',
        ph_col: f"{sex_counts_ph.get('Female', 0)} ({(sex_counts_ph.get('Female', 0)/n_ph*100):.1f}%)",
        nonph_col: f"{sex_counts_nonph.get('Female', 0)} ({(sex_counts_nonph.get('Female', 0)/n_nonph*100):.1f}%)",
        'p-value': format_p(p_sex),
    },
    {
        'Characteristic': 'British, n (%)',
        ph_col: f"{eth_counts_ph.get('British', 0)} ({(eth_counts_ph.get('British', 0)/n_ph*100):.1f}%)",
        nonph_col: f"{eth_counts_nonph.get('British', 0)} ({(eth_counts_nonph.get('British', 0)/n_nonph*100):.1f}%)",
        'p-value': format_p(p_eth),
    },
]

# condition count rows: every category row carries the same overall p-value.
p_dc_text = format_p(p_dc)
disease_records = [
    {
        'Characteristic': f"Disease Count = {rec['Disease Count Category']}",
        ph_col: rec[ph_col],
        nonph_col: rec[nonph_col],
        'p-value': p_dc_text,
    }
    for rec in disease_rows.to_dict('records')
]

baseline_table = pd.DataFrame(
    summary_records + disease_records,
    columns=['Characteristic', ph_col, nonph_col, 'p-value'],
)

# Display
baseline_table


Unnamed: 0,Characteristic,PH (n = 2571),Non-PH (n = 150283),p-value
0,"Age (years), mean (SD)",79.0 (6.3),77.0 (7.1),<0.0001
1,"Female, n (%)",1174 (45.7%),70934 (47.2%),0.1265
2,"British, n (%)",2251 (87.6%),132058 (87.9%),0.6445
3,Disease Count = 2,184 (7.2%),58759 (39.1%),<0.0001
4,Disease Count = 3,303 (11.8%),36853 (24.5%),<0.0001
5,Disease Count = 4,344 (13.4%),23131 (15.4%),<0.0001
6,Disease Count = 5,383 (14.9%),13776 (9.2%),<0.0001
7,Disease Count = ≥6,1357 (52.8%),17764 (11.8%),<0.0001


In [6]:
import pandas as pd
import scipy.stats as stats

# display small p-values correctly
def format_p(p):
    """Render a p-value as text: "<0.0001" below that cutoff, else four decimal places."""
    below_cutoff = p < 0.0001
    if below_cutoff:
        return "<0.0001"
    return f"{p:.4f}"

# Load the dataset; the columns read below are PH, Age, Sex, Ethnicity and
# Disease Count.
# NOTE(review): the file name suggests a matched cohort. The tests below treat
# the two groups as independent samples and ignore any pairing — confirm that
# this is intended.
df = pd.read_csv('Matched_Cohort.csv')

# Split by PH status
ph = df[df['PH'] == 1]
nonph = df[df['PH'] == 0]
n_ph = len(ph)
n_nonph = len(nonph)

# Column headers shared by every row of the table.
ph_col = f'PH (n = {n_ph})'
nonph_col = f'Non-PH (n = {n_nonph})'

# age
age_ph_mean = ph['Age'].mean()
age_ph_std = ph['Age'].std()
age_nonph_mean = nonph['Age'].mean()
age_nonph_std = nonph['Age'].std()
# equal_var=False selects Welch's t-test. nan_policy='omit' keeps the test
# consistent with .mean()/.std() above, which skip missing values; without it a
# single NaN age would make p_age NaN.
t_stat, p_age = stats.ttest_ind(
    ph['Age'], nonph['Age'], equal_var=False, nan_policy='omit'
)

# sex
sex_counts_ph = ph['Sex'].value_counts()
sex_counts_nonph = nonph['Sex'].value_counts()
sex_table = pd.crosstab(df['PH'], df['Sex'])
chi2_sex, p_sex, _, _ = stats.chi2_contingency(sex_table)

# ethnicity
# NOTE(review): this test spans every Ethnicity category in the crosstab, while
# the table row below reports only the 'British' count — confirm that the
# overall test is the comparison intended for that row.
eth_counts_ph = ph['Ethnicity'].value_counts()
eth_counts_nonph = nonph['Ethnicity'].value_counts()
eth_table = pd.crosstab(df['PH'], df['Ethnicity'])
chi2_eth, p_eth, _, _ = stats.chi2_contingency(eth_table)

# --- disease count - categorical
# pd.cut uses right-closed intervals by default, so counts of 2, 3, 4 and 5 each
# get their own bin and anything above 5.5 is labelled '≥6'. Values that are not
# above 1.5 (and missing values) receive no category.
bins = [1.5, 2.5, 3.5, 4.5, 5.5, float('inf')]
labels = ['2', '3', '4', '5', '≥6']
df['Disease Count Category'] = pd.cut(df['Disease Count'], bins=bins, labels=labels)

# Crosstab for PH vs Disease Count Category
disease_ct = pd.crosstab(df['Disease Count Category'], df['PH'])
# Percentages use each group's crosstab column total as the denominator.
# NOTE(review): that total equals the header n only if every row falls into one
# of the bins above — confirm no Disease Count is missing or below 2.
disease_percent = disease_ct.div(disease_ct.sum(axis=0), axis=1) * 100
disease_formatted = disease_ct.astype(str) + " (" + disease_percent.round(1).astype(str) + "%)"
# Relabel by PH value rather than by position so the headers cannot be swapped
# if the crosstab column order ever differs; then drop the 'PH' columns name.
disease_formatted = (
    disease_formatted
    .rename(columns={0: nonph_col, 1: ph_col})
    .rename_axis(None, axis=1)
)
disease_rows = disease_formatted.reset_index()

# Chi-squared test for disease count (one overall test across all categories)
chi2_dc, p_dc, _, _ = stats.chi2_contingency(disease_ct.T)

# Table: collect every row as a record and build the frame once, instead of
# growing it with pd.concat inside a loop.
summary_records = [
    {
        'Characteristic': 'Age (years), mean (SD)',
        ph_col: f"{age_ph_mean:.1f} ({age_ph_std:.1f})",
        nonph_col: f"{age_nonph_mean:.1f} ({age_nonph_std:.1f})",
        'p-value': format_p(p_age),
    },
    {
        'Characteristic': 'Female, n (%)',
        ph_col: f"{sex_counts_ph.get('Female', 0)} ({(sex_counts_ph.get('Female', 0)/n_ph*100):.1f}%)",
        nonph_col: f"{sex_counts_nonph.get('Female', 0)} ({(sex_counts_nonph.get('Female', 0)/n_nonph*100):.1f}%)",
        'p-value': format_p(p_sex),
    },
    {
        'Characteristic': 'British, n (%)',
        ph_col: f"{eth_counts_ph.get('British', 0)} ({(eth_counts_ph.get('British', 0)/n_ph*100):.1f}%)",
        nonph_col: f"{eth_counts_nonph.get('British', 0)} ({(eth_counts_nonph.get('British', 0)/n_nonph*100):.1f}%)",
        'p-value': format_p(p_eth),
    },
]

# condition count rows: every category row carries the same overall p-value.
p_dc_text = format_p(p_dc)
disease_records = [
    {
        'Characteristic': f"Disease Count = {rec['Disease Count Category']}",
        ph_col: rec[ph_col],
        nonph_col: rec[nonph_col],
        'p-value': p_dc_text,
    }
    for rec in disease_rows.to_dict('records')
]

baseline_table = pd.DataFrame(
    summary_records + disease_records,
    columns=['Characteristic', ph_col, nonph_col, 'p-value'],
)

# Display
baseline_table

Unnamed: 0,Characteristic,PH (n = 2571),Non-PH (n = 2571),p-value
0,"Age (years), mean (SD)",79.0 (6.3),77.0 (7.1),<0.0001
1,"Female, n (%)",1174 (45.7%),1238 (48.2%),0.0783
2,"British, n (%)",2251 (87.6%),2231 (86.8%),0.4283
3,Disease Count = 2,184 (7.2%),950 (37.0%),<0.0001
4,Disease Count = 3,303 (11.8%),671 (26.1%),<0.0001
5,Disease Count = 4,344 (13.4%),431 (16.8%),<0.0001
6,Disease Count = 5,383 (14.9%),220 (8.6%),<0.0001
7,Disease Count = ≥6,1357 (52.8%),299 (11.6%),<0.0001
