In [10]:
import pandas as pd
from tabulate import tabulate
from scipy.stats import fisher_exact
from scipy.stats import chi2_contingency

In [11]:
import os

# Location of the cohort workbook. The default is the original author's local
# path; set the ASIA_VS_US_XLSX environment variable to point at the file on
# another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'ASIA_VS_US_XLSX',
    '/Users/huyanshen/Desktop/4-8-table/AsiaVsUs-shh.xlsx',
)

# header=0: the first row of the sheet supplies the column names.
data = pd.read_excel(DATA_PATH, header=0)

In [3]:
# Show every column label of the loaded sheet as a plain Python list.
data.columns.to_list()

['ID',
 'MRI-complete label (T1E&T2)',
 'Exclusion-list',
 'CohortID',
 'Geographical Location',
 'Race',
 'Date of Surgery',
 'Age',
 'Sex',
 'Hospital',
 'Molecular Subtypes',
 'Genetic Mutations',
 'Histological Subtypes',
 'Hydrocephalus before Surgery',
 'Tumor Texture',
 'Fourth-ventricle Infiltration',
 'Survival Status',
 'OS (months)',
 'OS-impute (months)',
 'Nanostring',
 'Recurrence-label_raw',
 'DFS (month)-raw',
 'DFS-impute (months)',
 'Radiotherapy',
 'Chemotherapy',
 'TumorResection',
 'ID.1',
 '转移情况/脊髓',
 '后遗症',
 '其他备注',
 'Nanostring.1']

In [4]:
# Categorical columns of interest; each one is cross-tabulated against
# 'Geographical Location' further down in the notebook.
intrst_list = [
    'Race',
    'Sex',
    'Histological Subtypes',
    'Hydrocephalus before Surgery',
    'Tumor Texture',
    'TumorResection',
    'Radiotherapy',
    'Chemotherapy',
    'Survival Status',
]

### Intracranial solid metastases

In [5]:
import numpy as np

# Hand-entered 2x2 table of counts.
# NOTE(review): the notebook does not say what the rows/columns stand for —
# label them before reusing these numbers.
observed_rows = [
    [114, 20],
    [542, 64],
]
c = np.asarray(observed_rows)

# Chi-square test of independence; correction=True requests Yates' continuity
# correction, which SciPy applies when the table has one degree of freedom.
test_output = chi2_contingency(c, correction=True)
chi2, p_value, dof, expected = test_output
print("P-value:", p_value)

P-value: 0.19678999452208812


In [6]:
# Hand-entered 3x2 table of counts.
# NOTE(review): row/column meaning is not stated in the notebook — confirm.
observed_rows = [
    [124, 9],
    [218, 35],
    [461, 87],
]
c = np.asarray(observed_rows)

# Chi-square test of independence. correction=True is kept as in the other
# cells, but per the SciPy docs it only takes effect at one degree of freedom,
# and a 3x2 table has two.
test_output = chi2_contingency(c, correction=True)
chi2, p_value, dof, expected = test_output
print("P-value:", p_value)

P-value: 0.02503576123535973


In [7]:

# Hand-entered 2x2 table of counts; one cell is as small as 2, and Fisher's
# exact test is used here instead of chi-square.
# NOTE(review): row/column meaning is not stated in the notebook — confirm.
observed_rows = [
    [139, 11],
    [58, 2],
]
c = np.asarray(observed_rows)

# fisher_exact yields (statistic, p-value); only the p-value is reported.
fisher_output = fisher_exact(c)
_, p_value = fisher_output
print("P-value:", p_value)

P-value: 0.3570300685040749


In [8]:
# Hand-entered 2x2 table of counts, tested with Fisher's exact test.
# NOTE(review): row/column meaning is not stated in the notebook — confirm.
observed_rows = [
    [93, 21],
    [335, 52],
]
c = np.asarray(observed_rows)

# fisher_exact yields (statistic, p-value); only the p-value is reported.
fisher_output = fisher_exact(c)
_, p_value = fisher_output
print("P-value:", p_value)

P-value: 0.2260977778358269


In [12]:
# Column that assigns each row of the sheet to a geographic cohort.
region = data['Geographical Location']

for variable in intrst_list:
    # Counts of each level of the variable, split by region.
    contingency = pd.crosstab(data[variable], region)
    print(tabulate(contingency, headers='keys', tablefmt='fancy_grid'))

    # Chi-square test of independence between the variable and the region.
    chi2, p_value, dof, expected = chi2_contingency(contingency, correction=True)
    print("P-value:", p_value)

╒═══════════════════════════╤═════════════╤═════════════════╕
│ Race                      │   East Asia │   North America │
╞═══════════════════════════╪═════════════╪═════════════════╡
│ Asian                     │         756 │               5 │
├───────────────────────────┼─────────────┼─────────────────┤
│ Black or African American │           0 │               5 │
├───────────────────────────┼─────────────┼─────────────────┤
│ Other or not reported     │          47 │              39 │
├───────────────────────────┼─────────────┼─────────────────┤
│ White                     │           0 │              82 │
╘═══════════════════════════╧═════════════╧═════════════════╛
P-value: 6.934891660640356e-155
╒═══════╤═════════════╤═════════════════╕
│ Sex   │   East Asia │   North America │
╞═══════╪═════════════╪═════════════════╡
│ F     │         280 │              45 │
├───────┼─────────────┼─────────────────┤
│ M     │         523 │              86 │
╘═══════╧═════════════╧═══════════

In [4]:
from lifelines import KaplanMeierFitter


def _report_km_survival(header, durations, events, time_points):
    """Fit a Kaplan-Meier curve and print survival probabilities.

    Parameters
    ----------
    header : str
        Line printed before the probabilities to identify the cohort.
    durations : pandas.Series
        Follow-up time per subject (the 'OS-impute (months)' column here).
    events : pandas.Series
        Event indicator per subject (the 'Survival Status' column here).
        NOTE(review): presumably truthy means the death was observed —
        confirm the coding of this column.
    time_points : iterable
        Times, in the same unit as ``durations``, at which to report.

    Returns
    -------
    KaplanMeierFitter
        The fitted estimator, so the caller can read further attributes.
    """
    print(header)
    fitter = KaplanMeierFitter()
    fitter.fit(durations=durations, event_observed=events)
    for time_point in time_points:
        # survival_function_at_times returns a one-element Series; unwrap it so
        # only the number is printed, without the Series name/dtype footer.
        survival_prob = float(fitter.survival_function_at_times([time_point]).iloc[0])
        print(f"Survival probability at {time_point} months:", survival_prob)
    return fitter


geos = ['East Asia', 'North America']
time_points = [12, 24, 36, 48, 60]  # months

# Per-region estimates: filter once per region and reuse the subset.
for geo in geos:
    cohort = data[data['Geographical Location'] == geo]
    kmf = _report_km_survival(
        f'For {geo}',
        cohort['OS-impute (months)'],
        cohort['Survival Status'],
        time_points,
    )

# Whole-cohort estimate; `kmf` ends up bound to this fit.
kmf = _report_km_survival(
    "For all:",
    data['OS-impute (months)'],
    data['Survival Status'],
    time_points,
)

# Export the confidence interval table of the whole-cohort fit.
ci = kmf.confidence_interval_
ci.to_excel("survival.xlsx")



For East Asia
Survival probability at 12 months: 12    0.899411
Name: KM_estimate, dtype: float64
Survival probability at 24 months: 24    0.826894
Name: KM_estimate, dtype: float64
Survival probability at 36 months: 36    0.739153
Name: KM_estimate, dtype: float64
Survival probability at 48 months: 48    0.689307
Name: KM_estimate, dtype: float64
Survival probability at 60 months: 60    0.662553
Name: KM_estimate, dtype: float64
For North America
Survival probability at 12 months: 12    0.96017
Name: KM_estimate, dtype: float64
Survival probability at 24 months: 24    0.889894
Name: KM_estimate, dtype: float64
Survival probability at 36 months: 36    0.820185
Name: KM_estimate, dtype: float64
Survival probability at 48 months: 48    0.786453
Name: KM_estimate, dtype: float64
Survival probability at 60 months: 60    0.756449
Name: KM_estimate, dtype: float64
For all:
Survival probability at 12 months: 12    0.90895
Name: KM_estimate, dtype: float64
Survival probability at 24 months: 24

In [9]:
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.statistics import multivariate_logrank_test

# Log-rank comparison of overall survival between the geographic cohorts.
# No KaplanMeierFitter is instantiated here: the test works directly on the
# columns, and binding a fresh, unfitted estimator to `kmf` would overwrite
# the fitted one left by the earlier Kaplan-Meier cell.
groups = data['Geographical Location']
time = data['OS-impute (months)']
# NOTE(review): presumably truthy means the death was observed — confirm coding.
event = data['Survival Status']

# Arguments are (durations, groups, events); events are passed by keyword so
# the role of each column is explicit.
results = multivariate_logrank_test(time, groups, event_observed=event)

# Report the p-value of the test.
p_value = results.p_value
print("Overall comparison (Multivariate Log-rank test) p-value:", p_value)



Overall comparison (Multivariate Log-rank test) p-value: 0.06914539877521987
