# Imports

In [1]:
import numpy as np
import secrets
import pandas as pd
import scipy.stats as spstats
from tqdm import tqdm
from IPython.display import display
from python_functions import check_dir, between_cancer_corr

# Directories

In [2]:
# Input/output locations, relative to the notebook's working directory.
rawdata = "data/raw/"
processeddata = "data/processed/"
# Folder that receives one result file per random shuffle.
shuffled_dir = processeddata+"shuffled_bct/"
# NOTE(review): presumably creates the folder when it is missing — confirm in python_functions.
check_dir(shuffled_dir)

# Tumour Mutational Burden

In [3]:
# Effect labels that are counted as silent; every other label is non-silent.
silent_mutation = ["Silent", "5'Flank", "3'Flank", "5'UTR", "3'UTR", "Intron", "RNA"]
mutationtab = (
    # Only three columns are needed, so let the parser skip the rest of the
    # (large) mutation file instead of loading and then discarding it.
    pd.read_table(
        rawdata+"mc3.v0.2.8.PUBLIC.xena",
        usecols=["sample", "gene", "effect"],
    )
    # fix the column order independently of the order in the file
    [["sample", "gene", "effect"]]
    .assign(
        # patient id = sample id without its last "-"-separated field
        patient=lambda x: x["sample"].str.split("-").str[:-1].str.join("-"),
        nonsilent=lambda x: ~x["effect"].isin(silent_mutation),
    )
    .drop(columns=["sample"])
    # keep a single row per (gene, effect, patient) combination
    .drop_duplicates()
)
print(mutationtab.shape)
mutationtab.head()

(2625385, 4)


Unnamed: 0,gene,effect,patient,nonsilent
0,TACC2,Missense_Mutation,TCGA-02-0003,True
1,JAKMIP3,Silent,TCGA-02-0003,False
2,PANX3,Missense_Mutation,TCGA-02-0003,True
3,SPI1,Missense_Mutation,TCGA-02-0003,True
4,NAALAD2,Missense_Mutation,TCGA-02-0003,True


In [4]:
# Patient-level expression table; the patient / cancer-type labels stored in
# its index are copied into a separate two-column table.
exptab = pd.read_feather(processeddata+"expression.feather")
clintab = exptab.index.to_frame(index=False)

n_patients = len(clintab)
n_cancer_types = clintab["cancer_type"].nunique()
print("# patients:", n_patients)
print("# cancer types:", n_cancer_types)

display(exptab.head())
clintab.head()

# patients: 7317
# cancer types: 32


Unnamed: 0_level_0,Unnamed: 1_level_0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAT,AAGAB,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
patient,cancer_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
TCGA-02-0047,GBM,6.98,0.0,15.05,5.4,5.22,1.16,8.87,8.92,7.87,10.01,...,8.03,8.66,6.05,8.48,10.12,0.69,10.24,11.92,10.45,9.24
TCGA-02-0055,GBM,8.62,0.0,15.39,1.42,8.93,0.64,9.22,8.31,6.66,10.41,...,8.87,7.95,5.45,8.14,9.25,2.6,9.85,13.49,9.25,9.49
TCGA-02-2483,GBM,8.09,0.0,14.36,1.82,6.46,0.0,10.11,8.95,8.02,9.92,...,9.42,9.39,4.35,8.67,9.76,5.5,10.24,12.31,9.7,9.46
TCGA-02-2485,GBM,6.41,0.0,12.93,7.73,7.29,0.56,9.99,8.25,7.58,10.36,...,8.79,8.79,5.78,8.1,10.4,0.0,10.06,12.31,10.16,9.45
TCGA-02-2486,GBM,6.77,0.0,15.32,6.71,5.49,0.0,9.46,8.62,7.77,10.54,...,7.39,6.24,5.03,7.64,9.35,0.0,9.43,12.93,9.3,9.05


Unnamed: 0,patient,cancer_type
0,TCGA-02-0047,GBM
1,TCGA-02-0055,GBM
2,TCGA-02-2483,GBM
3,TCGA-02-2485,GBM
4,TCGA-02-2486,GBM


In [5]:
# Processed mutation table: one row per patient, one column per driver gene
# (see the shape and preview printed below).
drivermutations = pd.read_feather(processeddata+"mutation.feather")
print(drivermutations.shape)
drivermutations.head()

(7317, 3081)


Unnamed: 0_level_0,Unnamed: 1_level_0,A1CF,A2ML1,ABCA10,ABCA13,ABCA7,ABCB1,ABCB5,ABCC3,ABCC5,ABCC9,...,ZRANB3,ZRSR2,ZSCAN31,ZSCAN4,ZSWIM3,ZSWIM6,ZSWIM7,ZWILCH,ZWINT,ZZEF1
patient,cancer_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
TCGA-02-0047,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-0055,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-2483,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-2485,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-2486,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Driver-neighbour pair list and the distinct genes found on each side of it.
graph = pd.read_csv(processeddata+"main_graph.csv")
drivers, neighbours = (
    graph[side].unique() for side in ("driver", "neighbour")
)
print("# drivers:", len(drivers))
print("# neighbours:", len(neighbours))

# drivers: 3081
# neighbours: 15465


In [7]:
# Mutational burden of a patient = number of rows flagged as non-silent.
nonsilent_per_patient = (
    mutationtab
    .groupby("patient", as_index=False)["nonsilent"]
    .sum()
)
mutation_burden = (
    nonsilent_per_patient
    .rename(columns={"nonsilent": "mutation_burden"})
    # inner join: only patients present in the expression cohort are kept
    .merge(clintab, on="patient")
    .loc[:, ["patient", "cancer_type", "mutation_burden"]]
)
print(mutation_burden.shape)
mutation_burden.to_csv(processeddata+"mutation_burden.csv", index=False)
mutation_burden.head()

(7317, 3)


Unnamed: 0,patient,cancer_type,mutation_burden
0,TCGA-02-0047,GBM,61
1,TCGA-02-0055,GBM,49
2,TCGA-02-2483,GBM,43
3,TCGA-02-2485,GBM,50
4,TCGA-02-2486,GBM,56


# Between Cancer Types Associations

## Load Data

In [8]:
# Boolean neighbour-by-driver matrix (rows: neighbours, columns: drivers, as
# shown in the preview below). Replaces the pair-list `graph` loaded earlier.
graph = pd.read_feather(processeddata+"neighbours.feather")
print(graph.shape)
graph.head()

(15465, 3081)


driver,A1CF,A2ML1,ABCA10,ABCA13,ABCA7,ABCB1,ABCB5,ABCC3,ABCC5,ABCC9,...,ZRANB3,ZRSR2,ZSCAN31,ZSCAN4,ZSWIM3,ZSWIM6,ZSWIM7,ZWILCH,ZWINT,ZZEF1
neighbour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A1CF,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A2M,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A2ML1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A4GALT,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
mutationtab = pd.read_feather(processeddata+"mutation.feather")

# cohort size: number of patients of each cancer type in the mutation table
cancer_freq = (
    mutationtab.index.to_frame(index=False)
    .groupby("cancer_type", as_index=False)
    .size()
    .rename(columns={"size": "freq"})
)

# boolean mask of the strictly positive entries of the patient-level table
mutation_filter = mutationtab.gt(0)

# per cancer type, column-wise sum of the patient-level values of each driver
mutationtab = mutationtab.groupby(["cancer_type"]).sum()

display(cancer_freq.head())
print(mutationtab.shape)
mutationtab.head()

Unnamed: 0,cancer_type,freq
0,ACC,73
1,BLCA,288
2,BRCA,760
3,CESC,253
4,CHOL,35


(32, 3081)


Unnamed: 0_level_0,A1CF,A2ML1,ABCA10,ABCA13,ABCA7,ABCB1,ABCB5,ABCC3,ABCC5,ABCC9,...,ZRANB3,ZRSR2,ZSCAN31,ZSCAN4,ZSWIM3,ZSWIM6,ZSWIM7,ZWILCH,ZWINT,ZZEF1
cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACC,0,0,0,1,0,0,1,1,0,1,...,2,1,0,0,0,1,0,0,0,0
BLCA,1,6,4,19,7,4,8,7,5,12,...,5,2,0,0,0,0,1,2,0,10
BRCA,1,5,6,15,7,4,5,4,6,10,...,2,0,0,0,4,2,0,3,2,4
CESC,3,4,1,10,3,1,3,3,2,5,...,2,1,0,1,0,0,0,2,1,2
CHOL,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Average expression of every gene within each cancer type.
exptab = pd.read_feather(processeddata+"expression.feather")
exptab = exptab.groupby(["cancer_type"]).mean()
print(exptab.shape)
exptab.head()

(32, 15465)


Unnamed: 0_level_0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAT,AAGAB,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACC,5.851781,0.135616,13.047671,3.620548,7.893425,0.310274,10.721507,10.201781,5.04,10.077123,...,7.49274,8.361507,4.685342,7.888904,9.919589,1.240137,9.573425,10.610822,9.883151,8.889315
BLCA,5.422882,0.433264,12.652847,7.206562,9.460729,0.675278,9.714722,9.418819,7.122535,10.239965,...,8.883403,9.703507,4.867361,8.038889,9.965208,3.223333,9.590139,12.098681,9.826944,9.302361
BRCA,7.128882,0.109618,13.495566,3.458474,8.081,0.671026,9.417474,9.965684,5.593118,10.629211,...,8.7745,9.377132,5.894934,8.994039,10.07525,6.206553,9.729579,11.800447,10.150566,9.807053
CESC,5.663557,0.38253,11.30913,8.673913,9.790198,0.75581,9.70751,10.105059,5.265534,10.303043,...,9.702885,10.714466,5.028024,8.102213,10.281028,6.270435,9.25415,11.797036,9.832688,9.449921
CHOL,7.678286,7.446,12.957429,0.789429,8.179714,3.774571,9.802286,9.98,5.859714,10.088,...,8.234857,8.757429,5.628286,8.532286,10.021429,1.162857,9.445429,11.936571,10.002857,9.471714


In [11]:
# Mean patient burden per cancer type, joined with the cohort sizes.
mean_burden = (
    pd.read_csv(processeddata+"mutation_burden.csv")
    .groupby("cancer_type", as_index=False)["mutation_burden"]
    .mean()
)
mutation_burden = (
    mean_burden
    .merge(cancer_freq, on="cancer_type")
    .set_index("cancer_type")
    .sort_index()
    # column names in lower case
    .rename(columns=str.lower)
)
mutation_burden.head()

Unnamed: 0_level_0,mutation_burden,freq
cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1
ACC,34.712329,73
BLCA,119.857639,288
BRCA,46.130263,760
CESC,87.233202,253
CHOL,33.828571,35


In [12]:
# Shape overview of the loaded tables.
for label, table in [
    ("exptab:", exptab),
    ("mutationtab:", mutationtab),
    ("mutation_burden:", mutation_burden),
    ("graph:", graph),
]:
    print(label, table.shape)

exptab: (32, 15465)
mutationtab: (32, 3081)
mutation_burden: (32, 2)
graph: (15465, 3081)


## Data preparation

In [13]:
# Expression as a plain array, rows ordered by cancer type.
# NOTE(review): the gene columns keep the order stored in the file; presumably
# this already matches the row order of `graph` — confirm.
exparray = exptab.sort_index(level=0).to_numpy()

# log10(mean burden + 1) per cancer type, used as the burden correction
burden_correction = np.log10(mutation_burden.mutation_burden+1)
mutationarray = (
    mutationtab
    # relative frequency: counts divided by cohort size
    .div(mutation_burden.freq, axis=0)
    # drivers (columns) and cancer types (rows) in sorted order
    .sort_index(axis=1)
    .sort_index(level=0)
    # correct the frequency for mutational burden
    .div(burden_correction, axis=0)
    .fillna(0)
    .to_numpy()
)

# driver-by-cancer-type mask: True where the corrected frequency is positive,
# used to leave out cancer types without mutations before correlating
cancertype_filter = mutationarray.T > 0

In [14]:
# Shape overview of the prepared arrays.
for label, table in [
    ("exparray:", exparray),
    ("mutationarray:", mutationarray),
    ("graph:", graph),
    ("cancertype_filter:", cancertype_filter),
]:
    print(label, table.shape)

exparray: (32, 15465)
mutationarray: (32, 3081)
graph: (15465, 3081)
cancertype_filter: (3081, 32)


In [15]:
# Per driver: total of the mutation table over all cancer types (nmut) and
# number of cancer types that pass the filter (nct).
nmut = mutationtab.sum(axis=0).sort_index()
nct = pd.Series(cancertype_filter.sum(axis=1), index=nmut.index)
nct_nmut = pd.concat({"nct": nct, "nmut": nmut}, axis=1)
nct_nmut.head()

Unnamed: 0,nct,nmut
A1CF,16,39
A2ML1,21,67
ABCA10,20,73
ABCA13,28,266
ABCA7,20,66


## All cancer types

In [16]:
# Correlation for every driver-neighbour pair of the graph.
corr = between_cancer_corr(exparray, mutationarray, graph, cancertype_filter)
# attach the number of cancer types and of mutations of each driver
corr = corr.merge(nct_nmut, left_on="driver", right_index=True)
corr.head()

  0%|          | 0/3081 [00:00<?, ?it/s]

Unnamed: 0,driver,rho,rho_pval,neighbour,nct,nmut
0,A1CF,0.364706,0.164868,APOB,16,39
1,A1CF,0.273529,0.305323,APOBEC1,16,39
2,A1CF,0.270588,0.310761,APOBEC2,16,39
3,A1CF,-0.088235,0.745222,APOBEC3A,16,39
4,A1CF,-0.082353,0.761733,APOBEC3B,16,39


In [17]:
# save the observed correlations (all cancer types) for the later sections
corr.to_feather(processeddata+"bct_obs.feather")

## Low and high mutation frequency groups

In [18]:
# Per driver (column), median of the strictly positive corrected frequencies
# across cancer types; zeros are masked as NaN so they do not enter the median.
median = np.nanmedian(np.where(mutationarray > 0, mutationarray, np.nan), axis=0)
# Cancer types at or below / at or above the driver's median. A value equal to
# the median belongs to both groups.
low_mut_filter = cancertype_filter & (mutationarray <= median).T
high_mut_filter = cancertype_filter & (mutationarray >= median).T
# Keep drivers with more than two cancer types in the high group.
# NOTE(review): the size of the low group is not checked here — confirm that
# this is intended.
driver_indexer = np.flatnonzero(high_mut_filter.sum(axis=1) > 2)

# NOTE: `mutationarray` is overwritten with its filtered version, so this cell
# cannot be re-run without first rebuilding `mutationarray`.
mutationarray = mutationarray[:, driver_indexer]
low_mut_group = low_mut_filter[driver_indexer]
high_mut_group = high_mut_filter[driver_indexer]
filtgraph = graph.iloc[:, driver_indexer]
print("mutationarray:", mutationarray.shape)
print("low_mut_group:", low_mut_group.shape)
print("high_mut_group:", high_mut_group.shape)
print("filtgraph:", filtgraph.shape)

mutationarray: (32, 2972)
low_mut_group: (2972, 32)
high_mut_group: (2972, 32)
filtgraph: (15465, 2972)


In [19]:
# Run the correlation once per mutation-frequency group, then put the two
# result tables side by side, one row per driver-neighbour pair.
group_filters = {"low_mut": low_mut_group, "high_mut": high_mut_group}

per_group = []
for group, filter_ in group_filters.items():
    print(group)
    corr = between_cancer_corr(exparray, mutationarray, filtgraph, filter_)
    per_group.append(corr)

low_corr, high_corr = per_group
corr_results = low_corr.merge(
    high_corr,
    on=["driver", "neighbour"],
    suffixes=("_low_mut", "_high_mut"),
)
corr_results.head()

low_mut


  0%|          | 0/2972 [00:00<?, ?it/s]

high_mut


  0%|          | 0/2972 [00:00<?, ?it/s]

Unnamed: 0,driver,rho_low_mut,rho_pval_low_mut,neighbour,rho_high_mut,rho_pval_high_mut
0,A1CF,0.809524,0.014903,APOB,0.357143,0.385121
1,A1CF,-0.285714,0.492726,APOBEC1,0.357143,0.385121
2,A1CF,0.333333,0.419753,APOBEC2,0.47619,0.232936
3,A1CF,-0.404762,0.319889,APOBEC3A,-0.428571,0.289403
4,A1CF,-0.309524,0.455645,APOBEC3B,-0.047619,0.910849


In [20]:
# save the low/high mutation-frequency group correlations
corr_results.to_feather(processeddata+"bct_obs_mutation_groups.feather")

# Filtered Results

In [21]:
# Full pair list and the list of pairs that must be removed from it.
maingraph, pairs_to_exclude = (
    pd.read_csv(processeddata+filename)
    for filename in ("main_graph.csv", "pairs_to_exclude_bct.csv")
)
print("# pairs in main graph:", len(maingraph))
print("# pairs to exclude:", len(pairs_to_exclude))
display(maingraph.head())
pairs_to_exclude.head()

# pairs in main graph: 456493
# pairs to exclude: 135440


Unnamed: 0,driver,neighbour
0,A1CF,APOB
1,A1CF,APOBEC1
2,A1CF,APOBEC2
3,A1CF,APOBEC3A
4,A1CF,APOBEC3B


Unnamed: 0,driver,neighbour
0,A1CF,METTL14
1,A2ML1,L2HGDH
2,A2ML1,MOGAT3
3,A2ML1,STX17
4,ABCA13,FANCD2


In [22]:
# Anti-join: keep the pairs of the main graph that are absent from the
# exclusion list.
merged_pairs = maingraph.merge(pairs_to_exclude, how="outer", indicator=True)
only_in_main = merged_pairs["_merge"] == "left_only"
filteredpairs = merged_pairs.loc[only_in_main].drop(columns=["_merge"])

print("pairs in filtered graph:", len(filteredpairs))
print("# drivers:", filteredpairs["driver"].nunique())
print("# neighbours:", filteredpairs["neighbour"].nunique())

pairs in filtered graph: 321053
# drivers: 2620
# neighbours: 15153


In [23]:
# Reload the observed correlations saved earlier in this notebook.
bct_corr = pd.read_feather(processeddata+"bct_obs.feather")
print(len(bct_corr))
bct_corr.head()

456330


Unnamed: 0,driver,rho,rho_pval,neighbour,nct,nmut
0,A1CF,0.364706,0.164868,APOB,16,39
1,A1CF,0.273529,0.305323,APOBEC1,16,39
2,A1CF,0.270588,0.310761,APOBEC2,16,39
3,A1CF,-0.088235,0.745222,APOBEC3A,16,39
4,A1CF,-0.082353,0.761733,APOBEC3B,16,39


In [24]:
# Restrict the observed correlations to the filtered pairs (join on the
# columns the two tables share) and fix the column order.
result_columns = ["driver", "neighbour", "rho", "rho_pval", "nct", "nmut"]
filtered_bct_corr = bct_corr.merge(filteredpairs).loc[:, result_columns]

n_significant = (filtered_bct_corr.rho_pval < 0.05).sum()
print("# pairs:", len(filtered_bct_corr))
print("# significant pairs:", n_significant)
print("# drivers:", filtered_bct_corr["driver"].nunique())
print("# neighbours:", filtered_bct_corr["neighbour"].nunique())
filtered_bct_corr.head()

# pairs: 320944
# significant pairs: 30857
# drivers: 2620
# neighbours: 15148


Unnamed: 0,driver,neighbour,rho,rho_pval,nct,nmut
0,A1CF,APOB,0.364706,0.164868,16,39
1,A1CF,APOBEC1,0.273529,0.305323,16,39
2,A1CF,APOBEC2,0.270588,0.310761,16,39
3,A1CF,APOBEC3A,-0.088235,0.745222,16,39
4,A1CF,APOBEC3B,-0.082353,0.761733,16,39


In [25]:
# Persist the filtered pair list (read back by the shuffle section) and the
# correlations restricted to those pairs.
filteredpairs.to_csv(processeddata+"main_graph_filtered.csv", index=False)
filtered_bct_corr.to_feather(processeddata+"bct_obs_filtered.feather")

# Random shuffles

## Load Data

In [26]:
# Filtered pair list and its distinct drivers / neighbours, each in order of
# first appearance in the file.
shufflepairs = pd.read_csv(processeddata+"main_graph_filtered.csv")
drivers = shufflepairs["driver"].unique()
neighbours = shufflepairs["neighbour"].unique()
for label, genes in [
    ("# shuffle pairs:", shufflepairs),
    ("# drivers:", drivers),
    ("# neighbours:", neighbours),
]:
    print(label, len(genes))
shufflepairs.head()

# shuffle pairs: 321053
# drivers: 2620
# neighbours: 15153


Unnamed: 0,driver,neighbour
0,A1CF,APOB
1,A1CF,APOBEC1
2,A1CF,APOBEC2
3,A1CF,APOBEC3A
4,A1CF,APOBEC3B


In [27]:
# Patient-level expression of the neighbour genes only. The first index level
# (the patient label) is discarded, leaving the cancer type as the index.
exptab = pd.read_feather(processeddata+"expression.feather")
exptab = exptab.loc[:, neighbours].reset_index(level=0, drop=True)
print(exptab.shape)
exptab.head()

(7317, 15153)


Unnamed: 0_level_0,APOB,APOBEC1,APOBEC2,APOBEC3A,APOBEC3B,APOBEC3C,APOBEC3F,APOBEC3G,APOBEC3H,APOBEC4,...,LY6G6C,TPPP3,PZP,MTHFD2L,ZCWPW1,GTF2IRD2B,ZNF354A,KRTAP17-1,VCX,WFDC10A
cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GBM,1.79,0.0,2.23,0.69,4.68,7.71,5.95,6.95,1.16,0.0,...,1.16,12.48,0.0,9.07,5.54,7.51,7.48,0.0,0.0,0.0
GBM,2.95,0.0,1.7,4.2,6.9,8.33,7.18,8.51,2.84,0.0,...,0.64,9.0,0.64,8.86,6.15,7.0,8.24,0.0,0.0,0.64
GBM,0.0,0.0,1.82,3.63,5.07,6.12,4.7,6.57,1.64,0.0,...,2.13,7.56,0.88,9.1,5.53,7.88,8.03,0.0,0.0,0.0
GBM,0.97,0.0,1.54,2.53,5.96,6.87,6.68,7.81,2.64,0.0,...,1.54,9.99,2.12,8.96,7.31,8.54,8.37,0.0,0.56,0.0
GBM,2.7,0.0,1.5,5.08,4.77,8.91,7.42,9.61,5.0,0.69,...,1.15,12.42,1.15,8.33,7.63,8.07,8.71,0.0,0.0,0.0


In [28]:
mutationtab = pd.read_feather(processeddata+"mutation.feather")

# number of patients of each cancer type
cancer_freq = (
    mutationtab.index.to_frame(index=False)
    .groupby("cancer_type")
    .size()
)

# relative frequency of each retained driver per cancer type
mutationtab = (
    mutationtab
    .loc[:, drivers]
    .groupby(["cancer_type"])
    .sum()
    .div(cancer_freq, axis=0)
    # sorted drivers (columns) and cancer types (rows)
    .sort_index(axis=1)
    .sort_index()
)
print(cancer_freq.shape)
print(mutationtab.shape)
mutationtab.head()

(32,)
(32, 2620)


Unnamed: 0_level_0,A1CF,A2ML1,ABCA10,ABCA13,ABCA7,ABCB1,ABCB5,ABCC3,ABCC5,ABCC9,...,ZNRF3,ZPBP2,ZRANB3,ZRSR2,ZSCAN31,ZSCAN4,ZSWIM3,ZSWIM6,ZWINT,ZZEF1
cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACC,0.0,0.0,0.0,0.013699,0.0,0.0,0.013699,0.013699,0.0,0.013699,...,0.041096,0.0,0.027397,0.013699,0.0,0.0,0.0,0.013699,0.0,0.0
BLCA,0.003472,0.020833,0.013889,0.065972,0.024306,0.013889,0.027778,0.024306,0.017361,0.041667,...,0.003472,0.003472,0.017361,0.006944,0.0,0.0,0.0,0.0,0.0,0.034722
BRCA,0.001316,0.006579,0.007895,0.019737,0.009211,0.005263,0.006579,0.005263,0.007895,0.013158,...,0.003947,0.002632,0.002632,0.0,0.0,0.0,0.005263,0.002632,0.002632,0.005263
CESC,0.011858,0.01581,0.003953,0.039526,0.011858,0.003953,0.011858,0.011858,0.007905,0.019763,...,0.003953,0.003953,0.007905,0.003953,0.0,0.003953,0.0,0.0,0.003953,0.007905
CHOL,0.028571,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# log10(mean patient burden + 1) for every cancer type, sorted by cancer type.
mutation_burden = (
    pd.read_csv(processeddata+"mutation_burden.csv")
    .groupby("cancer_type")["mutation_burden"]
    .mean()
    .sort_index()
    .pipe(lambda mean_burden: np.log10(mean_burden + 1))
)
mutation_burden.head()

cancer_type
ACC     1.552818
BLCA    2.082274
BRCA    1.673300
CESC    1.945632
CHOL    1.541936
Name: mutation_burden, dtype: float64

In [30]:
# Shape overview of the shuffle inputs.
for label, table in [
    ("exptab:", exptab),
    ("mutationtab:", mutationtab),
    ("mutation_burden:", mutation_burden),
]:
    print(label, table.shape)

exptab: (7317, 15153)
mutationtab: (32, 2620)
mutation_burden: (32,)


## Data preparation

In [31]:
# filter to remove cancer types with no mutations prior to correlation calculation
cancertype_filter = (
    mutationtab.T
    .reset_index(names="driver")
    .set_index("driver")
    # remove cancer types with no mutations
    .transform(lambda x: x > 0)
    .to_numpy()
)

In [32]:
# Turn the pair list into a boolean neighbour-by-driver matrix: True where the
# (driver, neighbour) pair is listed, False elsewhere.
shuffle_graph = (
    shufflepairs
    # constant marker column to pivot on
    .assign(value=1)
    # combinations absent from the pair list are filled with 0
    .pivot_table(index="neighbour", columns="driver", values="value", fill_value=0)
    .astype(bool)
)
shuffle_graph.head()

driver,A1CF,A2ML1,ABCA10,ABCA13,ABCA7,ABCB1,ABCB5,ABCC3,ABCC5,ABCC9,...,ZNRF3,ZPBP2,ZRANB3,ZRSR2,ZSCAN31,ZSCAN4,ZSWIM3,ZSWIM6,ZWINT,ZZEF1
neighbour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A1CF,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A2M,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A2ML1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A4GALT,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Shuffle

In [33]:
# Burden-corrected mutation frequencies as a plain array
# (rows: cancer types, columns: drivers).
mutationarray = mutationtab.div(mutation_burden, axis=0).fillna(0).to_numpy()

for label, table in [
    ("exptab:", exptab),
    ("mutationarray:", mutationarray),
    ("shuffle graph:", shuffle_graph),
    ("cancertype_filter:", cancertype_filter),
]:
    print(label, table.shape)

exptab: (7317, 15153)
mutationarray: (32, 2620)
shuffle graph: (15153, 2620)
cancertype_filter: (2620, 32)


Create a seed for the pseudo-random number generator. Always using the same seed guarantees reproducibility.

In [34]:
# Prints a freshly drawn 128-bit integer; the value differs on every execution.
print(secrets.randbits(128))

224922765619793089663346385743205238837


To run BCT random shuffles, set `run = True` in the cell below.

In [35]:
# Set to True to (re)compute the random shuffles; results are written to disk.
run = False
# Fixed seed, so that every execution produces the same shuffles.
seed = 61028791052787875347939602848649680021
rng = np.random.default_rng(seed)
# one child generator per shuffle (100 shuffles)
spawn = rng.spawn(100)
# cancer-type label of every patient row of the expression table
index = exptab.index.to_numpy()

if run:
    for i, child_rng in enumerate(tqdm(spawn)):
        # shuffle expression data: permute the cancer-type labels of the
        # patients, then average the expression within each shuffled label
        shuffled_index = pd.Index(child_rng.permutation(index), name="cancer_type")
        exparray = (
            exptab
            .set_index(shuffled_index)
            .groupby(["cancer_type"])
            .mean()
            .sort_index()
            # `exptab` columns follow the order of first appearance in the
            # pair list, whereas the rows of `shuffle_graph` are sorted by the
            # pivot. The expression is handed over as a bare array, so it is
            # presumably matched to the graph rows by position (TODO confirm in
            # python_functions); select the columns in graph-row order so that
            # every column lines up with its own neighbour. A missing gene
            # raises a KeyError instead of silently misaligning.
            .loc[:, shuffle_graph.index]
            .to_numpy()
        )

        # compute correlation
        corr = between_cancer_corr(
            exparray,
            mutationarray,
            shuffle_graph,
            cancertype_filter,
            progressbar=False,
        )
        # save results, one file per shuffle (numbered from 1)
        corr.to_feather(shuffled_dir+f"shuffle{i+1}.feather")

# Results

In [36]:
# Reload the filtered all-cancer-type correlations and the low/high group
# correlations, and preview two rows of each.
filted_bct_corr, grouped_bct_corr = (
    pd.read_feather(processeddata+filename)
    for filename in ("bct_obs_filtered.feather", "bct_obs_mutation_groups.feather")
)
display(filted_bct_corr.head(2))
grouped_bct_corr.head(2)

Unnamed: 0,driver,neighbour,rho,rho_pval,nct,nmut
0,A1CF,APOB,0.364706,0.164868,16,39
1,A1CF,APOBEC1,0.273529,0.305323,16,39


Unnamed: 0,driver,rho_low_mut,rho_pval_low_mut,neighbour,rho_high_mut,rho_pval_high_mut
0,A1CF,0.809524,0.014903,APOB,0.357143,0.385121
1,A1CF,-0.285714,0.492726,APOBEC1,0.357143,0.385121


In [37]:
# One row per pair present in both tables: coefficients first, p-values after.
result_columns = [
    "driver", "neighbour",
    "rho", "rho_low_mut", "rho_high_mut",
    "rho_pval", "rho_pval_low_mut", "rho_pval_high_mut",
]
results = (
    filted_bct_corr
    .merge(grouped_bct_corr, on=["driver", "neighbour"])
    .loc[:, result_columns]
)

In [38]:
# --- counts of significant pairs (p < 0.05) ---------------------------------
significant = results.rho_pval < 0.05
significant_low = results.rho_pval_low_mut < 0.05
significant_high = results.rho_pval_high_mut < 0.05

print("# pairs:", len(results))
print("# significant pairs:", significant.sum())
print("# significant pairs (low mutation):", significant_low.sum())
print("# significant pairs (high mutation):", significant_high.sum())
print("# significant pairs in common:",
      len(results[significant & significant_low & significant_high]))

# --- agreement between the three sets of coefficients -----------------------
subsets = {
    "All pairs": results,
    "Significant pairs": results[significant],
    "Non-significant pairs": results[results.rho_pval >= 0.05],
}
rho_columns = ["rho", "rho_low_mut", "rho_high_mut"]
comparisons = [
    ("rho", "rho_low_mut"),
    ("rho", "rho_high_mut"),
    ("rho_low_mut", "rho_high_mut"),
]

stats = []
for pair_type, data in subsets.items():
    print()
    print(pair_type)

    # sign of every coefficient: the value divided by its magnitude
    concordance = data[rho_columns] / data[rho_columns].abs()

    for first, second in comparisons:
        # rank correlation between the two sets of coefficients
        rho, pval = spstats.spearmanr(
            data[first], data[second], alternative="two-sided")
        # percentage of pairs whose two coefficients have the same sign
        same_sign = (concordance[first] == concordance[second]).sum()
        signconc = (same_sign / len(concordance) * 100).round(1)
        stats.append({
            "pair_type": pair_type,
            "comparison": f"{first} vs {second}",
            "rho": round(rho, 3),
            "pval": round(pval, 3),
            "sign_concordance_pct": signconc,
        })
pd.DataFrame(stats).set_index(["pair_type", "comparison"])

# pairs: 320736
# significant pairs: 30821
# significant pairs (low mutation): 21281
# significant pairs (high mutation): 22376
# significant pairs in common: 348

All pairs

Significant pairs

Non-significant pairs


Unnamed: 0_level_0,Unnamed: 1_level_0,rho,pval,sign_concordance_pct
pair_type,comparison,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All pairs,rho vs rho_low_mut,0.399,0.0,63.0
All pairs,rho vs rho_high_mut,0.432,0.0,64.3
All pairs,rho_low_mut vs rho_high_mut,-0.073,0.0,45.8
Significant pairs,rho vs rho_low_mut,0.665,0.0,83.1
Significant pairs,rho vs rho_high_mut,0.687,0.0,83.3
Significant pairs,rho_low_mut vs rho_high_mut,0.391,0.0,69.1
Non-significant pairs,rho vs rho_low_mut,0.336,0.0,60.8
Non-significant pairs,rho vs rho_high_mut,0.373,0.0,62.3
Non-significant pairs,rho_low_mut vs rho_high_mut,-0.143,0.0,43.4
