In [2]:
import pandas as pd
import numpy as np
from functools import reduce
import os

# Functions

In [2]:
def check_dir(dir: str) -> None:
    """
    Create the directory "dir" (and any missing parents) if it does not exist.

    Args:
        dir (str): Path to the directory.

    Raises:
        FileExistsError: If "dir" exists but is not a directory.
    """
    # exist_ok=True makes the call a no-op for an existing directory and
    # avoids the race between a separate existence check and the creation.
    os.makedirs(dir, exist_ok=True)

# Directories

In [3]:
# Output locations: the main data folder and its "shuffle/" subfolder.
# Both are created up front if they are missing.
datadir = "data/"
shuffledir = f"{datadir}shuffle/"
for _outdir in (datadir, shuffledir):
    check_dir(_outdir)

In [4]:
# Names of the interaction networks combined into the main graph.
networks = [
    "biogrid",
    "apid",
    "huri",
    "string",
    "omnipath",
]

# Process Raw Data

## Clinical Data

In [3]:
# Load the tab-separated clinical / survival table.
clintab = pd.read_csv('Survival_SupplementalTable_S1_20171025_xena_sp', sep="\t")
clintab.head(2)

Unnamed: 0,sample,_PATIENT,cancer type abbreviation,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,clinical_stage,histological_type,histological_grade,...,residual_tumor,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
0,TCGA-OR-A5J1-01,TCGA-OR-A5J1,ACC,58.0,MALE,WHITE,Stage II,,Adrenocortical carcinoma- Usual Type,,...,,1.0,1355.0,1.0,1355.0,1.0,754.0,1.0,754.0,
1,TCGA-OR-A5J2-01,TCGA-OR-A5J2,ACC,44.0,FEMALE,WHITE,Stage IV,,Adrenocortical carcinoma- Usual Type,,...,,1.0,1677.0,1.0,1677.0,,,1.0,289.0,


In [6]:
# Sample -> patient / cancer-type lookup with shorter column names.
renamed_columns = {
    "_PATIENT": "patient",
    "cancer type abbreviation": "cancer_type",
}
cancertype = clintab.loc[:, ["sample", *renamed_columns]].rename(columns=renamed_columns)
print(cancertype.shape[0])
cancertype.head(2)

12591


Unnamed: 0,sample,patient,cancer_type
0,TCGA-OR-A5J1-01,TCGA-OR-A5J1,ACC
1,TCGA-OR-A5J2-01,TCGA-OR-A5J2,ACC


In [7]:
# Load the phenotype table and attach patient / cancer type to every sample.
# The merge is an inner join on "sample", so samples that are absent from
# cancertype are dropped (row counts are printed before and after).
phenotab = pd.read_csv('TCGA_phenotype_denseDataOnlyDownload.tsv', sep="\t")
print(phenotab.shape[0])
phenotab = phenotab.merge(cancertype, on="sample")
print(phenotab.shape[0])
phenotab.head(2)

12804
12591


Unnamed: 0,sample,sample_type_id,sample_type,_primary_disease,patient,cancer_type
0,TCGA-D3-A1QA-07,7.0,Additional Metastatic,skin cutaneous melanoma,TCGA-D3-A1QA,SKCM
1,TCGA-DE-A4MD-06,6.0,Metastatic,thyroid carcinoma,TCGA-DE-A4MD,THCA


In [8]:
# Keep only samples whose type is primary tumour or solid tissue normal
kept_sample_types = ["Primary Tumor", "Solid Tissue Normal"]
phenotab = phenotab.loc[phenotab["sample_type"].isin(kept_sample_types)]

# Number of distinct patients left after the filter
print(len(phenotab["patient"].unique()))
phenotab.head()

10514


Unnamed: 0,sample,sample_type_id,sample_type,_primary_disease,patient,cancer_type
662,TCGA-ND-A4WA-01,1.0,Primary Tumor,uterine carcinosarcoma,TCGA-ND-A4WA,UCS
663,TCGA-NF-A5CP-01,1.0,Primary Tumor,uterine carcinosarcoma,TCGA-NF-A5CP,UCS
664,TCGA-N8-A4PP-01,1.0,Primary Tumor,uterine carcinosarcoma,TCGA-N8-A4PP,UCS
665,TCGA-N7-A4Y5-01,1.0,Primary Tumor,uterine carcinosarcoma,TCGA-N7-A4Y5,UCS
666,TCGA-N6-A4VE-01,1.0,Primary Tumor,uterine carcinosarcoma,TCGA-N6-A4VE,UCS


## Drivers Data

In [10]:
# Load the tab-separated NCG cancer-driver annotation table.
cancerdrivers = pd.read_csv('NCG_cancerdrivers_annotation_supporting_evidence.tsv', sep="\t")
cancerdrivers.head(2)

Unnamed: 0,entrez,symbol,pubmed_id,type,organ_system,primary_site,cancer_type,method,coding_status,cgc_annotation,vogelstein_annotation,saito_annotation,NCG_oncogene,NCG_tsg
0,23,ABCF1,31444325,WGS-WES,Hematologic and lymphatic,blood,multiple_myeloma,dNdScv,coding,,,,,
1,25,ABL1,29625053,Pan-cancer,Multiple,multiple,pan-cancer_adult,PanSoftWare,coding,"oncogene, fusion",Oncogene,,1.0,0.0


In [11]:
# Mutation matrix; the file's index column is labelled "sample", but its
# labels are matched against gene symbols below.
mutationtab = pd.read_csv('mc3.v0.2.8.PUBLIC.nonsilentGene.xena', sep="\t", index_col="sample")

# Drivers = NCG symbols that also have a row in the mutation matrix
ncg_symbols = cancerdrivers["symbol"].unique()
mutated_genes = mutationtab.index.to_numpy(dtype="str")
cancerdriverlist = np.intersect1d(ncg_symbols, mutated_genes)
print(cancerdriverlist.size)

# Keep the driver rows, transpose so drivers become columns, then replace the
# sample index by (patient, cancer_type) via an inner join on the sample id
sample_info = cancertype.set_index("sample")
mutationtab = (
    mutationtab.loc[cancerdriverlist]
    .T
    .merge(sample_info, left_index=True, right_index=True)
    .set_index(["patient", "cancer_type"])
)

print(mutationtab.shape)
mutationtab.head()

3199
(9080, 3199)


Unnamed: 0_level_0,Unnamed: 1_level_0,A1CF,A2ML1,AADACL4,ABCA10,ABCA13,ABCA7,ABCB1,ABCB5,ABCC3,ABCC5,...,ZRANB3,ZRSR2,ZSCAN31,ZSCAN4,ZSWIM3,ZSWIM6,ZSWIM7,ZWILCH,ZWINT,ZZEF1
patient,cancer_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
TCGA-02-0003,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-0033,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-0047,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-0055,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-2470,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Expression Data

In [12]:
exptab = pd.read_table('EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena', index_col="sample")

# Remove duplicated rows. The index column is labelled "sample" in the file,
# but its labels become the gene columns after the transpose below, so these
# are duplicated gene rows.
exptab = exptab.loc[~exptab.index.duplicated()]

# Minimum expression of each gene across all samples (one value per row).
# It is computed on the de-duplicated frame so that its index is unique.
gene_min = exptab.min(axis=1)

# Transpose so genes become columns, fill missing values with the minimum
# expression of the same gene, and add cancer type, patient and sample type.
# NOTE: DataFrame.fillna matches the keys of a Series against COLUMN labels.
# The previous code passed exptab.min(axis=0), which is keyed by sample id and
# therefore matched no gene column, leaving every missing value unfilled.
exptab = (
    exptab.T
    .fillna(gene_min)
    .merge(cancertype.set_index("sample"), left_index=True, right_index=True)
    .reset_index(names="sample")
    .merge(phenotab[["sample", "sample_type"]], on="sample")
    .set_index(["patient", "sample_type", "cancer_type"])
    .drop(columns="sample")
)
print(exptab.shape)
exptab.head()

(10394, 20530)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
patient,sample_type,cancer_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
TCGA-OR-A5J1,Primary Tumor,ACC,0.0,2.09,2.3,7.23,10.99,0.0,8.1,1.29,0.0,0.0,...,7.53,7.21,4.44,8.46,10.04,0.57,9.34,10.85,10.18,9.22
TCGA-OR-A5J2,Primary Tumor,ACC,0.0,1.88,3.32,6.36,10.35,0.0,7.65,0.0,0.0,0.0,...,8.05,8.78,5.86,8.13,11.54,5.02,10.19,11.58,10.89,9.65
TCGA-OR-A5J3,Primary Tumor,ACC,0.0,1.45,2.92,6.45,10.04,0.0,8.45,0.67,0.0,0.0,...,6.52,7.58,5.35,8.96,9.84,0.67,9.66,11.38,10.53,8.78
TCGA-OR-A5J5,Primary Tumor,ACC,0.0,0.0,1.35,5.78,11.2,0.0,8.78,0.83,0.0,0.0,...,8.03,9.72,4.23,7.69,9.8,3.66,9.12,11.21,10.16,9.01
TCGA-OR-A5J6,Primary Tumor,ACC,0.0,0.0,2.45,6.09,10.3,0.0,7.23,0.0,0.0,0.0,...,6.03,6.0,3.79,6.89,9.81,3.14,9.64,9.47,9.64,8.9


In [13]:
# Split expression data in Normal and Tumour
tumourexp = exptab.loc[(slice(None), "Primary Tumor", slice(None))]
normalexp = exptab.loc[(slice(None), "Solid Tissue Normal", slice(None))]

print(len(tumourexp), len(normalexp))

9675 719


## Driver-Neighbour Network
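We build a main graph by combining the interaction data of the five networks listed in `networks` (biogrid, apid, huri, string and omnipath).

As a first filter, we keep only the interactions in which one partner is a cancer driver from NCG and the other partner (the neighbour) is present in the expression dataset.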

In [14]:
# For every network, orient each interaction as (driver, neighbour): a row is
# kept once for each of its first two columns that holds a cancer driver, and
# only if the neighbour is a column of the tumour expression table.
network_graphs = []
for net in networks:
    ppi = pd.read_csv(f"{datadir}{net}_graph.csv")
    first, second = ppi.columns[0], ppi.columns[1]
    driver_first = ppi.loc[ppi[first].isin(cancerdriverlist)].rename(
        columns={f"{first}": "driver", f"{second}": "neighbour"}
    )
    driver_second = ppi.loc[ppi[second].isin(cancerdriverlist)].rename(
        columns={f"{first}": "neighbour", f"{second}": "driver"}
    )
    graph = pd.concat([driver_first, driver_second]).drop_duplicates()
    graph = graph.loc[graph["neighbour"].isin(tumourexp.columns)]
    network_graphs.append(graph)

# Merge the per-network graphs and drop rows repeated across networks
maingraph = pd.concat(network_graphs).drop_duplicates()
print("# of drivers:", maingraph["driver"].unique().size)
print("# of neighbours:", maingraph["neighbour"].unique().size)
maingraph.head()

# of drivers: 3138
# of neighbours: 15487


Unnamed: 0,driver,neighbour
0,MAP2K4,FLNC
2,ACVR1,FNTA
3,GATA2,PML
8,XRN1,ALDOA
11,CITED2,TFAP2A


## Intersect Drivers, Neighbours & Samples

In [15]:
# Patients present in the mutation table and in both expression tables
patients_per_table = [
    table.index.get_level_values("patient")
    for table in (mutationtab, normalexp, tumourexp)
]
common_samples = reduce(np.intersect1d, patients_per_table).tolist()
print(len(common_samples))

665


In [16]:
# Select drivers whose column sum over the common samples is 3 or more
mutated_counts = mutationtab.loc[common_samples].sum(axis=0)
drivers = mutated_counts.index[(mutated_counts >= 3).to_numpy()]

# Keep only drivers that also appear in the main graph
drivers = np.intersect1d(drivers, maingraph["driver"].unique())
print(drivers.size)

2570


In [17]:
# Keep only the selected driver columns, then sort the columns and the rows
mutationtab = (
    mutationtab.loc[:, drivers]
    .sort_index(axis=1)
    .sort_index(axis=0, level=0)
)
print(mutationtab.shape)
mutationtab.head()

(9080, 2570)


Unnamed: 0_level_0,Unnamed: 1_level_0,A1CF,A2ML1,ABCA10,ABCA13,ABCA7,ABCB1,ABCB5,ABCC3,ABCC5,ABCC9,...,ZPBP2,ZRANB3,ZRSR2,ZSCAN31,ZSCAN4,ZSWIM3,ZSWIM6,ZWILCH,ZWINT,ZZEF1
patient,cancer_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
TCGA-02-0003,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-0033,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-0047,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-0055,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-02-2470,GBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Restrict mutationtab and tumourexp to the row labels they share
# (inner join on the row index); row counts are printed before and after
print(mutationtab.shape[0], tumourexp.shape[0])
mutationtab, tumourexp = mutationtab.align(tumourexp, join="inner", axis=0)
print(mutationtab.shape[0], tumourexp.shape[0])

9080 9675
8404 8404


In [19]:
# Drop interactions whose driver is not in the selected drivers
print(maingraph["driver"].unique().size)
maingraph = maingraph.loc[maingraph["driver"].isin(drivers)]
print(maingraph["driver"].unique().size)

3138
2570


In [20]:
# Restrict the normal expression table to the common samples
normalexp = normalexp.loc[common_samples]
tumour_common = tumourexp.loc[common_samples]

# A neighbour is kept when its summed absolute expression over the common
# samples is non-zero in both the normal and the tumour table, and it appears
# in maingraph
normal_total = normalexp.abs().sum(axis=0)
tumour_total = tumour_common.abs().sum(axis=0)
neighbours = reduce(np.intersect1d, [
    normal_total.index[normal_total.to_numpy() != 0],
    tumour_total.index[tumour_total.to_numpy() != 0],
    maingraph["neighbour"].unique(),
])

# Keep only those neighbour columns, then sort the columns and the rows
normalexp = normalexp.loc[:, neighbours].sort_index(axis=1).sort_index(axis=0, level=0)
tumourexp = tumourexp.loc[:, neighbours].sort_index(axis=1).sort_index(axis=0, level=0)
print(normalexp.shape, tumourexp.shape, sep="\n")

(665, 15206)
(8404, 15206)


In [21]:
# Drop interactions whose neighbour is not in the selected neighbours
maingraph = maingraph.loc[maingraph["neighbour"].isin(neighbours)]

print("# of interactions:", maingraph.shape[0])
print("# of drivers:", maingraph["driver"].unique().size)
print("# of neighbours:", maingraph["neighbour"].unique().size)

# of interactions: 383337
# of drivers: 2570
# of neighbours: 15206


In [22]:
# Boolean neighbour x driver table: True where the (driver, neighbour) pair is
# a row of maingraph. Pairs absent from maingraph come out of the pivot as
# missing values, so notna() marks exactly the pairs that are present.
neighbourtab = (
    maingraph.assign(value=1)
    .pivot(index="neighbour", columns="driver", values="value")
    .notna()
    .sort_index(axis=1)
    .sort_index(axis=0)
)
print(neighbourtab.shape)
neighbourtab.head(2)

(15206, 2570)


driver,A1CF,A2ML1,ABCA10,ABCA13,ABCA7,ABCB1,ABCB5,ABCC3,ABCC5,ABCC9,...,ZPBP2,ZRANB3,ZRSR2,ZSCAN31,ZSCAN4,ZSWIM3,ZSWIM6,ZWILCH,ZWINT,ZZEF1
neighbour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A1CF,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Write the processed tables as Feather files and the edge list as CSV,
# all inside datadir
feather_outputs = {
    "neighbours.feather": neighbourtab,
    "mutation.feather": mutationtab.sort_index(axis=1).sort_index(axis=0, level=0),
    "tumour_expression.feather": tumourexp,
    "normal_expression.feather": normalexp,
}
for filename, table in feather_outputs.items():
    table.to_feather(datadir + filename)

# The edge list is sorted by driver, then neighbour, and saved without the index
maingraph.sort_values(["driver", "neighbour"]).to_csv(datadir + "main_graph.csv", index=False)

`tumourexp` & `normalexp`$= \text{sample} \times \text{neighbour}$

`neighbourtab`$= \text{neighbour} \times \text{driver}$

`mutationtab`$= \text{sample} \times \text{driver}$