In [1]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


from pyfaidx import Fasta
from seq2atac.analysis import fasta_file
# Open the FASTA file at the path exported by seq2atac.analysis
# (presumably the reference genome - confirm; not referenced by the other visible cells).
fasta_seq=Fasta(fasta_file)

import tqdm

In [2]:
# Subset of MAF columns passed to pd.read_csv(usecols=...) when loading mutations.
usecols = [
    # genomic location
    "Chromosome",
    "Start_position",
    "End_position",
    "Strand",
    # variant description
    "Variant_Classification",
    "Variant_Type",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    # caller / annotation fields
    "i_NumCallers",
    "i_repeat_masker",
    # sample provenance
    "Project_Code",
    "Donor_ID",
]

In [3]:
# Consensus SNV/MNV/indel MAF files, keyed by the mutation-set label that is
# later stored in the "mutation_set" column.
mutation_files = dict(
    ICGC="/illumina/scratch/deep_learning/public_data/ICGC/release_28/Consensus-SNV/final_consensus_passonly.snv_mnv_indel.icgc.public.maf.gz",
    TCGA="/illumina/scratch/deep_learning/public_data/ICGC/release_28/Consensus-SNV/final_consensus_passonly.snv_mnv_indel.tcga.controlled.maf.gz",
)


In [4]:
# Donor-level lookup tables built from the specimen histology CSV.
# If a donor id occurs on several rows, the last row wins in both dicts.
histology_df = pd.read_csv("/illumina/scratch/deep_learning/public_data/ICGC/release_28/summary/pcawg_specimen_histology_August2016_v9.csv")
donor_ids = histology_df["icgc_donor_id"].tolist()
donorid_to_project_dict = {
    donor_id: project
    for donor_id, project in zip(donor_ids, histology_df["project_code"].tolist())
}
donorid_to_histology_dict = {
    donor_id: histology
    for donor_id, histology in zip(donor_ids, histology_df["histology_abbreviation"].tolist())
}

In [5]:
# Stream every MAF file in chunks, keep only SNPs classified as IGR or Intron,
# and annotate each kept row with donor-level metadata and its source label.
all_rows = []
for mutation_set, maf_path in mutation_files.items():
    # chunksize is documented as an integer row count, so pass an int rather than the float 1e6
    for chunk in pd.read_csv(maf_path, sep="\t", chunksize=1_000_000, usecols=usecols, low_memory=False):
        keep = (chunk["Variant_Type"] == "SNP") & chunk["Variant_Classification"].isin(["IGR", "Intron"])
        # .copy() detaches the filtered rows from the raw chunk, so the column
        # assignments below act on an independent frame (no SettingWithCopyWarning)
        chunk = chunk[keep].copy()

        ### prefix the chromosome name with "chr" (e.g. 1 -> "chr1", X -> "chrX")
        chunk["Chromosome"] = chunk["Chromosome"].apply(lambda num: f"chr{num}")
        ### look up project code / histology per donor; donors missing from the
        ### histology table get the string "None"
        chunk["project_code"] = chunk["Donor_ID"].apply(lambda donor_id: donorid_to_project_dict.get(donor_id, "None"))
        chunk["histology_abbreviation"] = chunk["Donor_ID"].apply(lambda donor_id: donorid_to_histology_dict.get(donor_id, "None"))
        chunk["mutation_set"] = mutation_set
        all_rows.append(chunk)

        # progress: number of chunks collected so far
        print(len(all_rows))

# single concat at the end (rather than growing a frame inside the loop)
mutations_df = pd.concat(all_rows, axis=0, ignore_index=True)
mutations_df

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54


Unnamed: 0,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,i_NumCallers,i_repeat_masker,Project_Code,Donor_ID,project_code,histology_abbreviation,mutation_set
0,chr1,1230448,1230448,+,Intron,SNP,G,A,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC
1,chr1,1609723,1609723,+,Intron,SNP,C,T,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC
2,chr1,1903276,1903276,+,IGR,SNP,C,T,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC
3,chr1,2574999,2574999,+,Intron,SNP,C,T,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC
4,chr1,3151250,3151250,+,Intron,SNP,G,A,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41470627,chrX,154769962,154769962,+,Intron,SNP,G,A,4,L1M4c,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA
41470628,chrX,154771230,154771230,+,Intron,SNP,G,C,4,L1PA7,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA
41470629,chrX,154771597,154771597,+,Intron,SNP,G,T,3,L1PA7,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA
41470630,chrX,154772343,154772343,+,Intron,SNP,G,T,4,L1MCb,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA


In [6]:
# Number of distinct donors in the combined table, returned as a 1-tuple (n,).
mutations_df["Donor_ID"].unique().shape

(2658,)

### Visualize some stats and add cancer code

In [None]:
# Number of mutation rows per (project_code, histology_abbreviation) pair.
mutations_df.groupby(["project_code","histology_abbreviation"]).size().reset_index(name="counts")

In [8]:
# cancer_code = text of project_code before the first "-", upper-cased
# (e.g. "OV-AU" -> "OV"; the placeholder "None" becomes "NONE").
mutations_df["cancer_code"] = mutations_df["project_code"].map(
    lambda project: project.partition("-")[0].upper()
)
mutations_df

Unnamed: 0,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,i_NumCallers,i_repeat_masker,Project_Code,Donor_ID,project_code,histology_abbreviation,mutation_set,cancer_code
0,chr1,1230448,1230448,+,Intron,SNP,G,A,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC,OV
1,chr1,1609723,1609723,+,Intron,SNP,C,T,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC,OV
2,chr1,1903276,1903276,+,IGR,SNP,C,T,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC,OV
3,chr1,2574999,2574999,+,Intron,SNP,C,T,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC,OV
4,chr1,3151250,3151250,+,Intron,SNP,G,A,4,,Ovary-AdenoCA,DO46416,OV-AU,Ovary-AdenoCA,ICGC,OV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41470627,chrX,154769962,154769962,+,Intron,SNP,G,A,4,L1M4c,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA,BLCA
41470628,chrX,154771230,154771230,+,Intron,SNP,G,C,4,L1PA7,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA,BLCA
41470629,chrX,154771597,154771597,+,Intron,SNP,G,T,3,L1PA7,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA,BLCA
41470630,chrX,154772343,154772343,+,Intron,SNP,G,T,4,L1MCb,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA,BLCA


### Donor Selection

1. Keep only ["BLCA","BRCA","GBM","COAD","KIRC","KIRP","LUAD","SKCM","RECA","MELA"] cancer codes
2. Within BRCA, remove histologies ["Breast-DCIS","Breast-LobularCA"]
3. For COAD, remove MSI samples. Please pay attention to the outputs of the cells.

In [9]:
# Keep only the cancer codes of interest and drop the excluded breast histologies.
valid_cancer_codes = ["BLCA","BRCA","GBM","COAD","KIRC","KIRP","LUAD","SKCM","RECA","MELA"]
invalid_histologies = ["Breast-DCIS","Breast-LobularCA"]

has_valid_code = mutations_df["cancer_code"].isin(valid_cancer_codes)
has_excluded_histology = mutations_df["histology_abbreviation"].isin(invalid_histologies)
mutations_df = mutations_df[has_valid_code & ~has_excluded_histology]
print(mutations_df.shape)

(20348478, 16)


In [10]:
# Drop every donor listed in the MSI sample sheet.
coad_remove_df = pd.read_csv("/illumina/scratch/deep_learning/akumar22/TCGA/mutations_scoring/master_files/msi_pcawg_samples.csv",sep="\t")
ids_to_remove = coad_remove_df["icgc_donor_id"].tolist()
is_msi_donor = mutations_df["Donor_ID"].isin(ids_to_remove)

### make sure its only coad: the printed array should contain just 'COAD'
print(mutations_df.loc[is_msi_donor, "cancer_code"].unique())

mutations_df = mutations_df[~is_msi_donor]
print("After removing MSI samples (COAD): ",mutations_df.shape)

# renumber rows 0..n-1 after the filters above
mutations_df = mutations_df.reset_index(drop=True)
mutations_df

['COAD']
After removing MSI samples (COAD):  (19682242, 16)


Unnamed: 0,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,i_NumCallers,i_repeat_masker,Project_Code,Donor_ID,project_code,histology_abbreviation,mutation_set,cancer_code
0,chr1,1489107,1489107,+,Intron,SNP,C,T,4,AluY,Kidney-RCC,DO46853,RECA-EU,Kidney-RCC,ICGC,RECA
1,chr1,2087940,2087940,+,Intron,SNP,G,T,4,,Kidney-RCC,DO46853,RECA-EU,Kidney-RCC,ICGC,RECA
2,chr1,4329055,4329055,+,IGR,SNP,C,T,4,,Kidney-RCC,DO46853,RECA-EU,Kidney-RCC,ICGC,RECA
3,chr1,5165802,5165802,+,IGR,SNP,C,A,4,MLT1I,Kidney-RCC,DO46853,RECA-EU,Kidney-RCC,ICGC,RECA
4,chr1,5822758,5822758,+,IGR,SNP,T,G,3,AluJr4,Kidney-RCC,DO46853,RECA-EU,Kidney-RCC,ICGC,RECA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19682237,chrX,154769962,154769962,+,Intron,SNP,G,A,4,L1M4c,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA,BLCA
19682238,chrX,154771230,154771230,+,Intron,SNP,G,C,4,L1PA7,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA,BLCA
19682239,chrX,154771597,154771597,+,Intron,SNP,G,T,3,L1PA7,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA,BLCA
19682240,chrX,154772343,154772343,+,Intron,SNP,G,T,4,L1MCb,Bladder-TCC,DO720,BLCA-US,Bladder-TCC,TCGA,BLCA


In [11]:
# Mutation rows per cancer code in the current (filtered) table, most frequent first.
mutations_df["cancer_code"].value_counts()

MELA    6682332
COAD    5247285
SKCM    3736996
BRCA    1208987
LUAD    1154611
RECA     457284
GBM      448066
BLCA     430742
KIRC     168147
KIRP     147792
Name: cancer_code, dtype: int64

In [12]:
# Number of distinct donors left in the current (filtered) table, as a 1-tuple (n,).
mutations_df["Donor_ID"].unique().shape

(588,)

### Convert ref, alt to upper case

In [None]:
# Normalise both allele columns to upper case.
for allele_col in ("Reference_Allele", "Tumor_Seq_Allele2"):
    mutations_df[allele_col] = mutations_df[allele_col].map(str.upper)
mutations_df

### hg19 to hg38 function

In [None]:
from seq2atac.analysis.mutation_processing_pipeline_utils import hg19_to_hg38

In [None]:
# Run the project's hg19 -> hg38 helper on the filtered table.
# NOTE(review): later cells read "hg38_start"/"hg38_end" from the result, so the helper
# presumably adds those columns - confirm against its implementation, including how
# positions that fail conversion are handled.
mutations_df_hg38 = hg19_to_hg38(mutations_df)

In [None]:
# Display the table returned by hg19_to_hg38 for a visual check.
mutations_df_hg38

In [None]:
# List the column names of the converted table.
mutations_df_hg38.columns

In [None]:
# Columns that should lead the table, followed by every remaining column
# in its existing order.
bed_cols = ["Chromosome","hg38_start","hg38_end","Reference_Allele","Tumor_Seq_Allele2"]
other_cols = []
for col in mutations_df_hg38.columns:
    if col in bed_cols:
        continue
    other_cols.append(col)
bed_cols, other_cols

In [None]:
# Reorder columns: the five leading columns first, everything else after.
column_order = bed_cols + other_cols
mutations_df_hg38 = mutations_df_hg38[column_order]
mutations_df_hg38

### Create cancer specific files along with de-duplication annotation

In [None]:
# For each TCGA cancer model, the cancer codes whose mutations are pooled into its file.
cancer_to_codes = {
    "BLCA": ["BLCA"],
    "BRCA": ["BRCA"],
    "GBM": ["GBM"],
    "COAD": ["COAD"],  ## original note: this could be coad, read.
    "KIRC": ["KIRC", "RECA"],
    "KIRP": ["KIRP"],
    "LUAD": ["LUAD"],
    "SKCM": ["SKCM", "MELA"],
}
# Model names in the same order as the mapping above.
all_tcga_cancers = list(cancer_to_codes)

In [None]:
from seq2atac.stable import write_pickle

In [None]:
!mkdir cancer_specific_somatic_hg38/
for cancer_name in all_tcga_cancers:
    print(cancer_name)

    # which codes can be scored by a particular TCGA model
    cancer_types = cancer_to_codes[cancer_name]
    print(cancer_types)
    cancer_mutations_df = mutations_df_hg38[mutations_df_hg38["cancer_code"].isin(cancer_types)]
    print("total: ",cancer_mutations_df.shape)
    
    # deduplicate and mark those that are duplicates
    cancer_mutations_df["duplicated_mutation"] = cancer_mutations_df.duplicated(["Chromosome","hg38_start","Tumor_Seq_Allele2"], keep=False).astype(int)
    cancer_mutations_df = cancer_mutations_df.drop_duplicates(["Chromosome","hg38_start","Tumor_Seq_Allele2"])
    print("after deduplication: ",cancer_mutations_df.shape)
    
    # sort
    cancer_mutations_df = cancer_mutations_df.sort_values(["Chromosome","hg38_start"]).reset_index(drop=True)
    
    
    # provide a mutation ID
    cancer_mutations_df["mutation_id"] = cancer_mutations_df.index
    
        
    ## where to save
    outfile = f"cancer_specific_somatic_hg38/{cancer_name}_somatic_hg38.pkl"
    
    print(cancer_mutations_df.head())
    
    # write into a file
    write_pickle(cancer_mutations_df,outfile)

In [None]:
# Completion marker printed when this cell runs.
print("Done")

In [None]:
# ### check 1
# (mutations_df["Reference_Allele"] == mutations_df["Tumor_Seq_Allele1"]).all()

In [None]:
# ### check 2
# (mutations_df["i_NumCallers"].astype(int) == mutations_df["i_Callers"].apply(lambda lst:len(lst.split(","))).astype(int)).all()

In [None]:
# ### check 3 - Start_position - 1 => Reference_Allele
# f19 = Fasta("/illumina/scratch/deep_learning/akumar22/hg19_genome.fa",sequence_always_upper=True)
# mismatched_indices = []
# for idx,row in mutations_df.iterrows():
#     chm = str(row["Chromosome"])
#     start = int(row["Start_position"])
#     ref = str(row["Reference_Allele"])
#     if f19[chm][start-1] != ref:
#         mismatched_indices.append(idx)
#     if idx % 100000 == 1:
#         print(idx)

In [None]:
# mismatched_indices

In [None]:
# ### check 4 - strand
# mutations_df["Strand"].value_counts()