## Extracting gene lengths through PyEnsembl

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ydata_profiling as yp
from ydata_profiling import ProfileReport

In [2]:
! pyensembl install --release 75 --species homo_sapiens

2024-03-25 22:34:37,234 - pyensembl.shell - INFO - Running 'install' for EnsemblRelease(release=75, species='homo_sapiens')
2024-03-25 22:34:37,828 - pyensembl.sequence_data - INFO - Loaded sequence dictionary from C:\Users\Cristina\AppData\Local\pyensembl\GRCh37\ensembl75\pyensembl\GRCh37\ensembl75\Cache\Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
2024-03-25 22:34:37,906 - pyensembl.sequence_data - INFO - Loaded sequence dictionary from C:\Users\Cristina\AppData\Local\pyensembl\GRCh37\ensembl75\pyensembl\GRCh37\ensembl75\Cache\Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
2024-03-25 22:34:38,015 - pyensembl.sequence_data - INFO - Loaded sequence dictionary from C:\Users\Cristina\AppData\Local\pyensembl\GRCh37\ensembl75\pyensembl\GRCh37\ensembl75\Cache\Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


In [3]:
def Vett_transcript(df_mut):
    """Build a placeholder lookup table keyed by the unique transcript IDs.

    Parameters
    ----------
    df_mut : pandas.DataFrame
        Mutation table exposing a ``Transcript_ID`` column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``Transcript_ID`` (in the order returned by
        ``Series.unique``) with a single ``Hugo_Symbol`` column filled with 0.
    """
    vett_transcript_id = df_mut['Transcript_ID'].unique()
    # Create the column already filled with 0 instead of building an all-NaN
    # frame and calling fillna(0): that path relies on the object-dtype
    # downcast in fillna, which pandas has deprecated. The column is kept as
    # object dtype because Ricavo_Transcript later overwrites the zeros with
    # gene-name strings.
    return pd.DataFrame(0, index=vett_transcript_id, columns=['Hugo_Symbol'], dtype=object)

In [4]:
from pyensembl import EnsemblRelease

# release 77 uses human reference genome GRCh38
# release 75 uses human reference genome GRCh37

data = EnsemblRelease(75)
count=0
def Ricavo_Transcript(df_mutations, df_vett_transcript, genome=None):
    """Fill ``Hugo_Symbol`` in the lookup table with each transcript's gene name.

    Parameters
    ----------
    df_mutations : pandas.DataFrame
        Mutation table exposing a ``Transcript_ID`` column.
    df_vett_transcript : pandas.DataFrame
        Table indexed by transcript ID (as built by ``Vett_transcript``).
        It is modified in place.
    genome : object, optional
        Annotation source exposing ``gene_name_of_transcript_id``. When
        omitted, the notebook-level ``data`` object is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df_vett_transcript`` object, with ``Hugo_Symbol`` set for
        every transcript ID found in ``df_mutations``.

    Notes
    -----
    Any error raised by ``genome.gene_name_of_transcript_id`` (for example for
    an ID it does not know) is not caught and propagates to the caller.
    """
    if genome is None:
        genome = data

    # The placeholder column may hold integer zeros; make it object dtype up
    # front so that writing gene-name strings is not an incompatible-dtype
    # assignment.
    if 'Hugo_Symbol' in df_vett_transcript.columns:
        df_vett_transcript['Hugo_Symbol'] = df_vett_transcript['Hugo_Symbol'].astype(object)

    # Query each distinct transcript once: a transcript repeated over many
    # mutation rows always maps to the same gene name, so looping over every
    # row only repeats identical lookups.
    for transcript in df_mutations['Transcript_ID'].unique():
        df_vett_transcript.loc[transcript, 'Hugo_Symbol'] = genome.gene_name_of_transcript_id(transcript)
    return df_vett_transcript

### Kan et al., Nature Communications 2018

In [5]:
os.chdir("C:/Users/Cristina/OneDrive/Documenti/BCG/Tesi/Datasets/Breast Cancer (SMC 2018)")

In [6]:
data_mutation_Kan=pd.read_csv('mutation_table_Kan_et_al_Communications_2018.csv', sep=None, engine='python')
data_mutation_Kan = data_mutation_Kan.iloc[:, 1:]
data_mutation_Kan

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,KALRN,3,123988087,123988087,Missense_Mutation,SNP,G,G,C,ENST00000240874.3:c.948G>C,p.E316D,316.0,ENST00000240874
1,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,ANK2,4,114195720,114195720,Missense_Mutation,SNP,C,C,T,ENST00000357077.4:c.1598C>T,p.P533L,533.0,ENST00000357077
2,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,SORBS2,4,186539742,186539742,Missense_Mutation,SNP,G,G,C,ENST00000355634.5:c.2936C>G,p.S979C,979.0,ENST00000355634
3,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,PCDHA11,5,140250828,140250828,Missense_Mutation,SNP,C,C,G,ENST00000398640.2:c.2140C>G,p.L714V,714.0,ENST00000398640
4,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,WRNIP1,6,2766210,2766210,Missense_Mutation,SNP,C,C,A,ENST00000380773.4:c.354C>A,p.S118R,118.0,ENST00000380773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6450,brca_smc_2018_BR469,brca_smc_2018_BR469,PPIL2,22,22049731,22049731,Missense_Mutation,SNP,G,G,T,ENST00000412327.1:c.1511G>T,p.C504F,504.0,ENST00000412327
6451,brca_smc_2018_BR469,brca_smc_2018_BR469,PORCN,X,48372705,48372705,Missense_Mutation,SNP,C,C,A,ENST00000326194.6:c.797C>A,p.A266D,266.0,ENST00000326194
6452,brca_smc_2018_BR469,brca_smc_2018_BR469,KIF4A,X,69510430,69510430,Splice_Site,SNP,T,T,G,ENST00000374403.3:c.120+2T>G,p.X40_splice,40.0,ENST00000374403
6453,brca_smc_2018_BR469,brca_smc_2018_BR469,MAML3,4,140811082,140811083,In_Frame_Ins,INS,-,-,TGT,ENST00000509479.2:c.1508_1509insCAA,p.Q503delinsHK,503.0,ENST00000509479


In [7]:
data_mutation_Kan['Hugo_Symbol'].nunique()

4710

In [8]:
data_mutation_Kan['Transcript_ID'].nunique()

4719

In [9]:
df_vett_transcript_Kan = Vett_transcript(data_mutation_Kan)
df_vett_transcript_Kan 

Unnamed: 0,Hugo_Symbol
ENST00000240874,0
ENST00000357077,0
ENST00000355634,0
ENST00000398640,0
ENST00000380773,0
...,...
ENST00000278317,0
ENST00000551964,0
ENST00000252997,0
ENST00000326194,0


There are 4719 distinct transcript IDs, matching the `nunique()` result above.

In [10]:
df_vett_transcript_Kan = Ricavo_Transcript(data_mutation_Kan, df_vett_transcript_Kan)
df_vett_transcript_Kan 

Unnamed: 0,Hugo_Symbol
ENST00000240874,KALRN
ENST00000357077,ANK2
ENST00000355634,SORBS2
ENST00000398640,PCDHA11
ENST00000380773,WRNIP1
...,...
ENST00000278317,TNNT3
ENST00000551964,APAF1
ENST00000252997,GATA5
ENST00000326194,PORCN


In [11]:
df_vett_transcript_Kan['Hugo_Symbol'].nunique()

4710

In [12]:
def Vett_geni(df_vett_transcript):
    """Create an empty gene-length table keyed by the unique Hugo symbols.

    Parameters
    ----------
    df_vett_transcript : pandas.DataFrame
        Table exposing a ``Hugo_Symbol`` column (one gene name per transcript).

    Returns
    -------
    tuple
        ``(df_vett_geni, index_geni)`` where ``index_geni`` is the array of
        distinct Hugo symbols and ``df_vett_geni`` is a DataFrame indexed by
        those symbols with a single ``Gene_length`` column filled with 0.
    """
    index_geni = df_vett_transcript['Hugo_Symbol'].unique()
    # Build the zero-filled column directly; creating an all-NaN object column
    # and calling fillna(0) depends on a downcast that pandas has deprecated.
    df_vett_geni = pd.DataFrame(0, index=index_geni, columns=['Gene_length'])
    return (df_vett_geni, index_geni)

In [13]:
df_vett_geni_Kan, index_geni_Kan = Vett_geni(df_vett_transcript_Kan)
df_vett_geni_Kan

Unnamed: 0,Gene_length
KALRN,0
ANK2,0
SORBS2,0
PCDHA11,0
WRNIP1,0
...,...
TNNT3,0
APAF1,0
GATA5,0
PORCN,0


In [14]:
# converting the index column into an array
df_vett_geni_Kan.index.values

array(['KALRN', 'ANK2', 'SORBS2', ..., 'GATA5', 'PORCN', 'KIF4A'],
      dtype=object)

In [15]:
# compute the gene lengths
from pyensembl import EnsemblRelease
import pandas as pd

# release 77 uses human reference genome GRCh38
# release 75 uses human reference genome GRCh37

data = EnsemblRelease(75)

def Lunghezza_geni(index_geni, df_vett_geni, genome=None):
    """Fill ``Gene_length`` for every listed gene whose length is still 0.

    Parameters
    ----------
    index_geni : iterable
        Gene names to process; each must be a label of ``df_vett_geni``'s index.
    df_vett_geni : pandas.DataFrame
        Table indexed by gene name with a ``Gene_length`` column (as built by
        ``Vett_geni``). It is modified in place.
    genome : object, optional
        Annotation source exposing ``genes_by_name``. When omitted, the
        notebook-level ``data`` object is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df_vett_geni`` object with the lengths filled in.

    Notes
    -----
    The length is ``end - start + 1`` of the first record returned by
    ``genes_by_name``; if a name matches several records the others are
    ignored. Errors raised by the lookup are not caught.
    """
    if genome is None:
        genome = data

    for gene_nome in index_geni:
        # A non-zero entry is treated as already computed and left untouched.
        if df_vett_geni.loc[gene_nome, 'Gene_length'] != 0:
            continue
        gene = genome.genes_by_name(gene_nome)[0]
        # Coordinates are inclusive on both ends, hence the +1.
        df_vett_geni.loc[gene_nome, 'Gene_length'] = gene.end - gene.start + 1
    return df_vett_geni

In [16]:
df_vett_geni_Kan = Lunghezza_geni(index_geni_Kan, df_vett_geni_Kan)
df_vett_geni_Kan

Unnamed: 0,Gene_length
KALRN,646303
ANK2,565632
SORBS2,371209
PCDHA11,143241
WRNIP1,21539
...,...
TNNT3,19145
APAF1,90286
GATA5,12474
PORCN,11853


In [17]:
df_vett_geni_Kan.reset_index(level=0, inplace=True)
df_vett_geni_Kan.rename(columns={"index": "Hugo_symbol"}, inplace=True)
df_vett_geni_Kan

Unnamed: 0,Hugo_symbol,Gene_length
0,KALRN,646303
1,ANK2,565632
2,SORBS2,371209
3,PCDHA11,143241
4,WRNIP1,21539
...,...,...
4705,TNNT3,19145
4706,APAF1,90286
4707,GATA5,12474
4708,PORCN,11853


In [18]:
df_vett_transcript_Kan.reset_index(level=0,inplace=True)
df_vett_transcript_Kan.rename(columns={"index": "Transcript_ID"},inplace=True)
df_vett_transcript_Kan

Unnamed: 0,Transcript_ID,Hugo_Symbol
0,ENST00000240874,KALRN
1,ENST00000357077,ANK2
2,ENST00000355634,SORBS2
3,ENST00000398640,PCDHA11
4,ENST00000380773,WRNIP1
...,...,...
4714,ENST00000278317,TNNT3
4715,ENST00000551964,APAF1
4716,ENST00000252997,GATA5
4717,ENST00000326194,PORCN


In [19]:
df_transcript_length_Kan=df_vett_transcript_Kan.join(df_vett_geni_Kan.set_index('Hugo_symbol'), on='Hugo_Symbol')
df_transcript_length_Kan=df_transcript_length_Kan.drop('Hugo_Symbol', axis=1)
df_transcript_length_Kan

Unnamed: 0,Transcript_ID,Gene_length
0,ENST00000240874,646303
1,ENST00000357077,565632
2,ENST00000355634,371209
3,ENST00000398640,143241
4,ENST00000380773,21539
...,...,...
4714,ENST00000278317,19145
4715,ENST00000551964,90286
4716,ENST00000252997,12474
4717,ENST00000326194,11853


In [20]:
# Rebuild the mutation table with the PyEnsembl-derived annotations:
#   1. discard the original Hugo_Symbol column,
#   2. attach Gene_length through the Transcript_ID key,
#   3. attach the Hugo_Symbol looked up from the transcript through the same key,
#   4. keep the output columns in a fixed order.
data_mutation_Kan_FINAL = (
    data_mutation_Kan
    .drop(columns='Hugo_Symbol')
    .join(df_transcript_length_Kan.set_index('Transcript_ID'), on='Transcript_ID')
    .join(df_vett_transcript_Kan.set_index('Transcript_ID'), on='Transcript_ID')
    .loc[:, [
        'Tumor_Sample_Barcode', 'PATIENT_ID', 'Hugo_Symbol', 'Chromosome',
        'Start_Position', 'End_Position', 'Variant_Classification', 'Variant_Type',
        'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'HGVSc',
        'HGVSp_Short', 'Protein_position', 'Transcript_ID', 'Gene_length',
    ]]
)
data_mutation_Kan_FINAL

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID,Gene_length
0,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,KALRN,3,123988087,123988087,Missense_Mutation,SNP,G,G,C,ENST00000240874.3:c.948G>C,p.E316D,316.0,ENST00000240874,646303
1,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,ANK2,4,114195720,114195720,Missense_Mutation,SNP,C,C,T,ENST00000357077.4:c.1598C>T,p.P533L,533.0,ENST00000357077,565632
2,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,SORBS2,4,186539742,186539742,Missense_Mutation,SNP,G,G,C,ENST00000355634.5:c.2936C>G,p.S979C,979.0,ENST00000355634,371209
3,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,PCDHA11,5,140250828,140250828,Missense_Mutation,SNP,C,C,G,ENST00000398640.2:c.2140C>G,p.L714V,714.0,ENST00000398640,143241
4,brca_smc_2018_BB01_002,brca_smc_2018_BB01_002,WRNIP1,6,2766210,2766210,Missense_Mutation,SNP,C,C,A,ENST00000380773.4:c.354C>A,p.S118R,118.0,ENST00000380773,21539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6450,brca_smc_2018_BR469,brca_smc_2018_BR469,PPIL2,22,22049731,22049731,Missense_Mutation,SNP,G,G,T,ENST00000412327.1:c.1511G>T,p.C504F,504.0,ENST00000412327,47746
6451,brca_smc_2018_BR469,brca_smc_2018_BR469,PORCN,X,48372705,48372705,Missense_Mutation,SNP,C,C,A,ENST00000326194.6:c.797C>A,p.A266D,266.0,ENST00000326194,11853
6452,brca_smc_2018_BR469,brca_smc_2018_BR469,KIF4A,X,69510430,69510430,Splice_Site,SNP,T,T,G,ENST00000374403.3:c.120+2T>G,p.X40_splice,40.0,ENST00000374403,130804
6453,brca_smc_2018_BR469,brca_smc_2018_BR469,MAML3,4,140811082,140811083,In_Frame_Ins,INS,-,-,TGT,ENST00000509479.2:c.1508_1509insCAA,p.Q503delinsHK,503.0,ENST00000509479,437432


In [21]:
# save the final dataset containing the gene lengths 
data_mutation_Kan_FINAL.to_csv('mutation_table_Kan_et_al_Communications_2018_FINAL.csv')

### Banerji et al., Nature 2012

In [22]:
os.chdir("C:/Users/Cristina/OneDrive/Documenti/BCG/Tesi/Datasets/Breast Invasive Carcinoma (Broad, Nature 2012)")

In [23]:
data_mutation_Banerji = pd.read_csv('mutation_table_Banerji_et_al_Nature_2012.csv', sep=None, engine='python')
data_mutation_Banerji = data_mutation_Banerji.iloc[: , 1:]
data_mutation_Banerji

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID,ENSP
0,BR-M-005,BR-M-005,ACTN3,11,66318756,66318756,3'UTR,SNP,A,-,C,ENST00000511191.1:c.*161A>C,*54*,,ENST00000511191,ENSP00000426797
1,BR-M-005,BR-M-005,ADAMTS7,15,79059041,79059041,Missense_Mutation,SNP,T,-,C,ENST00000388820.4:c.3212A>G,p.N1071S,1071.0,ENST00000388820,ENSP00000373472
2,BR-M-005,BR-M-005,CACNA1E,1,181721292,181721292,Missense_Mutation,SNP,C,-,T,ENST00000367573.2:c.3745C>T,p.R1249W,1249.0,ENST00000367573,ENSP00000356545
3,BR-M-005,BR-M-005,CEACAM8,19,43093650,43093650,Missense_Mutation,SNP,G,-,A,ENST00000244336.5:c.662C>T,p.A221V,221.0,ENST00000244336,ENSP00000244336
4,BR-M-005,BR-M-005,CFH,1,196646760,196646760,Silent,SNP,C,-,T,ENST00000367429.4:c.582C>T,p.D194=,194.0,ENST00000367429,ENSP00000356399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4980,BR-V-071,BR-V-071,ZMYM1,1,35578996,35578996,Nonsense_Mutation,SNP,C,-,A,ENST00000373330.1:c.1565C>A,p.S522*,522.0,ENST00000373330,ENSP00000352920
4981,BR-V-071,BR-V-071,ZNF264,19,57716815,57716815,Missense_Mutation,SNP,G,-,C,ENST00000263095.6:c.211G>C,p.E71Q,71.0,ENST00000263095,ENSP00000263095
4982,BR-V-071,BR-V-071,ZNF331,19,54080435,54080435,Silent,SNP,G,-,A,ENST00000253144.9:c.621G>A,p.R207=,207.0,ENST00000253144,ENSP00000253144
4983,BR-V-071,BR-V-071,ZNF404,19,44377263,44377263,Missense_Mutation,SNP,G,-,C,ENST00000587539.1:c.1103C>G,p.S368C,368.0,ENST00000587539,ENSP00000466051


In [24]:
data_mutation_Banerji['Hugo_Symbol'].nunique()

3809

In [25]:
data_mutation_Banerji['Transcript_ID'].nunique()

3816

In [26]:
df_vett_transcript_Banerji = Vett_transcript(data_mutation_Banerji)
df_vett_transcript_Banerji 

Unnamed: 0,Hugo_Symbol
ENST00000511191,0
ENST00000388820,0
ENST00000367573,0
ENST00000244336,0
ENST00000367429,0
...,...
ENST00000422135,0
ENST00000310421,0
ENST00000373330,0
ENST00000253144,0


In [27]:
df_vett_transcript_Banerji = Ricavo_Transcript(data_mutation_Banerji, df_vett_transcript_Banerji)
df_vett_transcript_Banerji 

Unnamed: 0,Hugo_Symbol
ENST00000511191,ACTN3
ENST00000388820,ADAMTS7
ENST00000367573,CACNA1E
ENST00000244336,CEACAM8
ENST00000367429,CFH
...,...
ENST00000422135,SYBU
ENST00000310421,VCPIP1
ENST00000373330,ZMYM1
ENST00000253144,ZNF331


In [28]:
df_vett_geni_Banerji, index_geni_Banerji = Vett_geni(df_vett_transcript_Banerji)
df_vett_geni_Banerji

Unnamed: 0,Gene_length
ACTN3,0
ADAMTS7,0
CACNA1E,0
CEACAM8,0
CFH,0
...,...
SYBU,0
VCPIP1,0
ZMYM1,0
ZNF331,0


In [29]:
# converting the index column into an array
df_vett_geni_Banerji.index.values

array(['ACTN3', 'ADAMTS7', 'CACNA1E', ..., 'ZMYM1', 'ZNF331', 'ZNF423'],
      dtype=object)

In [30]:
df_vett_geni_Banerji = Lunghezza_geni(index_geni_Banerji, df_vett_geni_Banerji)
df_vett_geni_Banerji

Unnamed: 0,Gene_length
ACTN3,16935
ADAMTS7,52229
CACNA1E,394982
CEACAM8,14815
CFH,95627
...,...
SYBU,117814
VCPIP1,38731
ZMYM1,56074
ZNF331,59289


In [31]:
df_vett_geni_Banerji.reset_index(level=0, inplace=True)
df_vett_geni_Banerji.rename(columns={"index": "Hugo_symbol"}, inplace=True)
df_vett_geni_Banerji

Unnamed: 0,Hugo_symbol,Gene_length
0,ACTN3,16935
1,ADAMTS7,52229
2,CACNA1E,394982
3,CEACAM8,14815
4,CFH,95627
...,...,...
3806,SYBU,117814
3807,VCPIP1,38731
3808,ZMYM1,56074
3809,ZNF331,59289


In [32]:
df_vett_transcript_Banerji.reset_index(level=0,inplace=True)
df_vett_transcript_Banerji.rename(columns={"index": "Transcript_ID"},inplace=True)
df_vett_transcript_Banerji

Unnamed: 0,Transcript_ID,Hugo_Symbol
0,ENST00000511191,ACTN3
1,ENST00000388820,ADAMTS7
2,ENST00000367573,CACNA1E
3,ENST00000244336,CEACAM8
4,ENST00000367429,CFH
...,...,...
3811,ENST00000422135,SYBU
3812,ENST00000310421,VCPIP1
3813,ENST00000373330,ZMYM1
3814,ENST00000253144,ZNF331


In [33]:
df_transcript_length_Banerji=df_vett_transcript_Banerji.join(df_vett_geni_Banerji.set_index('Hugo_symbol'), on='Hugo_Symbol')
df_transcript_length_Banerji=df_transcript_length_Banerji.drop('Hugo_Symbol', axis=1)
df_transcript_length_Banerji

Unnamed: 0,Transcript_ID,Gene_length
0,ENST00000511191,16935
1,ENST00000388820,52229
2,ENST00000367573,394982
3,ENST00000244336,14815
4,ENST00000367429,95627
...,...,...
3811,ENST00000422135,117814
3812,ENST00000310421,38731
3813,ENST00000373330,56074
3814,ENST00000253144,59289


In [34]:
# Rebuild the mutation table with the PyEnsembl-derived annotations:
#   1. discard the original Hugo_Symbol column,
#   2. attach Gene_length through the Transcript_ID key,
#   3. attach the Hugo_Symbol looked up from the transcript through the same key,
#   4. keep only the listed output columns, in a fixed order.
data_mutation_Banerji_FINAL = (
    data_mutation_Banerji
    .drop(columns='Hugo_Symbol')
    .join(df_transcript_length_Banerji.set_index('Transcript_ID'), on='Transcript_ID')
    .join(df_vett_transcript_Banerji.set_index('Transcript_ID'), on='Transcript_ID')
    .loc[:, [
        'Tumor_Sample_Barcode', 'PATIENT_ID', 'Hugo_Symbol', 'Chromosome',
        'Start_Position', 'End_Position', 'Variant_Classification', 'Variant_Type',
        'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'HGVSc',
        'HGVSp_Short', 'Protein_position', 'Transcript_ID', 'Gene_length',
    ]]
)
data_mutation_Banerji_FINAL

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID,Gene_length
0,BR-M-005,BR-M-005,ACTN3,11,66318756,66318756,3'UTR,SNP,A,-,C,ENST00000511191.1:c.*161A>C,*54*,,ENST00000511191,16935
1,BR-M-005,BR-M-005,ADAMTS7,15,79059041,79059041,Missense_Mutation,SNP,T,-,C,ENST00000388820.4:c.3212A>G,p.N1071S,1071.0,ENST00000388820,52229
2,BR-M-005,BR-M-005,CACNA1E,1,181721292,181721292,Missense_Mutation,SNP,C,-,T,ENST00000367573.2:c.3745C>T,p.R1249W,1249.0,ENST00000367573,394982
3,BR-M-005,BR-M-005,CEACAM8,19,43093650,43093650,Missense_Mutation,SNP,G,-,A,ENST00000244336.5:c.662C>T,p.A221V,221.0,ENST00000244336,14815
4,BR-M-005,BR-M-005,CFH,1,196646760,196646760,Silent,SNP,C,-,T,ENST00000367429.4:c.582C>T,p.D194=,194.0,ENST00000367429,95627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4980,BR-V-071,BR-V-071,ZMYM1,1,35578996,35578996,Nonsense_Mutation,SNP,C,-,A,ENST00000373330.1:c.1565C>A,p.S522*,522.0,ENST00000373330,56074
4981,BR-V-071,BR-V-071,ZNF264,19,57716815,57716815,Missense_Mutation,SNP,G,-,C,ENST00000263095.6:c.211G>C,p.E71Q,71.0,ENST00000263095,21857
4982,BR-V-071,BR-V-071,ZNF331,19,54080435,54080435,Silent,SNP,G,-,A,ENST00000253144.9:c.621G>A,p.R207=,207.0,ENST00000253144,59289
4983,BR-V-071,BR-V-071,ZNF404,19,44377263,44377263,Missense_Mutation,SNP,G,-,C,ENST00000587539.1:c.1103C>G,p.S368C,368.0,ENST00000587539,29023


In [35]:
# save the final dataset containing the gene lengths 
data_mutation_Banerji_FINAL.to_csv('mutation_table_Banerji_et_al_Nature_2012_FINAL.csv')

### Stephens et al., Nature 2012

In [36]:
os.chdir("C:/Users/Cristina/OneDrive/Documenti/BCG/Tesi/Datasets/Breast Invasive Carcinoma (Sanger, Nature 2012)")

In [37]:
data_mutation_Stephens = pd.read_csv('mutation_table_Stephens_et_al_Nature_2012.csv', sep=None, engine='python')
data_mutation_Stephens = data_mutation_Stephens.iloc[: , 1:]
data_mutation_Stephens

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,PD4112a,PD4112a,UBQLN1,9,86297863,86297864,Splice_Region,INS,-,-,T,ENST00000376395.4:c.448+4dup,p.X150_splice,150.0,ENST00000376395
1,PD4093a,PD4093a,AKAP11,13,42874071,42874086,Frame_Shift_Del,DEL,AATCCTCAAAAATTCA,AATCCTCAAAAATTCA,-,ENST00000025301.2:c.1190_1205del,p.N397Sfs*36,397.0,ENST00000025301
2,PD4938a,PD4938a,ELF3,1,201981525,201981541,Frame_Shift_Del,DEL,GGCATGGCCTTCCAGGA,GGCATGGCCTTCCAGGA,-,ENST00000359651.3:c.442_458del,p.M148Pfs*7,147.0,ENST00000359651
3,PD4843a,PD4843a,MLL3,7,151845512,151845530,Frame_Shift_Del,DEL,AGTTTTGTCCTTAAAAAAC,AGTTTTGTCCTTAAAAAAC,-,ENST00000262189.6:c.13482_13500del,p.M4494Ifs*17,4494.0,ENST00000262189
4,PD4121a,PD4121a,CDH1,16,68847293,68847312,Frame_Shift_Del,DEL,TACCCCAGCGTGGGAGGCTG,TACCCCAGCGTGGGAGGCTG,-,ENST00000261769.5:c.1217_1236del,p.T406Ifs*6,405.0,p.T406fs*13(1)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7231,PD4127a,PD4127a,HORMAD2,22,30603124,30603124,5'Flank,SNP,G,G,A,,,,ENST00000432360
7232,PD4128a,PD4128a,EPS8L1,19,55583625,55583625,5'Flank,SNP,C,C,A,,,,ENST00000201647
7233,PD4109a,PD4109a,TNXB,6,32046832,32046832,5'Flank,SNP,C,C,T,,,,ENST00000516703
7234,PD4099a,PD4099a,SCN1B,19,35530118,35530118,5'Flank,SNP,C,C,T,,,182.0,ENST00000262626


In [38]:
# drop the rows that have protein position as missing value 
data_mutation_Stephens_protein_pos_NOnan = data_mutation_Stephens.dropna(subset='Protein_position')
data_mutation_Stephens_protein_pos_NOnan

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,PD4112a,PD4112a,UBQLN1,9,86297863,86297864,Splice_Region,INS,-,-,T,ENST00000376395.4:c.448+4dup,p.X150_splice,150.0,ENST00000376395
1,PD4093a,PD4093a,AKAP11,13,42874071,42874086,Frame_Shift_Del,DEL,AATCCTCAAAAATTCA,AATCCTCAAAAATTCA,-,ENST00000025301.2:c.1190_1205del,p.N397Sfs*36,397.0,ENST00000025301
2,PD4938a,PD4938a,ELF3,1,201981525,201981541,Frame_Shift_Del,DEL,GGCATGGCCTTCCAGGA,GGCATGGCCTTCCAGGA,-,ENST00000359651.3:c.442_458del,p.M148Pfs*7,147.0,ENST00000359651
3,PD4843a,PD4843a,MLL3,7,151845512,151845530,Frame_Shift_Del,DEL,AGTTTTGTCCTTAAAAAAC,AGTTTTGTCCTTAAAAAAC,-,ENST00000262189.6:c.13482_13500del,p.M4494Ifs*17,4494.0,ENST00000262189
4,PD4121a,PD4121a,CDH1,16,68847293,68847312,Frame_Shift_Del,DEL,TACCCCAGCGTGGGAGGCTG,TACCCCAGCGTGGGAGGCTG,-,ENST00000261769.5:c.1217_1236del,p.T406Ifs*6,405.0,p.T406fs*13(1)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7219,PD4938a,PD4938a,POLE,12,133244230,133244230,Nonsense_Mutation,SNP,G,G,C,ENST00000320574.5:c.2178C>G,p.Y726*,726.0,ENST00000320574
7220,PD4938a,PD4938a,POLE,12,133245390,133245390,Splice_Region,SNP,G,G,C,ENST00000320574.5:c.1923+7C>G,p.X641_splice,641.0,ENST00000320574
7227,PD3994a,PD3994a,OR2K2,9,114090733,114090733,5'Flank,SNP,T,T,C,,,23.0,ENST00000302681
7234,PD4099a,PD4099a,SCN1B,19,35530118,35530118,5'Flank,SNP,C,C,T,,,182.0,ENST00000262626


In [39]:
#data_mutation_Stephens_protein_pos_NOnan[data_mutation_Stephens_protein_pos_NOnan['Protein_position'].str.startswith('p')]

In [40]:
# Keep the rows whose Transcript_ID starts with the 'ENST' prefix.
# na=False makes a missing Transcript_ID count as "does not start with ENST";
# without it the mask would contain NaN and boolean indexing would reject it.
data_mutation_Stephens_ENSG = data_mutation_Stephens[
    data_mutation_Stephens['Transcript_ID'].str.startswith('ENST', na=False)
]
data_mutation_Stephens_ENSG

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,PD4112a,PD4112a,UBQLN1,9,86297863,86297864,Splice_Region,INS,-,-,T,ENST00000376395.4:c.448+4dup,p.X150_splice,150.0,ENST00000376395
1,PD4093a,PD4093a,AKAP11,13,42874071,42874086,Frame_Shift_Del,DEL,AATCCTCAAAAATTCA,AATCCTCAAAAATTCA,-,ENST00000025301.2:c.1190_1205del,p.N397Sfs*36,397.0,ENST00000025301
2,PD4938a,PD4938a,ELF3,1,201981525,201981541,Frame_Shift_Del,DEL,GGCATGGCCTTCCAGGA,GGCATGGCCTTCCAGGA,-,ENST00000359651.3:c.442_458del,p.M148Pfs*7,147.0,ENST00000359651
3,PD4843a,PD4843a,MLL3,7,151845512,151845530,Frame_Shift_Del,DEL,AGTTTTGTCCTTAAAAAAC,AGTTTTGTCCTTAAAAAAC,-,ENST00000262189.6:c.13482_13500del,p.M4494Ifs*17,4494.0,ENST00000262189
5,PD4602a,PD4602a,TMEM209,7,129825062,129825083,Frame_Shift_Del,DEL,GGCTTCATCTTTGTTAGCACAT,GGCTTCATCTTTGTTAGCACAT,-,ENST00000397622.2:c.900_921del,p.C301Ifs*17,300.0,ENST00000397622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7231,PD4127a,PD4127a,HORMAD2,22,30603124,30603124,5'Flank,SNP,G,G,A,,,,ENST00000432360
7232,PD4128a,PD4128a,EPS8L1,19,55583625,55583625,5'Flank,SNP,C,C,A,,,,ENST00000201647
7233,PD4109a,PD4109a,TNXB,6,32046832,32046832,5'Flank,SNP,C,C,T,,,,ENST00000516703
7234,PD4099a,PD4099a,SCN1B,19,35530118,35530118,5'Flank,SNP,C,C,T,,,182.0,ENST00000262626


In [41]:
# select only the rows for which Transcript ID does not start with ENST
# A negated boolean mask keeps each such row exactly once. Concatenating the
# full table with its ENST subset and calling drop_duplicates(keep=False)
# would additionally discard non-ENST rows that are exact duplicates of one
# another, silently losing mutations.
data_mutation_Stephens_withoutENSG = data_mutation_Stephens.loc[
    ~data_mutation_Stephens['Transcript_ID'].str.startswith('ENST', na=False)
].copy()
data_mutation_Stephens_withoutENSG

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
4,PD4121a,PD4121a,CDH1,16,68847293,68847312,Frame_Shift_Del,DEL,TACCCCAGCGTGGGAGGCTG,TACCCCAGCGTGGGAGGCTG,-,ENST00000261769.5:c.1217_1236del,p.T406Ifs*6,405.0,p.T406fs*13(1)
51,PD4844a,PD4844a,RB1,13,48941721,48941733,Frame_Shift_Del,DEL,AGACTGATTCTAT,AGACTGATTCTAT,-,ENST00000267163.4:c.1035_1047delTGATTCTATAGAC,p.D346Vfs*17,344.0,344
83,PD4107a,PD4107a,TP53,17,7578263,7578263,Frame_Shift_Del,DEL,G,G,-,ENST00000269305.4:c.586delC,p.R196Efs*51,196.0,p.R196R(5)
84,PD4252a,PD4252a,TP53,17,7578516,7578516,Frame_Shift_Del,DEL,G,G,-,ENST00000269305.4:c.414delC,p.K139Rfs*31,138.0,p.A138fs*32(5)
85,PD4844a,PD4844a,TP53,17,7579359,7579359,Frame_Shift_Del,DEL,G,G,-,ENST00000269305.4:c.328delC,p.R110Vfs*13,110.0,p.R110fs*13(5)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5490,PD4937a,PD4937a,TP53,17,7577058,7577058,Nonsense_Mutation,SNP,C,C,A,ENST00000269305.4:c.880G>T,p.E294*,294.0,p.0?(7)
5491,PD4600a,PD4600a,TP53,17,7578239,7578239,Nonsense_Mutation,SNP,C,C,A,ENST00000269305.4:c.610G>T,p.E204*,204.0,p.E204G(2)
5492,PD4935a,PD4935a,TP53,17,7578275,7578275,Nonsense_Mutation,SNP,G,G,A,ENST00000269305.4:c.574C>T,p.Q192*,192.0,p.A189_V197delAPPQHLIRV(4)
5517,PD4120a,PD4120a,SMAD4,18,48575671,48575671,Nonsense_Mutation,SNP,C,C,G,ENST00000342988.3:c.431C>G,p.S144*,144.0,144


In [42]:
# Recover the transcript ID from HGVSc: take the text before the first '.'
# and expose it as a one-column frame named Transcript_ID (same row index).
new_Transcript_ID = (
    data_mutation_Stephens_withoutENSG['HGVSc']
    .str.split('.')
    .str[0]
    .to_frame(name='Transcript_ID')
)
new_Transcript_ID

Unnamed: 0,Transcript_ID
4,ENST00000261769
51,ENST00000267163
83,ENST00000269305
84,ENST00000269305
85,ENST00000269305
...,...
5490,ENST00000269305
5491,ENST00000269305
5492,ENST00000269305
5517,ENST00000342988


In [43]:
# replace the new Transcript ID column 
#data_mutation_Stephens_withoutENSG['Transcript_ID'] = new_Transcript_ID['Transcript_ID']
data_mutation_Stephens_withoutENSG.loc[:,'Transcript_ID'] = new_Transcript_ID.loc[:,'Transcript_ID']
data_mutation_Stephens_withoutENSG

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
4,PD4121a,PD4121a,CDH1,16,68847293,68847312,Frame_Shift_Del,DEL,TACCCCAGCGTGGGAGGCTG,TACCCCAGCGTGGGAGGCTG,-,ENST00000261769.5:c.1217_1236del,p.T406Ifs*6,405.0,ENST00000261769
51,PD4844a,PD4844a,RB1,13,48941721,48941733,Frame_Shift_Del,DEL,AGACTGATTCTAT,AGACTGATTCTAT,-,ENST00000267163.4:c.1035_1047delTGATTCTATAGAC,p.D346Vfs*17,344.0,ENST00000267163
83,PD4107a,PD4107a,TP53,17,7578263,7578263,Frame_Shift_Del,DEL,G,G,-,ENST00000269305.4:c.586delC,p.R196Efs*51,196.0,ENST00000269305
84,PD4252a,PD4252a,TP53,17,7578516,7578516,Frame_Shift_Del,DEL,G,G,-,ENST00000269305.4:c.414delC,p.K139Rfs*31,138.0,ENST00000269305
85,PD4844a,PD4844a,TP53,17,7579359,7579359,Frame_Shift_Del,DEL,G,G,-,ENST00000269305.4:c.328delC,p.R110Vfs*13,110.0,ENST00000269305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5490,PD4937a,PD4937a,TP53,17,7577058,7577058,Nonsense_Mutation,SNP,C,C,A,ENST00000269305.4:c.880G>T,p.E294*,294.0,ENST00000269305
5491,PD4600a,PD4600a,TP53,17,7578239,7578239,Nonsense_Mutation,SNP,C,C,A,ENST00000269305.4:c.610G>T,p.E204*,204.0,ENST00000269305
5492,PD4935a,PD4935a,TP53,17,7578275,7578275,Nonsense_Mutation,SNP,G,G,A,ENST00000269305.4:c.574C>T,p.Q192*,192.0,ENST00000269305
5517,PD4120a,PD4120a,SMAD4,18,48575671,48575671,Nonsense_Mutation,SNP,C,C,G,ENST00000342988.3:c.431C>G,p.S144*,144.0,ENST00000342988


In [44]:
data_mutation_Stephens_with_all_ENST=pd.concat([data_mutation_Stephens_ENSG,data_mutation_Stephens_withoutENSG], axis=0)
data_mutation_Stephens_with_all_ENST

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,PD4112a,PD4112a,UBQLN1,9,86297863,86297864,Splice_Region,INS,-,-,T,ENST00000376395.4:c.448+4dup,p.X150_splice,150.0,ENST00000376395
1,PD4093a,PD4093a,AKAP11,13,42874071,42874086,Frame_Shift_Del,DEL,AATCCTCAAAAATTCA,AATCCTCAAAAATTCA,-,ENST00000025301.2:c.1190_1205del,p.N397Sfs*36,397.0,ENST00000025301
2,PD4938a,PD4938a,ELF3,1,201981525,201981541,Frame_Shift_Del,DEL,GGCATGGCCTTCCAGGA,GGCATGGCCTTCCAGGA,-,ENST00000359651.3:c.442_458del,p.M148Pfs*7,147.0,ENST00000359651
3,PD4843a,PD4843a,MLL3,7,151845512,151845530,Frame_Shift_Del,DEL,AGTTTTGTCCTTAAAAAAC,AGTTTTGTCCTTAAAAAAC,-,ENST00000262189.6:c.13482_13500del,p.M4494Ifs*17,4494.0,ENST00000262189
5,PD4602a,PD4602a,TMEM209,7,129825062,129825083,Frame_Shift_Del,DEL,GGCTTCATCTTTGTTAGCACAT,GGCTTCATCTTTGTTAGCACAT,-,ENST00000397622.2:c.900_921del,p.C301Ifs*17,300.0,ENST00000397622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5490,PD4937a,PD4937a,TP53,17,7577058,7577058,Nonsense_Mutation,SNP,C,C,A,ENST00000269305.4:c.880G>T,p.E294*,294.0,ENST00000269305
5491,PD4600a,PD4600a,TP53,17,7578239,7578239,Nonsense_Mutation,SNP,C,C,A,ENST00000269305.4:c.610G>T,p.E204*,204.0,ENST00000269305
5492,PD4935a,PD4935a,TP53,17,7578275,7578275,Nonsense_Mutation,SNP,G,G,A,ENST00000269305.4:c.574C>T,p.Q192*,192.0,ENST00000269305
5517,PD4120a,PD4120a,SMAD4,18,48575671,48575671,Nonsense_Mutation,SNP,C,C,G,ENST00000342988.3:c.431C>G,p.S144*,144.0,ENST00000342988


In [45]:
data_mutation_Stephens_with_all_ENST['Hugo_Symbol'].nunique()

5093

In [46]:
data_mutation_Stephens_with_all_ENST['Transcript_ID'].nunique()

5115

In [47]:
df_vett_transcript_Stephens = Vett_transcript(data_mutation_Stephens_with_all_ENST)
df_vett_transcript_Stephens 

Unnamed: 0,Hugo_Symbol
ENST00000376395,0
ENST00000025301,0
ENST00000359651,0
ENST00000262189,0
ENST00000397622,0
...,...
ENST00000326873,0
ENST00000256078,0
ENST00000342988,0
ENST00000435339,0


In [48]:
df_vett_transcript_Stephens = Ricavo_Transcript(data_mutation_Stephens_with_all_ENST, df_vett_transcript_Stephens)
df_vett_transcript_Stephens

Unnamed: 0,Hugo_Symbol
ENST00000376395,UBQLN1
ENST00000025301,AKAP11
ENST00000359651,ELF3
ENST00000262189,KMT2C
ENST00000397622,TMEM209
...,...
ENST00000326873,STK11
ENST00000256078,KRAS
ENST00000342988,SMAD4
ENST00000435339,LPAR4


In [49]:
df_vett_geni_Stephens, index_geni_Stephens = Vett_geni(df_vett_transcript_Stephens)
df_vett_geni_Stephens

Unnamed: 0,Gene_length
UBQLN1,0
AKAP11,0
ELF3,0
KMT2C,0
TMEM209,0
...,...
STK11,0
KRAS,0
SMAD4,0
LPAR4,0


In [50]:
# converting the index column into an array
df_vett_geni_Stephens.index.values

array(['UBQLN1', 'AKAP11', 'ELF3', ..., 'SMAD4', 'LPAR4', 'GATA1'],
      dtype=object)

In [51]:
df_vett_geni_Stephens = Lunghezza_geni(index_geni_Stephens, df_vett_geni_Stephens)
df_vett_geni_Stephens

Unnamed: 0,Gene_length
UBQLN1,48241
AKAP11,51108
ELF3,9244
KMT2C,301081
TMEM209,43056
...,...
STK11,39023
KRAS,46148
SMAD4,117006
LPAR4,9386


In [52]:
df_vett_geni_Stephens.reset_index(level=0, inplace=True)
df_vett_geni_Stephens.rename(columns={"index": "Hugo_symbol"}, inplace=True)
df_vett_geni_Stephens

Unnamed: 0,Hugo_symbol,Gene_length
0,UBQLN1,48241
1,AKAP11,51108
2,ELF3,9244
3,KMT2C,301081
4,TMEM209,43056
...,...,...
5097,STK11,39023
5098,KRAS,46148
5099,SMAD4,117006
5100,LPAR4,9386


In [53]:
df_vett_transcript_Stephens.reset_index(level=0,inplace=True)
df_vett_transcript_Stephens.rename(columns={"index": "Transcript_ID"},inplace=True)
df_vett_transcript_Stephens

Unnamed: 0,Transcript_ID,Hugo_Symbol
0,ENST00000376395,UBQLN1
1,ENST00000025301,AKAP11
2,ENST00000359651,ELF3
3,ENST00000262189,KMT2C
4,ENST00000397622,TMEM209
...,...,...
5110,ENST00000326873,STK11
5111,ENST00000256078,KRAS
5112,ENST00000342988,SMAD4
5113,ENST00000435339,LPAR4


In [54]:
# Attach the gene length to every transcript through its gene symbol, then drop the symbol
# so that only Transcript_ID -> Gene_length remains.
df_transcript_length_Stephens = (
    df_vett_transcript_Stephens
    .join(df_vett_geni_Stephens.set_index('Hugo_symbol'), on='Hugo_Symbol')
    .drop(columns='Hugo_Symbol')
)
df_transcript_length_Stephens

Unnamed: 0,Transcript_ID,Gene_length
0,ENST00000376395,48241
1,ENST00000025301,51108
2,ENST00000359651,9244
3,ENST00000262189,301081
4,ENST00000397622,43056
...,...,...
5110,ENST00000326873,39023
5111,ENST00000256078,46148
5112,ENST00000342988,117006
5113,ENST00000435339,9386


In [55]:
# Build the final Stephens table in one chain:
#   1. discard the Hugo symbols that came with the dataset,
#   2. add the gene length of each mutation's transcript,
#   3. add the Hugo symbols obtained from pyensembl,
#   4. keep the output columns in a fixed order.
data_mutation_Stephens_FINAL = (
    data_mutation_Stephens_with_all_ENST
    .drop(columns='Hugo_Symbol')
    .join(df_transcript_length_Stephens.set_index('Transcript_ID'), on='Transcript_ID')
    .join(df_vett_transcript_Stephens.set_index('Transcript_ID'), on='Transcript_ID')
    .loc[:, [
        'Tumor_Sample_Barcode', 'PATIENT_ID', 'Hugo_Symbol', 'Chromosome',
        'Start_Position', 'End_Position', 'Variant_Classification', 'Variant_Type',
        'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
        'HGVSc', 'HGVSp_Short', 'Protein_position', 'Transcript_ID', 'Gene_length',
    ]]
)
data_mutation_Stephens_FINAL

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID,Gene_length
0,PD4112a,PD4112a,UBQLN1,9,86297863,86297864,Splice_Region,INS,-,-,T,ENST00000376395.4:c.448+4dup,p.X150_splice,150.0,ENST00000376395,48241
1,PD4093a,PD4093a,AKAP11,13,42874071,42874086,Frame_Shift_Del,DEL,AATCCTCAAAAATTCA,AATCCTCAAAAATTCA,-,ENST00000025301.2:c.1190_1205del,p.N397Sfs*36,397.0,ENST00000025301,51108
2,PD4938a,PD4938a,ELF3,1,201981525,201981541,Frame_Shift_Del,DEL,GGCATGGCCTTCCAGGA,GGCATGGCCTTCCAGGA,-,ENST00000359651.3:c.442_458del,p.M148Pfs*7,147.0,ENST00000359651,9244
3,PD4843a,PD4843a,KMT2C,7,151845512,151845530,Frame_Shift_Del,DEL,AGTTTTGTCCTTAAAAAAC,AGTTTTGTCCTTAAAAAAC,-,ENST00000262189.6:c.13482_13500del,p.M4494Ifs*17,4494.0,ENST00000262189,301081
5,PD4602a,PD4602a,TMEM209,7,129825062,129825083,Frame_Shift_Del,DEL,GGCTTCATCTTTGTTAGCACAT,GGCTTCATCTTTGTTAGCACAT,-,ENST00000397622.2:c.900_921del,p.C301Ifs*17,300.0,ENST00000397622,43056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5490,PD4937a,PD4937a,TP53,17,7577058,7577058,Nonsense_Mutation,SNP,C,C,A,ENST00000269305.4:c.880G>T,p.E294*,294.0,ENST00000269305,25760
5491,PD4600a,PD4600a,TP53,17,7578239,7578239,Nonsense_Mutation,SNP,C,C,A,ENST00000269305.4:c.610G>T,p.E204*,204.0,ENST00000269305,25760
5492,PD4935a,PD4935a,TP53,17,7578275,7578275,Nonsense_Mutation,SNP,G,G,A,ENST00000269305.4:c.574C>T,p.Q192*,192.0,ENST00000269305,25760
5517,PD4120a,PD4120a,SMAD4,18,48575671,48575671,Nonsense_Mutation,SNP,C,C,G,ENST00000342988.3:c.431C>G,p.S144*,144.0,ENST00000342988,117006


In [56]:
# Save the final dataset (mutations + gene lengths) to a CSV in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
data_mutation_Stephens_FINAL.to_csv('mutation_table_Stephens_et_al_Nature_2012_FINAL.csv')

### Krug et al., Cell 2020

In [57]:
# Switch the working directory to the Krug et al. dataset folder so the relative file names
# used in the following cells resolve there.
# NOTE(review): hard-coded absolute path - it only exists on the original author's machine.
os.chdir("C:/Users/Cristina/OneDrive/Documenti/BCG/Tesi/Datasets/Proteogenomic landscape of breast cancer (CPTAC, Cell 2020)")

In [58]:
# Load the Krug mutation table; sep=None with the python engine lets pandas sniff the delimiter.
# The first column of the file is discarded (presumably a saved index - confirm).
data_mutation_Krug = pd.read_csv(
    'mutation_table_Krug_et_al_Cell_2018.csv',
    sep=None,
    engine='python',
).iloc[:, 1:]
data_mutation_Krug

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID,ENSP
0,CPT000814,CPT000814,SZT2,1,43908878,43908878,Missense_Mutation,SNP,G,G,C,c.8268G>C,p.K2756N,2756/3375,ENST00000562955,ENSP00000457168
1,CPT000814,CPT000814,AGBL4,1,49283877,49283877,Intron,SNP,A,A,C,c.634+48986T>G,,,ENST00000371839,ENSP00000360905
2,CPT000814,CPT000814,PTGER3,1,71512957,71512957,Missense_Mutation,SNP,C,C,T,c.304G>A,p.G102R,102/390,ENST00000306666,ENSP00000302313
3,CPT000814,CPT000814,HFM1,1,91859745,91859745,Silent,SNP,G,G,A,c.399C>T,p.G133=,133/1435,ENST00000370425,ENSP00000359454
4,CPT000814,CPT000814,PRUNE,1,150999720,150999720,Missense_Mutation,SNP,G,G,A,c.691G>A,p.E231K,231/453,ENST00000271620,ENSP00000271620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29013,X22BR006,X22BR006,ZNF652,17,47375881,47375894,Frame_Shift_Del,DEL,GGCGGGAGGTGAGG,GGCGGGAGGTGAGG,-,c.1702_1715del,p.P568Tfs*6,568-572/606,ENST00000362063,ENSP00000354686
29014,X22BR006,X22BR006,GALK1,17,73754610,73754610,Silent,SNP,C,C,T,c.864G>A,p.T288=,288/392,ENST00000225614,ENSP00000225614
29015,X22BR006,X22BR006,ZNF584,19,58928243,58928243,Missense_Mutation,SNP,C,C,A,c.358C>A,p.Q120K,120/421,ENST00000306910,ENSP00000306756
29016,X22BR006,X22BR006,ASXL1,20,31022238,31022238,Nonsense_Mutation,SNP,C,C,T,c.1723C>T,p.Q575*,575/1541,ENST00000375687,ENSP00000364839


In [59]:
# Number of distinct gene symbols in the Krug mutation table.
data_mutation_Krug['Hugo_Symbol'].nunique()

11775

In [60]:
# Number of distinct transcript IDs in the Krug mutation table.
data_mutation_Krug['Transcript_ID'].nunique()

11839

In [61]:
# Build a table indexed by the unique Transcript_IDs with a 'Hugo_Symbol' column initialised to 0
# (see Vett_transcript near the top of the notebook).
df_vett_transcript_Krug = Vett_transcript(data_mutation_Krug)
df_vett_transcript_Krug 

Unnamed: 0,Hugo_Symbol
ENST00000562955,0
ENST00000371839,0
ENST00000306666,0
ENST00000370425,0
ENST00000271620,0
...,...
ENST00000372037,0
ENST00000315580,0
ENST00000362063,0
ENST00000225614,0


In [62]:
# Fill 'Hugo_Symbol' with the pyensembl gene name of each transcript; Ricavo_Transcript performs
# one lookup per mutation row and returns the same frame it was given.
df_vett_transcript_Krug = Ricavo_Transcript(data_mutation_Krug, df_vett_transcript_Krug)
df_vett_transcript_Krug 

Unnamed: 0,Hugo_Symbol
ENST00000562955,SZT2
ENST00000371839,AGBL4
ENST00000306666,PTGER3
ENST00000370425,HFM1
ENST00000271620,PRUNE
...,...
ENST00000372037,BMPR1A
ENST00000315580,ARL6IP4
ENST00000362063,ZNF652
ENST00000225614,GALK1


In [63]:
# Vett_geni is defined earlier in the notebook (outside this excerpt) and returns two values.
# NOTE(review): presumably a zero-initialised 'Gene_length' table indexed by gene symbol plus
# the list of gene symbols - confirm against the helper's definition.
df_vett_geni_Krug, index_geni_Krug = Vett_geni(df_vett_transcript_Krug)
df_vett_geni_Krug

Unnamed: 0,Gene_length
SZT2,0
AGBL4,0
PTGER3,0
HFM1,0
PRUNE,0
...,...
BMPR1A,0
ARL6IP4,0
ZNF652,0
GALK1,0


In [64]:
# Display the index of the gene table as a NumPy array (inspection only: nothing is assigned).
df_vett_geni_Krug.index.values

array(['SZT2', 'AGBL4', 'PTGER3', ..., 'ZNF652', 'GALK1', 'ZNF584'],
      dtype=object)

In [65]:
# Lunghezza_geni is defined earlier in the notebook (outside this excerpt).
# NOTE(review): presumably it fills 'Gene_length' for every gene in index_geni_Krug - confirm.
df_vett_geni_Krug = Lunghezza_geni(index_geni_Krug, df_vett_geni_Krug)
df_vett_geni_Krug 

Unnamed: 0,Gene_length
SZT2,62769
AGBL4,1491059
PTGER3,195456
HFM1,144104
PRUNE,27294
...,...
BMPR1A,176189
ARL6IP4,2850
ZNF652,73268
GALK1,14118


In [66]:
# Promote the gene-symbol index to a regular 'Hugo_symbol' column.
# The guard keeps the cell idempotent: the previous in-place reset_index/rename silently
# produced a second 'Hugo_symbol' column (holding row numbers) when the cell was run twice.
if 'Hugo_symbol' not in df_vett_geni_Krug.columns:
    df_vett_geni_Krug = (
        df_vett_geni_Krug
        .rename_axis('Hugo_symbol')   # name the index so reset_index creates the right column
        .reset_index()
    )
df_vett_geni_Krug

Unnamed: 0,Hugo_symbol,Gene_length
0,SZT2,62769
1,AGBL4,1491059
2,PTGER3,195456
3,HFM1,144104
4,PRUNE,27294
...,...,...
11773,BMPR1A,176189
11774,ARL6IP4,2850
11775,ZNF652,73268
11776,GALK1,14118


In [67]:
# Promote the transcript-ID index to a regular 'Transcript_ID' column.
# The guard keeps the cell idempotent: the previous in-place reset_index/rename silently
# produced a second 'Transcript_ID' column (holding row numbers) when the cell was run twice.
if 'Transcript_ID' not in df_vett_transcript_Krug.columns:
    df_vett_transcript_Krug = (
        df_vett_transcript_Krug
        .rename_axis('Transcript_ID')   # name the index so reset_index creates the right column
        .reset_index()
    )
df_vett_transcript_Krug

Unnamed: 0,Transcript_ID,Hugo_Symbol
0,ENST00000562955,SZT2
1,ENST00000371839,AGBL4
2,ENST00000306666,PTGER3
3,ENST00000370425,HFM1
4,ENST00000271620,PRUNE
...,...,...
11834,ENST00000372037,BMPR1A
11835,ENST00000315580,ARL6IP4
11836,ENST00000362063,ZNF652
11837,ENST00000225614,GALK1


In [68]:
# Attach the gene length to every transcript through its gene symbol, then drop the symbol
# so that only Transcript_ID -> Gene_length remains.
df_transcript_length_Krug = (
    df_vett_transcript_Krug
    .join(df_vett_geni_Krug.set_index('Hugo_symbol'), on='Hugo_Symbol')
    .drop(columns='Hugo_Symbol')
)
df_transcript_length_Krug

Unnamed: 0,Transcript_ID,Gene_length
0,ENST00000562955,62769
1,ENST00000371839,1491059
2,ENST00000306666,195456
3,ENST00000370425,144104
4,ENST00000271620,27294
...,...,...
11834,ENST00000372037,176189
11835,ENST00000315580,2850
11836,ENST00000362063,73268
11837,ENST00000225614,14118


In [69]:
# Build the final Krug table in one chain:
#   1. discard the Hugo symbols that came with the dataset,
#   2. add the gene length of each mutation's transcript,
#   3. add the Hugo symbols obtained from pyensembl,
#   4. keep the output columns in a fixed order.
data_mutation_Krug_FINAL = (
    data_mutation_Krug
    .drop(columns='Hugo_Symbol')
    .join(df_transcript_length_Krug.set_index('Transcript_ID'), on='Transcript_ID')
    .join(df_vett_transcript_Krug.set_index('Transcript_ID'), on='Transcript_ID')
    .loc[:, [
        'Tumor_Sample_Barcode', 'PATIENT_ID', 'Hugo_Symbol', 'Chromosome',
        'Start_Position', 'End_Position', 'Variant_Classification', 'Variant_Type',
        'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
        'HGVSc', 'HGVSp_Short', 'Protein_position', 'Transcript_ID', 'Gene_length',
    ]]
)
data_mutation_Krug_FINAL

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID,Gene_length
0,CPT000814,CPT000814,SZT2,1,43908878,43908878,Missense_Mutation,SNP,G,G,C,c.8268G>C,p.K2756N,2756/3375,ENST00000562955,62769
1,CPT000814,CPT000814,AGBL4,1,49283877,49283877,Intron,SNP,A,A,C,c.634+48986T>G,,,ENST00000371839,1491059
2,CPT000814,CPT000814,PTGER3,1,71512957,71512957,Missense_Mutation,SNP,C,C,T,c.304G>A,p.G102R,102/390,ENST00000306666,195456
3,CPT000814,CPT000814,HFM1,1,91859745,91859745,Silent,SNP,G,G,A,c.399C>T,p.G133=,133/1435,ENST00000370425,144104
4,CPT000814,CPT000814,PRUNE,1,150999720,150999720,Missense_Mutation,SNP,G,G,A,c.691G>A,p.E231K,231/453,ENST00000271620,27294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29013,X22BR006,X22BR006,ZNF652,17,47375881,47375894,Frame_Shift_Del,DEL,GGCGGGAGGTGAGG,GGCGGGAGGTGAGG,-,c.1702_1715del,p.P568Tfs*6,568-572/606,ENST00000362063,73268
29014,X22BR006,X22BR006,GALK1,17,73754610,73754610,Silent,SNP,C,C,T,c.864G>A,p.T288=,288/392,ENST00000225614,14118
29015,X22BR006,X22BR006,ZNF584,19,58928243,58928243,Missense_Mutation,SNP,C,C,A,c.358C>A,p.Q120K,120/421,ENST00000306910,16824
29016,X22BR006,X22BR006,ASXL1,20,31022238,31022238,Nonsense_Mutation,SNP,C,C,T,c.1723C>T,p.Q575*,575/1541,ENST00000375687,80968


In [70]:
# Save the final dataset (mutations + gene lengths) to a CSV in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
data_mutation_Krug_FINAL.to_csv('mutation_table_Krug_et_al_Cell_2018_FINAL.csv')

### The Metastatic Breast Cancer Project

In [71]:
# Switch the working directory to the Metastatic Breast Cancer Project dataset folder so the
# relative file names used in the following cells resolve there.
# NOTE(review): hard-coded absolute path - it only exists on the original author's machine.
os.chdir("C:/Users/Cristina/OneDrive/Documenti/BCG/Tesi/Datasets/The Metastatic Breast Cancer Project (Provisional, December 2021)")

In [72]:
# Load the MBC mutation table; sep=None with the python engine lets pandas sniff the delimiter.
# The first column of the file is discarded (presumably a saved index - confirm).
data_mutation_MBC = pd.read_csv(
    'mutation_table_TheMetastaticBreastCancerProject_2021.csv',
    sep=None,
    engine='python',
).iloc[:, 1:]
data_mutation_MBC

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Num_samples_per_patient,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,ACAP3,1,1229017,1229017,Missense_Mutation,SNP,G,G,A,ENST00000354700.5:c.2432C>T,p.A811V,811,ENST00000354700
1,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,CPSF3L,1,1256201,1256201,Intron,SNP,C,C,A,ENST00000540437.1:c.144+175G>T,p.*48*,,ENST00000540437
2,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,ECE1,1,21605869,21605869,Intron,SNP,G,G,A,ENST00000374893.6:c.139-44C>T,p.*47*,,ENST00000374893
3,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,SLC6A17,1,110740989,110740989,Missense_Mutation,SNP,C,C,A,ENST00000331565.4:c.2107C>A,p.L703M,703,ENST00000331565
4,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,LCE5A,1,152484252,152484252,Missense_Mutation,SNP,G,G,A,ENST00000334269.2:c.242G>A,p.R81Q,81,ENST00000334269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55966,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,STAMBP,2,74058135,74058136,Frame_Shift_Ins,INS,-,-,T,ENST00000339566.3:c.153dup,p.E52*,51,ENST00000339566
55967,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,OR5K4,3,98073501,98073505,Frame_Shift_Del,DEL,TACAC,TACAC,-,ENST00000354924.2:c.804_808del,p.T269Sfs*18,268,ENST00000354924
55968,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,GBX1,7,150846045,150846046,In_Frame_Ins,INS,-,-,CCCAGGCTT,ENST00000297537.4:c.714_722dup,p.S239_G241dup,239,ENST00000297537
55969,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,CDH1,16,68842661,68842662,Frame_Shift_Ins,INS,-,-,C,ENST00000261769.5:c.602dup,p.V202Cfs*7,199,ENST00000261769


In [73]:
# Inspect the rows that have no Transcript_ID.
data_mutation_MBC.loc[data_mutation_MBC['Transcript_ID'].isna()]

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Num_samples_per_patient,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
1049,rs200277687,,,TP53,17,7577479,7577511,Splice_Site,DEL,GGTGGCAAGTGGCTCCTGACCTGGAGTCTTCCA,GGTGGCAAGTGGCTCCTGACCTGGAGTCTTCCA,-,41,,,
3884,MBC-MBCProject_VrsMsqTb-Tumor-SM-CGLI9,MBCProject_VrsMsqTb,2.0,Unknown,22,28208688,28208688,IGR,SNP,G,G,A,,,,
4724,MBC-MBCProject_57iLiJIl-Tumor-SM-CGLIV,MBCProject_57iLiJIl,1.0,Unknown,12,66501918,66501918,IGR,SNP,G,G,C,,,,
5597,MBC-MBCProject_qmu6TYto-Tumor-SM-DL3BW,MBCProject_qmu6TYto,1.0,Unknown,1,173644491,173644491,IGR,SNP,T,T,C,,,,
5770,MBC-MBCProject_qmu6TYto-Tumor-SM-DL3BW,MBCProject_qmu6TYto,1.0,Unknown,11,65803106,65803106,IGR,SNP,A,A,G,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51297,rs202167678,,,JAK3,19,17953321,17953337,Frame_Shift_Del,DEL,CGCAGGGCTCTGCGCAC,CGCAGGGCTCTGCGCAC,-,__UNKNOWN__,33,ENST00000458235.1:c.649_665del,
52244,RP-1156_MBCProject_lqSlSztO_T2_v2_Exome,MBCProject_lqSlSztO,1.0,Unknown,11,65803166,65803166,IGR,SNP,G,G,A,,,,
53558,RP-1156_MBCProject_ELtvf4UO_BLOOD_P_v2_Exome,MBCProject_ELtvf4UO,1.0,Unknown,22,46402793,46402793,IGR,SNP,G,G,A,,,,
54369,rs368771578,,,TP53,17,7579363,7579374,In_Frame_Del,DEL,ACCGTAGCTGCC,ACCGTAGCTGCC,-,56,,p.Gly105_Gly108del,


In [74]:
# Keep only the rows that have a Transcript_ID.
data_mutation_MBC = data_mutation_MBC[data_mutation_MBC['Transcript_ID'].notna()]
data_mutation_MBC

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Num_samples_per_patient,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,ACAP3,1,1229017,1229017,Missense_Mutation,SNP,G,G,A,ENST00000354700.5:c.2432C>T,p.A811V,811,ENST00000354700
1,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,CPSF3L,1,1256201,1256201,Intron,SNP,C,C,A,ENST00000540437.1:c.144+175G>T,p.*48*,,ENST00000540437
2,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,ECE1,1,21605869,21605869,Intron,SNP,G,G,A,ENST00000374893.6:c.139-44C>T,p.*47*,,ENST00000374893
3,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,SLC6A17,1,110740989,110740989,Missense_Mutation,SNP,C,C,A,ENST00000331565.4:c.2107C>A,p.L703M,703,ENST00000331565
4,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,LCE5A,1,152484252,152484252,Missense_Mutation,SNP,G,G,A,ENST00000334269.2:c.242G>A,p.R81Q,81,ENST00000334269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55966,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,STAMBP,2,74058135,74058136,Frame_Shift_Ins,INS,-,-,T,ENST00000339566.3:c.153dup,p.E52*,51,ENST00000339566
55967,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,OR5K4,3,98073501,98073505,Frame_Shift_Del,DEL,TACAC,TACAC,-,ENST00000354924.2:c.804_808del,p.T269Sfs*18,268,ENST00000354924
55968,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,GBX1,7,150846045,150846046,In_Frame_Ins,INS,-,-,CCCAGGCTT,ENST00000297537.4:c.714_722dup,p.S239_G241dup,239,ENST00000297537
55969,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,CDH1,16,68842661,68842662,Frame_Shift_Ins,INS,-,-,C,ENST00000261769.5:c.602dup,p.V202Cfs*7,199,ENST00000261769


In [75]:
# Keep only the rows whose Transcript_ID starts with 'ENST'.
data_mutation_MBC_ENST = data_mutation_MBC.loc[
    data_mutation_MBC['Transcript_ID'].str.startswith('ENST')
]
data_mutation_MBC_ENST

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Num_samples_per_patient,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,ACAP3,1,1229017,1229017,Missense_Mutation,SNP,G,G,A,ENST00000354700.5:c.2432C>T,p.A811V,811,ENST00000354700
1,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,CPSF3L,1,1256201,1256201,Intron,SNP,C,C,A,ENST00000540437.1:c.144+175G>T,p.*48*,,ENST00000540437
2,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,ECE1,1,21605869,21605869,Intron,SNP,G,G,A,ENST00000374893.6:c.139-44C>T,p.*47*,,ENST00000374893
3,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,SLC6A17,1,110740989,110740989,Missense_Mutation,SNP,C,C,A,ENST00000331565.4:c.2107C>A,p.L703M,703,ENST00000331565
4,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,LCE5A,1,152484252,152484252,Missense_Mutation,SNP,G,G,A,ENST00000334269.2:c.242G>A,p.R81Q,81,ENST00000334269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55966,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,STAMBP,2,74058135,74058136,Frame_Shift_Ins,INS,-,-,T,ENST00000339566.3:c.153dup,p.E52*,51,ENST00000339566
55967,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,OR5K4,3,98073501,98073505,Frame_Shift_Del,DEL,TACAC,TACAC,-,ENST00000354924.2:c.804_808del,p.T269Sfs*18,268,ENST00000354924
55968,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,GBX1,7,150846045,150846046,In_Frame_Ins,INS,-,-,CCCAGGCTT,ENST00000297537.4:c.714_722dup,p.S239_G241dup,239,ENST00000297537
55969,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,CDH1,16,68842661,68842662,Frame_Shift_Ins,INS,-,-,C,ENST00000261769.5:c.602dup,p.V202Cfs*7,199,ENST00000261769


In [76]:
# Select only the rows whose Transcript_ID does not start with 'ENST'.
# A negated boolean mask is used instead of concat + drop_duplicates(keep=False): that trick
# also discarded non-ENST rows that happened to be exact duplicates of each other, and it
# needed a throwaway concatenated frame.
data_mutation_MBC_withoutENST = data_mutation_MBC[
    ~data_mutation_MBC['Transcript_ID'].str.startswith('ENST', na=False)
].copy()
data_mutation_MBC_withoutENST

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Num_samples_per_patient,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
1505,,,,CAV3,3,8787403,8787403,Silent,SNP,G,G,C,1,p.Ala102=,NM_001234.4,p.A102=
4361,,,,HLA-B,6,31324579,31324579,Missense_Mutation,SNP,C,C,G,0,p.Glu77Gln,NM_005514.6,p.E77Q
5169,,,,PIK3CA,3,178928071,178928085,In_Frame_Del,DEL,ATGGATTAGAAGATT,ATGGATTAGAAGATT,-,,p.His450_Asp454del,NM_006218.2,p.H450_D454del
5602,,,,ASPM,1,197070002,197070002,Missense_Mutation,SNP,C,C,G,0,p.Met2793Ile,NM_018136.4,p.M2793I
7238,,,,MUC16,19,9088439,9088456,In_Frame_Del,DEL,TTGTTCCATCTCTAGGTA,TTGTTCCATCTCTAGGTA,-,,p.Ile1120_Thr1125del,NM_024690.2,p.I1120_T1125del
8283,,,,TRIM8,10,104404673,104404698,Frame_Shift_Del,DEL,CGCTGCCCGCGCAGAAGGTCTGCCTG,CGCTGCCCGCGCAGAAGGTCTGCCTG,-,,p.Pro102ArgfsTer78,NM_030912.2,p.P102Rfs*78
9205,rs368029356,,,NEURL4,17,7226081,7226106,Frame_Shift_Del,DEL,CGGTGGCAGTGTTGCTGGTCGCCAGG,CGGTGGCAGTGTTGCTGGTCGCCAGG,-,,ENST00000399464.2:c.2673_2698del,ENST00000399464,p.Ser891ArgfsTer54
9206,,,,EPG5,18,43492336,43492349,Frame_Shift_Del,DEL,GTGACCAGAGCCTG,GTGACCAGAGCCTG,-,,p.Gln1297ThrfsTer3,NM_020964.2,p.Q1297Tfs*3
9274,,,,ZFPM1,16,88600188,88600203,Frame_Shift_Del,DEL,GACGCGCCTGCCGCGC,GACGCGCCTGCCGCGC,-,,p.Asp608AlafsTer185,NM_153813.2,p.D608Afs*185
9275,,,,ERBB2,17,37880220,37880225,Nonsense_Mutation,DEL,TGAGGG,TGAGGG,-,,p.Leu755_Glu757delinsTer,,p.L755_E757delins*


In [77]:
# Count the rows that have no PATIENT_ID.
data_mutation_MBC['PATIENT_ID'].isna().sum()

85

In [78]:
# Remove all the rows for which PATIENT_ID is missing.
data_mutation_MBC = data_mutation_MBC.dropna(subset=['PATIENT_ID'])
# This frame (not data_mutation_MBC_ENST) is the one passed to the pyensembl lookups below,
# so make the assumption explicit: every remaining Transcript_ID must be an Ensembl
# transcript ID. Failing here gives a clearer error than a failed lookup later on.
assert data_mutation_MBC['Transcript_ID'].str.startswith('ENST', na=False).all(), (
    "non-Ensembl Transcript_ID values remain after dropping rows without PATIENT_ID"
)
data_mutation_MBC

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Num_samples_per_patient,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID
0,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,ACAP3,1,1229017,1229017,Missense_Mutation,SNP,G,G,A,ENST00000354700.5:c.2432C>T,p.A811V,811,ENST00000354700
1,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,CPSF3L,1,1256201,1256201,Intron,SNP,C,C,A,ENST00000540437.1:c.144+175G>T,p.*48*,,ENST00000540437
2,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,ECE1,1,21605869,21605869,Intron,SNP,G,G,A,ENST00000374893.6:c.139-44C>T,p.*47*,,ENST00000374893
3,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,SLC6A17,1,110740989,110740989,Missense_Mutation,SNP,C,C,A,ENST00000331565.4:c.2107C>A,p.L703M,703,ENST00000331565
4,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,2.0,LCE5A,1,152484252,152484252,Missense_Mutation,SNP,G,G,A,ENST00000334269.2:c.242G>A,p.R81Q,81,ENST00000334269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55966,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,STAMBP,2,74058135,74058136,Frame_Shift_Ins,INS,-,-,T,ENST00000339566.3:c.153dup,p.E52*,51,ENST00000339566
55967,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,OR5K4,3,98073501,98073505,Frame_Shift_Del,DEL,TACAC,TACAC,-,ENST00000354924.2:c.804_808del,p.T269Sfs*18,268,ENST00000354924
55968,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,GBX1,7,150846045,150846046,In_Frame_Ins,INS,-,-,CCCAGGCTT,ENST00000297537.4:c.714_722dup,p.S239_G241dup,239,ENST00000297537
55969,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,1.0,CDH1,16,68842661,68842662,Frame_Shift_Ins,INS,-,-,C,ENST00000261769.5:c.602dup,p.V202Cfs*7,199,ENST00000261769


In [79]:
# Number of distinct gene symbols in the MBC mutation table.
data_mutation_MBC['Hugo_Symbol'].nunique()

14513

In [80]:
# Number of distinct transcript IDs in the MBC mutation table.
data_mutation_MBC['Transcript_ID'].nunique()

15039

In [81]:
# Build a table indexed by the unique Transcript_IDs with a 'Hugo_Symbol' column initialised to 0
# (see Vett_transcript near the top of the notebook).
df_vett_transcript_MBC = Vett_transcript(data_mutation_MBC)
df_vett_transcript_MBC 

Unnamed: 0,Hugo_Symbol
ENST00000354700,0
ENST00000540437,0
ENST00000374893,0
ENST00000331565,0
ENST00000334269,0
...,...
ENST00000578561,0
ENST00000210313,0
ENST00000337532,0
ENST00000357685,0


In [82]:
# Fill 'Hugo_Symbol' with the pyensembl gene name of each transcript; Ricavo_Transcript performs
# one lookup per mutation row and returns the same frame it was given.
df_vett_transcript_MBC = Ricavo_Transcript(data_mutation_MBC, df_vett_transcript_MBC)
df_vett_transcript_MBC 

Unnamed: 0,Hugo_Symbol
ENST00000354700,ACAP3
ENST00000540437,CPSF3L
ENST00000374893,ECE1
ENST00000331565,SLC6A17
ENST00000334269,LCE5A
...,...
ENST00000578561,RP11-149I2.5
ENST00000210313,PSMD5
ENST00000337532,MPP7
ENST00000357685,BCO2


In [83]:
# Vett_geni is defined earlier in the notebook (outside this excerpt) and returns two values.
# NOTE(review): presumably a zero-initialised 'Gene_length' table indexed by gene symbol plus
# the list of gene symbols - confirm against the helper's definition.
df_vett_geni_MBC, index_geni_MBC = Vett_geni(df_vett_transcript_MBC)
df_vett_geni_MBC

Unnamed: 0,Gene_length
ACAP3,0
CPSF3L,0
ECE1,0
SLC6A17,0
LCE5A,0
...,...
RP11-149I2.5,0
PSMD5,0
MPP7,0
BCO2,0


In [84]:
# Display the index of the gene table as a NumPy array (inspection only: nothing is assigned).
df_vett_geni_MBC.index.values

array(['ACAP3', 'CPSF3L', 'ECE1', ..., 'MPP7', 'BCO2', 'KLK9'],
      dtype=object)

In [85]:
# Lunghezza_geni is defined earlier in the notebook (outside this excerpt).
# NOTE(review): presumably it fills 'Gene_length' for every gene in index_geni_MBC - confirm.
df_vett_geni_MBC = Lunghezza_geni(index_geni_MBC, df_vett_geni_MBC)
df_vett_geni_MBC 

Unnamed: 0,Gene_length
ACAP3,17234
CPSF3L,13107
ECE1,128258
SLC6A17,51717
LCE5A,1334
...,...
RP11-149I2.5,1617
PSMD5,27489
MPP7,283494
BCO2,49233


In [86]:
# Promote the gene-symbol index to a regular 'Hugo_symbol' column.
# The guard keeps the cell idempotent: the previous in-place reset_index/rename silently
# produced a second 'Hugo_symbol' column (holding row numbers) when the cell was run twice.
if 'Hugo_symbol' not in df_vett_geni_MBC.columns:
    df_vett_geni_MBC = (
        df_vett_geni_MBC
        .rename_axis('Hugo_symbol')   # name the index so reset_index creates the right column
        .reset_index()
    )
df_vett_geni_MBC

Unnamed: 0,Hugo_symbol,Gene_length
0,ACAP3,17234
1,CPSF3L,13107
2,ECE1,128258
3,SLC6A17,51717
4,LCE5A,1334
...,...,...
14886,RP11-149I2.5,1617
14887,PSMD5,27489
14888,MPP7,283494
14889,BCO2,49233


In [87]:
# Spot check: gene-length row for SNHG14.
df_vett_geni_MBC[df_vett_geni_MBC['Hugo_symbol'].eq('SNHG14')]

Unnamed: 0,Hugo_symbol,Gene_length


In [88]:
# Spot check: gene-length row for ZDHHC20.
df_vett_geni_MBC[df_vett_geni_MBC['Hugo_symbol'].eq('ZDHHC20')]

Unnamed: 0,Hugo_symbol,Gene_length
11553,ZDHHC20,83247


In [89]:
# Promote the transcript-ID index to a regular 'Transcript_ID' column.
# The guard keeps the cell idempotent: the previous in-place reset_index/rename silently
# produced a second 'Transcript_ID' column (holding row numbers) when the cell was run twice.
if 'Transcript_ID' not in df_vett_transcript_MBC.columns:
    df_vett_transcript_MBC = (
        df_vett_transcript_MBC
        .rename_axis('Transcript_ID')   # name the index so reset_index creates the right column
        .reset_index()
    )
df_vett_transcript_MBC

Unnamed: 0,Transcript_ID,Hugo_Symbol
0,ENST00000354700,ACAP3
1,ENST00000540437,CPSF3L
2,ENST00000374893,ECE1
3,ENST00000331565,SLC6A17
4,ENST00000334269,LCE5A
...,...,...
15034,ENST00000578561,RP11-149I2.5
15035,ENST00000210313,PSMD5
15036,ENST00000337532,MPP7
15037,ENST00000357685,BCO2


In [90]:
# Spot check: gene symbol assigned to transcript ENST00000410815.
df_vett_transcript_MBC[df_vett_transcript_MBC['Transcript_ID'].eq('ENST00000410815')]

Unnamed: 0,Transcript_ID,Hugo_Symbol
4145,ENST00000410815,RNA5SP25


In [91]:
# Spot check: gene symbol assigned to transcript ENST00000581098.
df_vett_transcript_MBC[df_vett_transcript_MBC['Transcript_ID'].eq('ENST00000581098')]

Unnamed: 0,Transcript_ID,Hugo_Symbol
537,ENST00000581098,MIR3135B


In [92]:
# Spot check: transcripts mapped to the gene ZDHHC20.
df_vett_transcript_MBC[df_vett_transcript_MBC['Hugo_Symbol'].eq('ZDHHC20')]

Unnamed: 0,Transcript_ID,Hugo_Symbol
11638,ENST00000400590,ZDHHC20


In [93]:
# Attach the gene length to every transcript through its gene symbol, then drop the symbol
# so that only Transcript_ID -> Gene_length remains.
df_transcript_length_MBC = (
    df_vett_transcript_MBC
    .join(df_vett_geni_MBC.set_index('Hugo_symbol'), on='Hugo_Symbol')
    .drop(columns='Hugo_Symbol')
)
df_transcript_length_MBC

Unnamed: 0,Transcript_ID,Gene_length
0,ENST00000354700,17234
1,ENST00000540437,13107
2,ENST00000374893,128258
3,ENST00000331565,51717
4,ENST00000334269,1334
...,...,...
15034,ENST00000578561,1617
15035,ENST00000210313,27489
15036,ENST00000337532,283494
15037,ENST00000357685,49233


In [94]:
# Build the final MBC table in one chain:
#   1. discard the Hugo symbols that came with the dataset,
#   2. add the gene length of each mutation's transcript,
#   3. add the Hugo symbols obtained from pyensembl,
#   4. keep the output columns in a fixed order (columns not listed are dropped).
data_mutation_MBC_FINAL = (
    data_mutation_MBC
    .drop(columns='Hugo_Symbol')
    .join(df_transcript_length_MBC.set_index('Transcript_ID'), on='Transcript_ID')
    .join(df_vett_transcript_MBC.set_index('Transcript_ID'), on='Transcript_ID')
    .loc[:, [
        'Tumor_Sample_Barcode', 'PATIENT_ID', 'Hugo_Symbol', 'Chromosome',
        'Start_Position', 'End_Position', 'Variant_Classification', 'Variant_Type',
        'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
        'HGVSc', 'HGVSp_Short', 'Protein_position', 'Transcript_ID', 'Gene_length',
    ]]
)
data_mutation_MBC_FINAL

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID,Gene_length
0,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,ACAP3,1,1229017,1229017,Missense_Mutation,SNP,G,G,A,ENST00000354700.5:c.2432C>T,p.A811V,811,ENST00000354700,17234
1,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,CPSF3L,1,1256201,1256201,Intron,SNP,C,C,A,ENST00000540437.1:c.144+175G>T,p.*48*,,ENST00000540437,13107
2,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,ECE1,1,21605869,21605869,Intron,SNP,G,G,A,ENST00000374893.6:c.139-44C>T,p.*47*,,ENST00000374893,128258
3,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,SLC6A17,1,110740989,110740989,Missense_Mutation,SNP,C,C,A,ENST00000331565.4:c.2107C>A,p.L703M,703,ENST00000331565,51717
4,MBC-MBCProject_GvHkH2Hk-Tumor-SM-AZ5HV,MBCProject_GvHkH2Hk,LCE5A,1,152484252,152484252,Missense_Mutation,SNP,G,G,A,ENST00000334269.2:c.242G>A,p.R81Q,81,ENST00000334269,1334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55966,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,STAMBP,2,74058135,74058136,Frame_Shift_Ins,INS,-,-,T,ENST00000339566.3:c.153dup,p.E52*,51,ENST00000339566,44701
55967,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,OR5K4,3,98073501,98073505,Frame_Shift_Del,DEL,TACAC,TACAC,-,ENST00000354924.2:c.804_808del,p.T269Sfs*18,268,ENST00000354924,966
55968,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,GBX1,7,150846045,150846046,In_Frame_Ins,INS,-,-,CCCAGGCTT,ENST00000297537.4:c.714_722dup,p.S239_G241dup,239,ENST00000297537,26157
55969,RP-1156_MBCProject_86HMf8FA_T2_v2_Exome,MBCProject_86HMf8FA,CDH1,16,68842661,68842662,Frame_Shift_Ins,INS,-,-,C,ENST00000261769.5:c.602dup,p.V202Cfs*7,199,ENST00000261769,98324


In [95]:
# Spot check: mutations of ZDHHC20 in the final table.
data_mutation_MBC_FINAL[data_mutation_MBC_FINAL['Hugo_Symbol'].eq('ZDHHC20')]

Unnamed: 0,Tumor_Sample_Barcode,PATIENT_ID,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,HGVSc,HGVSp_Short,Protein_position,Transcript_ID,Gene_length
29581,RP-1156_MBCProject_YNTyfDfq_T1B_v1_Exome,MBCProject_YNTyfDfq,ZDHHC20,13,21965974,21965974,Missense_Mutation,SNP,C,C,A,ENST00000400590.3:c.614G>T,p.R205L,205.0,ENST00000400590,83247
36202,RP-1156_MBCProject_PkTDsOSa_T1A_v2_Exome,MBCProject_PkTDsOSa,ZDHHC20,13,21949165,21949165,3'Flank,SNP,T,T,C,,,,ENST00000400590,83247
45654,RP-1156_MBCProject_W7HMcZI4_T1_v1_Exome,MBCProject_W7HMcZI4,ZDHHC20,13,21949189,21949189,3'Flank,SNP,A,A,C,,,,ENST00000400590,83247
46419,RP-1156_MBCProject_W7HMcZI4_T4_v1_Exome,MBCProject_W7HMcZI4,ZDHHC20,13,21949185,21949185,3'Flank,SNP,C,C,A,,,,ENST00000400590,83247
46420,RP-1156_MBCProject_W7HMcZI4_T4_v1_Exome,MBCProject_W7HMcZI4,ZDHHC20,13,21949189,21949189,3'Flank,SNP,A,A,C,,,,ENST00000400590,83247


In [96]:
# Save the final dataset (mutations + gene lengths) to a CSV in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
data_mutation_MBC_FINAL.to_csv('mutation_table_MBC_FINAL.csv')