In [1]:
import pandas as pd
import seaborn as sns 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Input file locations
#
# Every input lives under one project folder, so the machine-specific prefix is
# defined once here: edit PROJECT_DIR to run the notebook on another machine.
PROJECT_DIR = '/Users/jake/OneDrive - University of Glasgow/Project/COAD'
FPKM_MAF_DIR = PROJECT_DIR + '/FPKM_MAF_notebook'

# MAF protein coding files (tumour / normal)
MAF_VEP_protein_tumour_file = FPKM_MAF_DIR + '/COAD_protein_coding_tumour.csv'
MAF_VEP_protein_normal_file = FPKM_MAF_DIR + '/COAD_protein_coding_normal.csv'

# Gene expression FPKM files (tumour / normal)
FPKM_tumour_file = FPKM_MAF_DIR + '/COAD_FPKM_tumour.csv'
FPKM_normal_file = FPKM_MAF_DIR + '/COAD_FPKM_normal.csv'

# Isoform switch file
isoform_switches_file = PROJECT_DIR + '/isoform_switch/coad-isoform-switch-iktspmodel.txt'

In [5]:
# Read the tab-separated isoform switch table. The file is read without a
# header row, so pandas labels the columns with integers (0, 1, ...).
isoform_switches = pd.read_csv(
    isoform_switches_file,
    header=None,
    sep='\t',
)

In [6]:
# Preview the first rows of the raw table: one isoform entry per column (0 and 1)
isoform_switches.head()

Unnamed: 0,0,1
0,"FBLN2|2199,uc011avb.1","FBLN2|2199,uc011ava.1"
1,"CD44|960,uc001mvw.2","CD44|960,uc001mvx.2"
2,"SLC39A14|23516,uc003xbq.3","SLC39A14|23516,uc003xbp.3"
3,"UGP2|7360,uc002scl.2","UGP2|7360,uc002scm.2"
4,"IL1R2|7850,uc002tbn.2","IL1R2|7850,uc002tbm.2"


In [7]:
# Split the raw switch entries into tidy columns.
#
# The rows previewed from the file look like 'SYMBOL|<id>,<transcript>', so:
#   - Gene_symbol      = text before '|' in column 0
#   - Transcript_Id_1  = text after the first ',' in column 0
#   - Transcript_Id_2  = text after the first ',' in column 1
#
# The vectorised .str accessors replace row-wise apply/lambda, and the raw
# columns 0 and 1 are dropped in the same chain instead of with inplace=True.
# Because the raw columns are removed, re-run the load cell before re-running
# this one.
isoform_switches = (
    isoform_switches
    .assign(
        Gene_symbol=isoform_switches[0].str.split('|').str[0],
        Transcript_Id_1=isoform_switches[0].str.split(',').str[1],
        Transcript_Id_2=isoform_switches[1].str.split(',').str[1],
    )
    .drop(columns=[0, 1])
)

# .str returns NaN for entries it cannot split, so fail loudly rather than
# carrying missing gene symbols / transcript IDs into the later filters.
assert (
    isoform_switches[['Gene_symbol', 'Transcript_Id_1', 'Transcript_Id_2']]
    .notna()
    .all()
    .all()
), 'Could not parse gene symbol / transcript IDs from every isoform switch row'

In [8]:
# Preview the parsed table: Gene_symbol plus the two transcript IDs of each switch
isoform_switches.head()

Unnamed: 0,Gene_symbol,Transcript_Id_1,Transcript_Id_2
0,FBLN2,uc011avb.1,uc011ava.1
1,CD44,uc001mvw.2,uc001mvx.2
2,SLC39A14,uc003xbq.3,uc003xbp.3
3,UGP2,uc002scl.2,uc002scm.2
4,IL1R2,uc002tbn.2,uc002tbm.2


In [9]:
# Number of distinct gene symbols among the isoform switches
# (a gene is counted once even if it appears in several rows)

isoform_switches['Gene_symbol'].nunique()

13

In [None]:
#### Filter for these genes in the FPKM gene expression and MAF protein coding dataframes ####

In [10]:
# Read the four input tables; in each, the first CSV column becomes the row index.

# Protein-coding MAF tables (tumour / normal). low_memory=False turns off
# pandas' chunked parsing of the file.
maf_read_options = dict(index_col=0, low_memory=False)
MAF_VEP_protein_tumour = pd.read_csv(MAF_VEP_protein_tumour_file, **maf_read_options)
MAF_VEP_protein_normal = pd.read_csv(MAF_VEP_protein_normal_file, **maf_read_options)

# FPKM gene expression tables (tumour / normal)
fpkm_read_options = dict(index_col=0)
FPKM_tumour = pd.read_csv(FPKM_tumour_file, **fpkm_read_options)
FPKM_normal = pd.read_csv(FPKM_normal_file, **fpkm_read_options)


In [None]:
#First filter the FPKM tumour and then normal dataframes

In [11]:
# Gene symbols named in the isoform switches that do not occur in the tumour
# FPKM table. An empty set means every gene matched; anything listed may be
# recorded under a different gene label and must then be added manually to the
# FPKM filter list.

set(isoform_switches['Gene_symbol']).difference(FPKM_tumour['Gene_symbol'])

set()

In [12]:
# Gene symbols to keep in the FPKM tables. Any alternative gene names found by
# the check above should be added to this list.

isoform_genes_FPKM = isoform_switches['Gene_symbol'].tolist()

In [13]:
# Keep only the tumour FPKM rows whose gene symbol is an isoform switch gene

fpkm_tumour_is_iso_gene = FPKM_tumour['Gene_symbol'].isin(isoform_genes_FPKM)
FPKM_tumour_iso = FPKM_tumour.loc[fpkm_tumour_is_iso_gene]

In [14]:
# Show the filtered tumour expression table (one row per isoform switch gene found)
FPKM_tumour_iso

Unnamed: 0,Gene_ensembl_id,Gene_symbol,TCGA-AA-3867,TCGA-CA-6719,TCGA-NH-A50V,TCGA-AA-A01C,TCGA-AA-A02F,TCGA-AA-A03F,TCGA-AZ-4615,TCGA-AA-3854,...,TCGA-A6-6138,TCGA-A6-2685,TCGA-QG-A5YX,TCGA-AY-A54L,TCGA-A6-2671,TCGA-NH-A8F8,TCGA-CM-5863,TCGA-A6-3809.2,TCGA-D5-6932,TCGA-AA-3675
456,ENSG00000026508,CD44,49.437336,26.181544,43.585652,25.303669,57.48887,22.468272,46.933128,56.34729,...,34.361535,32.09638,29.925135,32.40977,31.520747,50.971634,42.929245,59.984435,50.069357,59.563262
1287,ENSG00000075142,SRI,44.841315,38.171138,24.658549,43.792638,27.200675,15.08467,17.863711,114.546805,...,37.183063,47.035315,48.634665,60.581531,28.211444,23.956176,56.181913,25.066261,22.73908,36.772185
1667,ENSG00000085733,CTTN,43.419941,31.008285,49.971597,43.526294,23.395513,70.448546,30.153891,44.858901,...,29.078803,38.652049,43.775762,32.808479,32.837844,42.640666,32.527568,39.002346,47.686444,21.4558
2940,ENSG00000104635,SLC39A14,22.918339,16.831717,12.825033,20.000358,20.654395,12.901203,59.061575,23.314894,...,48.667234,22.855138,34.12563,22.872968,7.4961,21.73231,32.023106,25.83631,18.235208,27.093669
4452,ENSG00000115590,IL1R2,1.72977,1.097672,0.570912,0.479973,0.822586,2.869665,4.744338,31.282369,...,2.068797,2.393464,14.959713,0.994161,1.415372,0.253236,8.802575,2.729988,1.729465,1.220621
6093,ENSG00000129009,ISLR,69.811483,42.51688,51.739143,58.31348,10.410286,8.693111,27.509687,5.29922,...,26.867733,71.74356,7.693835,1.601303,49.215278,37.554881,21.229201,2.30874,30.980162,16.907314
6396,ENSG00000131089,ARHGEF9,2.11626,2.724855,2.982264,2.773046,2.26129,2.707335,2.139916,2.228208,...,2.66584,3.260938,3.532286,3.843074,2.326598,2.873641,2.192778,1.684769,2.963232,2.286335
6750,ENSG00000133392,MYH11,24.526326,60.857761,5.664525,15.126343,6.82404,3.769247,2.303767,1.568848,...,12.369897,138.054493,1.350113,0.849897,25.909564,52.773071,19.038967,0.888296,18.184051,1.436682
8483,ENSG00000143882,ATP6V1C2,2.374979,1.542474,1.674405,4.851438,2.605318,0.851872,0.524931,0.643164,...,1.414512,5.900615,6.585198,1.08512,1.658985,4.245257,1.313158,0.930872,2.093615,1.81775
9993,ENSG00000156510,HKDC1,12.752686,18.5329,4.204533,20.432585,19.322412,31.742125,15.641048,11.103851,...,32.782406,23.694282,11.978535,27.744009,6.391835,41.823995,16.527893,14.898626,18.774438,12.815931


In [15]:
# Apply the same isoform switch gene filter to the normal FPKM table

fpkm_normal_is_iso_gene = FPKM_normal['Gene_symbol'].isin(isoform_genes_FPKM)
FPKM_normal_iso = FPKM_normal.loc[fpkm_normal_is_iso_gene]

In [67]:
#Now filter for these isoform genes in the MAF, using the Hugo_Symbol col in the tumour and normal DFs

In [16]:
# Gene symbols named in the isoform switches that do not occur in the tumour
# MAF table (Hugo_Symbol column). An empty set means every gene matched;
# anything listed may be recorded under a different gene label and must then be
# added manually to the MAF filter list.

set(isoform_switches['Gene_symbol']).difference(MAF_VEP_protein_tumour['Hugo_Symbol'])

set()

In [17]:
# Check for the presence of an alternative gene symbol in the tumour MAF.
# NOTE(review): the empty string looks like a placeholder for the symbol to
# search for - confirm and fill in when a gene is missing from the check above.

alternative_symbol = ''
MAF_VEP_protein_tumour.loc[MAF_VEP_protein_tumour['Hugo_Symbol'] == alternative_symbol]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,FILTER,CONTEXT,src_vcf_id,tumor_bam_uuid,normal_bam_uuid,case_id,GDC_FILTER,COSMIC,MC3_Overlap,GDC_Validation_Status


In [18]:
# Gene symbols to keep in the MAF tables. Any alternative gene names found by
# the checks above should be added to this list.

isoform_genes_MAF = isoform_switches['Gene_symbol'].tolist()

In [19]:
# Keep only the tumour MAF rows whose Hugo_Symbol is an isoform switch gene

maf_tumour_is_iso_gene = MAF_VEP_protein_tumour['Hugo_Symbol'].isin(isoform_genes_MAF)
MAF_tumour_iso = MAF_VEP_protein_tumour.loc[maf_tumour_is_iso_gene]

In [None]:
# MAF normal filter for isoform switch genes

In [20]:
# Keep only the normal MAF rows whose Hugo_Symbol is an isoform switch gene
maf_normal_is_iso_gene = MAF_VEP_protein_normal['Hugo_Symbol'].isin(isoform_genes_MAF)
MAF_normal_iso = MAF_VEP_protein_normal.loc[maf_normal_is_iso_gene]

In [21]:
# Number of tumour MAF rows per isoform switch gene, most frequent first
MAF_tumour_iso['Hugo_Symbol'].value_counts()

FLNA        36
MYH11       27
HKDC1       17
FBLN2       13
UGP2         9
IL1R2        7
ARHGEF9      7
ISLR         7
ATP6V1C2     4
SLC39A14     4
CTTN         2
SRI          2
CD44         2
Name: Hugo_Symbol, dtype: int64

In [22]:
# Number of normal MAF rows per isoform switch gene, most frequent first
MAF_normal_iso['Hugo_Symbol'].value_counts()

FLNA        3
MYH11       2
IL1R2       1
FBLN2       1
SLC39A14    1
HKDC1       1
Name: Hugo_Symbol, dtype: int64

In [101]:
# Count tumour MAF rows per (Hugo_Symbol, Transcript_ID) pair.
# Each gene is associated with only one Transcript_ID in the MAF, so the MAF
# can't be filtered on the transcript IDs supplied with the isoform switches.
# The two ID styles also differ: the MAF holds ENST... IDs, while the switch
# file holds uc... IDs.

(
    MAF_tumour_iso
    .groupby('Hugo_Symbol')['Transcript_ID']
    .value_counts()
)

Hugo_Symbol  Transcript_ID  
ABLIM3       ENST00000309868     4
ADAMTS12     ENST00000504830     8
ATXN3        ENST00000532032     2
BMP1         ENST00000306385     3
CASC4        ENST00000299957     4
CCND3        ENST00000372991     1
CEP104       ENST00000378230     3
DST          ENST00000312431    32
DUT          ENST00000331200     1
EHBP1        ENST00000263991     8
FBLN2        ENST00000295760     2
GSPT1        ENST00000420576     5
INHBA        ENST00000242208     4
INSR         ENST00000302850    10
ISLR         ENST00000249842     4
LRRFIP2      ENST00000336686     2
LSP1         ENST00000311604     3
MCF2L        ENST00000375608     7
MPRIP        ENST00000341712     3
MYL6         ENST00000548293     2
PCGF5        ENST00000336126     2
PPP2R1B      ENST00000311129     1
PRMT2        ENST00000355680     2
REPS1        ENST00000450536     3
SERPINA1     ENST00000355814     6
SLK          ENST00000369755     5
SNRNP70      ENST00000598441     2
TANK         ENST000002590