In [1]:
import pandas as pd
import seaborn as sns 
import numpy as np
import glob
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Import FPKM, MAF_protein and isoform switch files for required cancer type

#Cancer type and project root. Change these two values to rerun the notebook
#for another cancer type or from another location; every path below is built from them.
CANCER_TYPE = 'COAD'
PROJECT_DIR = '/Users/jake/OneDrive - University of Glasgow/Project'

#Filenames
#Kept as plain str (not pathlib.Path): a later cell applies str methods to MAF_file
#to build the output file name.
cancer_dir = PROJECT_DIR + '/' + CANCER_TYPE
MAF_file = cancer_dir + '/GeneExpression_MAF/' + CANCER_TYPE + '_MAF_protein_coding_full.csv'
FPKM_protein_file = cancer_dir + '/GeneExpression_MAF/' + CANCER_TYPE + '_FPKM_protein_all.csv'
isoform_switch_file = cancer_dir + '/isoform_switch/' + CANCER_TYPE + '_isoform_switches_ensembl.csv'

#Read in as dataframes
#MAF file: comma-separated, first column used as the row index
MAF_protein = pd.read_csv(MAF_file,low_memory=False,sep=',',index_col=0)
#FPKM file: tab-separated despite the .csv extension; the first two rows are read as
#column header levels (MultiIndex columns) and the first column as the row index
FPKM_protein = pd.read_csv(FPKM_protein_file,header=[0,1],low_memory=False,sep='\t',index_col=[0])
#Isoform switch file: tab-separated despite the .csv extension, default integer row index
isoform_switches = pd.read_csv(isoform_switch_file,sep='\t')


In [3]:
#Display the isoform switch dataframe read in above
isoform_switches

Unnamed: 0,Gene_symbol,ensembl_gene_id,entrezgene_id,chromosome_name
2,CTTN,ENSG00000085733,2017,11
3,FBLN2,ENSG00000163520,2199,3
4,FLNA,ENSG00000196924,2316,X
5,ARHGEF9,ENSG00000131089,23229,X
6,SLC39A14,ENSG00000104635,23516,8
7,ATP6V1C2,ENSG00000143882,245973,2
9,ISLR,ENSG00000129009,3671,15
11,MYH11,ENSG00000133392,4629,16
12,SRI,ENSG00000075142,6717,7
13,UGP2,ENSG00000169764,7360,2


In [4]:
#Display the first rows of the FPKM dataframe to check its column header levels and row index
FPKM_protein.head()

Gene_symbol,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1
Gene_ensembl_id,ENSG00000121410,ENSG00000148584,ENSG00000175899,ENSG00000166535,ENSG00000184389,ENSG00000128274,ENSG00000118017,ENSG00000094914,ENSG00000081760,ENSG00000114771,...,ENSG00000086827,ENSG00000174442,ENSG00000122952,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755
Sample_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
TCGA-AA-3867-01A-01R-1022-07,0.052551,2.4772,57.378073,0.02162,0.0,5.347004,0.0,10.124001,3.88819,0.031552,...,8.337702,3.266961,11.9512,0.84453,2.250194,4.091899,0.011212,3.216321,87.952703,2.923495
TCGA-CA-6719-01A-11R-1839-07,0.005613,1.927244,74.944713,0.012317,0.0,4.271353,0.025395,11.569886,1.87234,1.159435,...,10.87554,6.107697,11.915234,1.711313,4.772996,4.795855,0.009581,5.43401,72.500361,5.15808
TCGA-NH-A50V-01A-11R-A28H-07,0.035237,0.550182,51.276491,0.024852,0.0,5.097944,0.056933,10.462945,3.612377,0.193436,...,7.953033,1.813204,14.288954,1.255144,4.156986,5.492997,0.038664,3.39948,57.827137,3.430372
TCGA-AA-A01C-01A-01R-A00A-07,0.049092,1.914795,45.653386,0.013464,0.0,5.011742,0.0,15.683433,3.70862,0.117902,...,7.460225,3.650127,22.783866,1.123639,2.886182,3.341388,0.0,2.662764,116.008374,5.140081
TCGA-AA-A02F-01A-01R-A089-07,0.011685,1.145542,28.787736,0.00641,0.0,2.099586,0.0,15.396349,2.902797,0.028064,...,7.259496,11.352859,21.240253,1.650287,4.630294,6.636417,0.0,5.230433,68.590155,2.939338


In [5]:
#Sanity check: list isoform switch genes that have no matching column in FPKM_protein.
#An empty set means every gene was found.

#Column level 1 is compared against Ensembl IDs, column level 0 against gene symbols
fpkm_ensembl_ids = FPKM_protein.columns.levels[1]
fpkm_gene_symbols = FPKM_protein.columns.levels[0]

display(set(isoform_switches['ensembl_gene_id']).difference(fpkm_ensembl_ids))
display(set(isoform_switches['Gene_symbol']).difference(fpkm_gene_symbols))


set()

set()

In [None]:
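# Note that gene names / symbols can still differ between the two files
# => filter on Ensembl ID rather than on gene symbol.
# Drop any Ensembl IDs from the isoform switch dataframe that are not present in FPKM
# (the filter below is commented out; the check above returned empty sets for this dataset)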

#missing_genes = list(set(isoform_switches['ensembl_gene_id']) - set(FPKM_protein.columns.levels[1]))
#isoform_switches = isoform_switches.loc[~isoform_switches['ensembl_gene_id'].isin(missing_genes)]

In [8]:
#Columns to keep in FPKM: the Ensembl ID of every isoform switch gene,
#followed by 'Cancer', which is the target variable.
cols_to_keep = [*isoform_switches['ensembl_gene_id'], 'Cancer']

In [11]:
#Keep only the FPKM columns for the isoform switch genes (plus 'Cancer').
#The slicer leaves the first column level unrestricted and matches cols_to_keep
#against the second column level; all rows are kept.
FPKM_protein_isoform = FPKM_protein.loc[:, pd.IndexSlice[:, cols_to_keep]]

In [12]:
#Display the filtered FPKM dataframe to confirm only the selected columns remain
FPKM_protein_isoform

Gene_symbol,CTTN,FBLN2,FLNA,ARHGEF9,SLC39A14,ATP6V1C2,ISLR,MYH11,SRI,UGP2,IL1R2,HKDC1,CD44,Cancer
Gene_ensembl_id,ENSG00000085733,ENSG00000163520,ENSG00000196924,ENSG00000131089,ENSG00000104635,ENSG00000143882,ENSG00000129009,ENSG00000133392,ENSG00000075142,ENSG00000169764,ENSG00000115590,ENSG00000156510,ENSG00000026508,Cancer
Sample_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
TCGA-AA-3867-01A-01R-1022-07,43.419941,17.969730,98.506992,2.116260,22.918339,2.374979,69.811483,24.526326,44.841315,15.509306,1.729770,12.752686,49.437336,1
TCGA-CA-6719-01A-11R-1839-07,31.008285,5.075141,153.793369,2.724855,16.831717,1.542474,42.516880,60.857761,38.171138,13.800729,1.097672,18.532900,26.181544,1
TCGA-NH-A50V-01A-11R-A28H-07,49.971597,7.463640,48.447111,2.982264,12.825033,1.674405,51.739143,5.664525,24.658549,16.107688,0.570912,4.204533,43.585652,1
TCGA-AA-A01C-01A-01R-A00A-07,43.526294,11.790653,150.463581,2.773046,20.000358,4.851438,58.313480,15.126343,43.792638,11.142595,0.479973,20.432585,25.303669,1
TCGA-AA-A02F-01A-01R-A089-07,23.395513,2.266629,75.349062,2.261290,20.654395,2.605318,10.410286,6.824040,27.200675,13.780305,0.822586,19.322412,57.488870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-A6-2682-11A-01R-A32Z-07,36.195520,9.756399,40.254574,5.116727,27.502997,0.668204,7.510906,17.512665,116.629904,57.037639,10.666862,11.973689,12.996430,0
TCGA-AA-3522-11A-01R-A32Z-07,38.606857,16.125248,45.252554,5.323429,31.354408,0.477939,10.783217,24.119451,153.386318,74.743021,18.051094,9.957414,11.233170,0
TCGA-AA-3662-11A-01R-1723-07,31.916105,21.770961,92.316274,6.593554,28.567929,1.115913,16.008965,125.767640,69.142586,45.311255,49.908591,2.679323,15.903117,0
TCGA-AA-3520-11A-01R-A32Z-07,36.031825,20.685962,45.642212,4.815894,29.933660,0.531336,11.140504,25.517808,142.960271,70.222246,16.308686,14.142560,12.256530,0


In [10]:
#Save the isoform_switch filtered FPKM file to a csv
#The cancer type is taken from the MAF file name prefix ('<cancer type>_MAF_...')
#instead of a fixed position in the absolute path, so the output name no longer
#depends on where the project folder sits on disk.
cancer_type = MAF_file.rsplit('/', 1)[-1].split('_')[0]
#NOTE: the output is tab-separated even though its name ends in .csv (the FPKM input
#is read with sep='\t' as well), so it must be read back with sep='\t'.
#The file is written to the current working directory.
FPKM_protein_isoform.to_csv(cancer_type+'_FPKM_protein_isoform_filtered.csv',sep='\t')