In [1]:
import pandas as pd
import seaborn as sns 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Input files. These are local paths on the author's machine: point PROJECT_DIR at the
#folder that holds the corresponding files uploaded to Git and the three paths below
#follow automatically (previously the same root had to be edited in three places).
PROJECT_DIR = '/Users/jake/OneDrive - University of Glasgow/Project - Practice'

#MAF file
TCGA_MAF = PROJECT_DIR + '/COAD/70cb1255-ec99-4c08-b482-415f8375be3f/TCGA.COAD.muse.70cb1255-ec99-4c08-b482-415f8375be3f.DR-10.0.somatic.maf'

#FPKM files - FPKM values for COAD from TCGA, acquired through TCGAbiolinks in R (see R code and document)
FPKM_DIR = PROJECT_DIR + '/TCGA_R_FPKM'
tumour_FPKM_file = FPKM_DIR + '/FPKM_COAD_tumour'
normal_FPKM_file = FPKM_DIR + '/FPKM_COAD_normal'

In [3]:
#Read the TCGA somatic-mutation MAF for COAD (tab-separated text).
#comment='#' makes the parser ignore everything from a '#' to the end of the line, which
#removes the '#'-prefixed header rows; low_memory=False avoids the dtype error.
#NOTE(review): a '#' inside a data field would also truncate that row - confirm none exist.
MAF = pd.read_csv(
    TCGA_MAF,
    sep='\t',
    comment='#',
    low_memory=False,
)

In [4]:
#Preview the first rows of the loaded MAF table
MAF.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,FILTER,CONTEXT,src_vcf_id,tumor_bam_uuid,normal_bam_uuid,case_id,GDC_FILTER,COSMIC,MC3_Overlap,GDC_Validation_Status
0,ATAD3B,83858,BCM,GRCh38,chr1,1485803,1485803,+,Nonsense_Mutation,SNP,...,PASS,TCAGTCGACCC,9130f121-b7ce-460f-b90c-8e31add6cd10,7de9d9e2-c4a4-4311-826f-973cb7987c66,861ad835-790a-47ad-8c03-06f7eb7b5710,7a70f061-9a6f-408e-a416-7f5295ceba3b,,COSM1333470,True,Unknown
1,PLCH2,9651,BCM,GRCh38,chr1,2487195,2487195,+,Silent,SNP,...,PASS,AGGAGCCCTGC,9130f121-b7ce-460f-b90c-8e31add6cd10,7de9d9e2-c4a4-4311-826f-973cb7987c66,861ad835-790a-47ad-8c03-06f7eb7b5710,7a70f061-9a6f-408e-a416-7f5295ceba3b,,COSM1340725;COSM1340726,True,Unknown
2,CHD5,26038,BCM,GRCh38,chr1,6146395,6146395,+,Missense_Mutation,SNP,...,PASS,AGTTGCGATAC,9130f121-b7ce-460f-b90c-8e31add6cd10,7de9d9e2-c4a4-4311-826f-973cb7987c66,861ad835-790a-47ad-8c03-06f7eb7b5710,7a70f061-9a6f-408e-a416-7f5295ceba3b,,COSM911251,True,Unknown
3,IFNLR1,163702,BCM,GRCh38,chr1,24159060,24159060,+,Missense_Mutation,SNP,...,PASS,GGCCCGTGGCA,9130f121-b7ce-460f-b90c-8e31add6cd10,7de9d9e2-c4a4-4311-826f-973cb7987c66,861ad835-790a-47ad-8c03-06f7eb7b5710,7a70f061-9a6f-408e-a416-7f5295ceba3b,,COSM1340840,True,Unknown
4,YTHDF2,51441,BCM,GRCh38,chr1,28769136,28769136,+,3'UTR,SNP,...,PASS,AAAAAAAGAAA,9130f121-b7ce-460f-b90c-8e31add6cd10,7de9d9e2-c4a4-4311-826f-973cb7987c66,861ad835-790a-47ad-8c03-06f7eb7b5710,7a70f061-9a6f-408e-a416-7f5295ceba3b,,,True,Unknown


In [7]:
#The IMPACT column holds the VEP impact rating; HIGH and MODERATE mark variants that
#affect the protein. Keep only those rows in a new MAF dataframe.

protein_affecting_impacts = ['HIGH', 'MODERATE']
MAF_VEP_protein = MAF[MAF['IMPACT'].isin(protein_affecting_impacts)]

In [8]:
#The dbSNP_RS column holds the rs-IDs from the dbSNP database: "novel" if the variant is not
#found in any database used, or null if there is no dbSNP record but it is found in other databases.
#Count how often each value occurs among the protein-affecting variants.

MAF_VEP_protein['dbSNP_RS'].value_counts()

novel          37143
rs121913529       84
rs113488022       44
rs112445441       30
rs104886003       29
               ...  
rs199510940        1
rs146229969        1
rs746300859        1
rs764418026        1
rs778832697        1
Name: dbSNP_RS, Length: 32918, dtype: int64

In [9]:
#The 'Tumor_Sample_Barcode' column holds the aliquot barcode of the tumour sample.
#Count the protein-affecting variants recorded per tumour aliquot.

MAF_VEP_protein['Tumor_Sample_Barcode'].value_counts()

TCGA-AA-A010-01A-01D-A17O-10    7267
TCGA-CA-6717-01A-11D-1835-10    6899
TCGA-AZ-4315-01A-01D-1408-10    5841
TCGA-AA-3984-01A-02D-1981-10    4406
TCGA-AA-A00N-01A-02D-A17O-10    4181
                                ... 
TCGA-F4-6704-01A-11D-1835-10      32
TCGA-CM-5863-01A-21D-1835-10      32
TCGA-A6-5664-01A-21D-1835-10      29
TCGA-CA-5255-01A-11D-1835-10      24
TCGA-AZ-4323-01A-21D-1835-10      20
Name: Tumor_Sample_Barcode, Length: 399, dtype: int64

In [10]:
#The 'Matched_Norm_Sample_Barcode' column holds the aliquot barcode of the matched normal sample.
#Count the protein-affecting variants recorded per matched-normal aliquot.

MAF_VEP_protein['Matched_Norm_Sample_Barcode'].value_counts()

TCGA-AA-A010-10A-01D-A17O-10    7267
TCGA-CA-6717-10A-01D-1835-10    6899
TCGA-AZ-4315-10A-01D-1408-10    5841
TCGA-AA-3984-10A-01D-1982-10    4406
TCGA-AA-A00N-10A-01D-A17O-10    4181
                                ... 
TCGA-AA-3972-10A-01W-0999-10      32
TCGA-CM-5863-10A-01D-1835-10      32
TCGA-A6-5664-10A-01D-1835-10      29
TCGA-CA-5255-10A-01D-1835-10      24
TCGA-AZ-4323-10A-01D-1835-10      20
Name: Matched_Norm_Sample_Barcode, Length: 399, dtype: int64

In [11]:
#Show the tumour barcode next to its matched-normal barcode: each tumour sample
#seems to be paired with a matched normal sample

barcode_columns = ['Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode']
MAF_VEP_protein[barcode_columns].head()

Unnamed: 0,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode
0,TCGA-CK-6751-01A-11D-1835-10,TCGA-CK-6751-10A-01D-1835-10
2,TCGA-CK-6751-01A-11D-1835-10,TCGA-CK-6751-10A-01D-1835-10
3,TCGA-CK-6751-01A-11D-1835-10,TCGA-CK-6751-10A-01D-1835-10
8,TCGA-CK-6751-01A-11D-1835-10,TCGA-CK-6751-10A-01D-1835-10
10,TCGA-CK-6751-01A-11D-1835-10,TCGA-CK-6751-10A-01D-1835-10


In [12]:
#Read the tumour and normal FPKM tables (comma-separated, pandas defaults)

tumour_FPKM, normal_FPKM = (
    pd.read_csv(fpkm_path) for fpkm_path in (tumour_FPKM_file, normal_FPKM_file)
)

In [13]:
#Preview the tumour FPKM table as loaded
tumour_FPKM.head()

Unnamed: 0.1,Unnamed: 0,Gene_symbol,TCGA-AA-3867-01A-01R-1022-07,TCGA-CA-6719-01A-11R-1839-07,TCGA-NH-A50V-01A-11R-A28H-07,TCGA-AA-A01C-01A-01R-A00A-07,TCGA-AA-A02F-01A-01R-A089-07,TCGA-AA-A03F-01A-11R-A16W-07,TCGA-AA-A00O-01A-02R-A089-07,TCGA-AZ-4615-01A-01R-1410-07,...,TCGA-A6-6138-01A-11R-1774-07,TCGA-A6-2685-01A-01R-1410-07,TCGA-QG-A5YX-01A-11R-A28H-07,TCGA-AY-A54L-01A-11R-A28H-07,TCGA-A6-2671-01A-01R-1410-07,TCGA-NH-A8F8-01A-72R-A41B-07,TCGA-CM-5863-01A-21R-1839-07,TCGA-A6-3809-01A-01R-A278-07,TCGA-D5-6932-01A-11R-1928-07,TCGA-AA-3675-01A-02R-0905-07
0,ENSG00000000003,TSPAN6,60.706756,62.310848,28.054054,43.148277,49.288566,6.029851,15.781543,20.08427,...,43.579197,31.906153,47.32587,74.750409,22.444239,27.192429,33.208506,18.076069,29.713629,25.791329
1,ENSG00000000005,TNMD,0.065378,0.768211,0.087677,0.488598,0.058151,1.005012,0.0,0.024084,...,0.69925,2.392969,0.60265,0.452964,0.265327,0.273898,5.345912,0.013534,0.199307,0.0
2,ENSG00000000419,DPM1,53.021975,49.744823,39.211929,48.717118,47.276633,6.233657,59.815651,30.839994,...,54.079416,23.685697,39.968203,85.633379,43.145615,81.92411,20.560575,53.239013,52.921301,37.169671
3,ENSG00000000457,SCYL3,1.850407,1.996215,2.578204,1.157164,2.659188,1.140148,2.379475,1.706928,...,2.972966,1.609248,2.675526,1.867057,1.201758,2.562697,2.301442,2.823902,1.946377,4.566399
4,ENSG00000000460,C1orf112,1.234814,1.937095,0.980064,1.235927,4.581492,0.637249,0.782066,1.124192,...,2.282533,0.85844,1.89923,2.791183,2.154205,1.504133,0.87232,3.005428,1.630099,3.124514


In [14]:
#The first column is read in as 'Unnamed: 0'; give it a meaningful name in both
#FPKM dataframes (renamed in place)

for fpkm_frame in (tumour_FPKM, normal_FPKM):
    fpkm_frame.rename(columns={'Unnamed: 0': 'Gene_ensembl_id'}, inplace=True)

In [15]:
#Confirm the first column is now labelled Gene_ensembl_id
tumour_FPKM.head()

Unnamed: 0,Gene_ensembl_id,Gene_symbol,TCGA-AA-3867-01A-01R-1022-07,TCGA-CA-6719-01A-11R-1839-07,TCGA-NH-A50V-01A-11R-A28H-07,TCGA-AA-A01C-01A-01R-A00A-07,TCGA-AA-A02F-01A-01R-A089-07,TCGA-AA-A03F-01A-11R-A16W-07,TCGA-AA-A00O-01A-02R-A089-07,TCGA-AZ-4615-01A-01R-1410-07,...,TCGA-A6-6138-01A-11R-1774-07,TCGA-A6-2685-01A-01R-1410-07,TCGA-QG-A5YX-01A-11R-A28H-07,TCGA-AY-A54L-01A-11R-A28H-07,TCGA-A6-2671-01A-01R-1410-07,TCGA-NH-A8F8-01A-72R-A41B-07,TCGA-CM-5863-01A-21R-1839-07,TCGA-A6-3809-01A-01R-A278-07,TCGA-D5-6932-01A-11R-1928-07,TCGA-AA-3675-01A-02R-0905-07
0,ENSG00000000003,TSPAN6,60.706756,62.310848,28.054054,43.148277,49.288566,6.029851,15.781543,20.08427,...,43.579197,31.906153,47.32587,74.750409,22.444239,27.192429,33.208506,18.076069,29.713629,25.791329
1,ENSG00000000005,TNMD,0.065378,0.768211,0.087677,0.488598,0.058151,1.005012,0.0,0.024084,...,0.69925,2.392969,0.60265,0.452964,0.265327,0.273898,5.345912,0.013534,0.199307,0.0
2,ENSG00000000419,DPM1,53.021975,49.744823,39.211929,48.717118,47.276633,6.233657,59.815651,30.839994,...,54.079416,23.685697,39.968203,85.633379,43.145615,81.92411,20.560575,53.239013,52.921301,37.169671
3,ENSG00000000457,SCYL3,1.850407,1.996215,2.578204,1.157164,2.659188,1.140148,2.379475,1.706928,...,2.972966,1.609248,2.675526,1.867057,1.201758,2.562697,2.301442,2.823902,1.946377,4.566399
4,ENSG00000000460,C1orf112,1.234814,1.937095,0.980064,1.235927,4.581492,0.637249,0.782066,1.124192,...,2.282533,0.85844,1.89923,2.791183,2.154205,1.504133,0.87232,3.005428,1.630099,3.124514


In [16]:
#All FPKM samples seem to come from centre 07, whereas the MAF samples come from centre 10,
#so the full aliquot barcodes cannot be compared directly.
# => to see whether FPKM and MAF samples match, keep only the first three '-'-separated
#    fields of each barcode (up to and including the participant id, e.g. TCGA-AA-A010)

#First for tumour samples. Work on a copy so MAF_VEP_protein keeps the full barcodes.
MAF_VEP_protein_tumour = MAF_VEP_protein.copy()
#Vectorised .str methods give the same values as a per-row lambda, but a missing barcode
#is propagated as NaN instead of raising AttributeError on a float.
MAF_VEP_protein_tumour['Tumor_Sample_Barcode'] = (
    MAF_VEP_protein_tumour['Tumor_Sample_Barcode']
    .str.split('-')
    .str[:3]
    .str.join('-')
)

In [17]:
#Confirm the tumour barcodes are now shortened to three fields (variant count per id)
MAF_VEP_protein_tumour['Tumor_Sample_Barcode'].value_counts()

TCGA-AA-A010    7267
TCGA-CA-6717    6899
TCGA-AZ-4315    5841
TCGA-AA-3984    4406
TCGA-AA-A00N    4181
                ... 
TCGA-CM-5863      32
TCGA-F4-6704      32
TCGA-A6-5664      29
TCGA-CA-5255      24
TCGA-AZ-4323      20
Name: Tumor_Sample_Barcode, Length: 399, dtype: int64

In [18]:
#Now the FPKM tumour table: take an independent (deep) copy so the loaded
#tumour_FPKM dataframe keeps its full-length barcodes as column labels

tumour_FPKM_sample = tumour_FPKM.copy(deep=True)

In [19]:
#Shorten every column label to its first three '-'-separated fields (up to the participant id).
#Labels without a '-' (Gene_ensembl_id, Gene_symbol) come out unchanged.
#NOTE(review): two aliquots from the same participant would end up with the same
#column label after this step - confirm there are no duplicates.

tumour_FPKM_sample.columns = [
    '-'.join(label.split('-')[:3]) for label in tumour_FPKM_sample.columns
]

In [20]:
#Confirm the sample columns now carry the shortened ids
tumour_FPKM_sample.head()

Unnamed: 0,Gene_ensembl_id,Gene_symbol,TCGA-AA-3867,TCGA-CA-6719,TCGA-NH-A50V,TCGA-AA-A01C,TCGA-AA-A02F,TCGA-AA-A03F,TCGA-AA-A00O,TCGA-AZ-4615,...,TCGA-A6-6138,TCGA-A6-2685,TCGA-QG-A5YX,TCGA-AY-A54L,TCGA-A6-2671,TCGA-NH-A8F8,TCGA-CM-5863,TCGA-A6-3809,TCGA-D5-6932,TCGA-AA-3675
0,ENSG00000000003,TSPAN6,60.706756,62.310848,28.054054,43.148277,49.288566,6.029851,15.781543,20.08427,...,43.579197,31.906153,47.32587,74.750409,22.444239,27.192429,33.208506,18.076069,29.713629,25.791329
1,ENSG00000000005,TNMD,0.065378,0.768211,0.087677,0.488598,0.058151,1.005012,0.0,0.024084,...,0.69925,2.392969,0.60265,0.452964,0.265327,0.273898,5.345912,0.013534,0.199307,0.0
2,ENSG00000000419,DPM1,53.021975,49.744823,39.211929,48.717118,47.276633,6.233657,59.815651,30.839994,...,54.079416,23.685697,39.968203,85.633379,43.145615,81.92411,20.560575,53.239013,52.921301,37.169671
3,ENSG00000000457,SCYL3,1.850407,1.996215,2.578204,1.157164,2.659188,1.140148,2.379475,1.706928,...,2.972966,1.609248,2.675526,1.867057,1.201758,2.562697,2.301442,2.823902,1.946377,4.566399
4,ENSG00000000460,C1orf112,1.234814,1.937095,0.980064,1.235927,4.581492,0.637249,0.782066,1.124192,...,2.282533,0.85844,1.89923,2.791183,2.154205,1.504133,0.87232,3.005428,1.630099,3.124514


In [21]:
#Compare sample ids between the MAF and the tumour FPKM dataframe with a set difference
#(sets hold each id only once). The first two FPKM columns are the gene id/symbol columns,
#so only the labels from position 2 onwards are sample ids.
#The result is the sample ids present in the MAF but not found in the FPKM DF; it is sorted
#so the list (and the displayed output) is the same on every run, since the iteration
#order of a set of strings can change between kernel restarts.

non_matching_samples_tumour = sorted(
    set(MAF_VEP_protein_tumour['Tumor_Sample_Barcode']) - set(tumour_FPKM_sample.columns[2:])
)
non_matching_samples_tumour

['TCGA-AZ-4681', 'TCGA-AA-3695', 'TCGA-AA-3967']

In [22]:
#Remove the MAF rows whose tumour sample id has no FPKM column, using a boolean mask

is_unmatched_tumour = MAF_VEP_protein_tumour['Tumor_Sample_Barcode'].isin(non_matching_samples_tumour)
MAF_VEP_protein_tumour = MAF_VEP_protein_tumour[~is_unmatched_tumour]


In [23]:
#Sanity check: one of the dropped sample ids should no longer return any rows
MAF_VEP_protein_tumour[MAF_VEP_protein_tumour['Tumor_Sample_Barcode'] =='TCGA-AA-3695']

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,FILTER,CONTEXT,src_vcf_id,tumor_bam_uuid,normal_bam_uuid,case_id,GDC_FILTER,COSMIC,MC3_Overlap,GDC_Validation_Status


In [24]:
#Every remaining MAF tumour sample id should now have an FPKM column => expect 0

unmatched_tumour_ids = set(MAF_VEP_protein_tumour['Tumor_Sample_Barcode']).difference(tumour_FPKM_sample.columns)
len(unmatched_tumour_ids)

0

In [25]:
#Columns to keep in the tumour FPKM table: one per sample id still in the MAF,
#followed by the two gene annotation columns

final_cols_tumour = [
    *MAF_VEP_protein_tumour['Tumor_Sample_Barcode'].unique(),
    'Gene_ensembl_id',
    'Gene_symbol',
]

In [26]:
#Drop (in place) every FPKM column whose label is not in the final list

keep_tumour_col = tumour_FPKM_sample.columns.isin(final_cols_tumour)
tumour_FPKM_sample.drop(columns=tumour_FPKM_sample.columns[~keep_tumour_col], inplace=True)

In [27]:
#Preview the tumour FPKM table after dropping the unmatched sample columns
tumour_FPKM_sample.head()

Unnamed: 0,Gene_ensembl_id,Gene_symbol,TCGA-AA-3867,TCGA-CA-6719,TCGA-NH-A50V,TCGA-AA-A01C,TCGA-AA-A02F,TCGA-AA-A03F,TCGA-AZ-4615,TCGA-AA-3854,...,TCGA-A6-6138,TCGA-A6-2685,TCGA-QG-A5YX,TCGA-AY-A54L,TCGA-A6-2671,TCGA-NH-A8F8,TCGA-CM-5863,TCGA-A6-3809,TCGA-D5-6932,TCGA-AA-3675
0,ENSG00000000003,TSPAN6,60.706756,62.310848,28.054054,43.148277,49.288566,6.029851,20.08427,18.393394,...,43.579197,31.906153,47.32587,74.750409,22.444239,27.192429,33.208506,18.076069,29.713629,25.791329
1,ENSG00000000005,TNMD,0.065378,0.768211,0.087677,0.488598,0.058151,1.005012,0.024084,0.045789,...,0.69925,2.392969,0.60265,0.452964,0.265327,0.273898,5.345912,0.013534,0.199307,0.0
2,ENSG00000000419,DPM1,53.021975,49.744823,39.211929,48.717118,47.276633,6.233657,30.839994,21.438009,...,54.079416,23.685697,39.968203,85.633379,43.145615,81.92411,20.560575,53.239013,52.921301,37.169671
3,ENSG00000000457,SCYL3,1.850407,1.996215,2.578204,1.157164,2.659188,1.140148,1.706928,2.956078,...,2.972966,1.609248,2.675526,1.867057,1.201758,2.562697,2.301442,2.823902,1.946377,4.566399
4,ENSG00000000460,C1orf112,1.234814,1.937095,0.980064,1.235927,4.581492,0.637249,1.124192,1.976735,...,2.282533,0.85844,1.89923,2.791183,2.154205,1.504133,0.87232,3.005428,1.630099,3.124514


In [28]:
#Check the FPKM columns against the MAF sample ids => the only labels left over
#should be the gene ensembl id and gene symbol columns

set(tumour_FPKM_sample.columns).difference(MAF_VEP_protein_tumour['Tumor_Sample_Barcode'])

{'Gene_ensembl_id', 'Gene_symbol'}

In [29]:
#Repeat the same process for the matched normal samples

In [30]:
#Normal FPKM: build a renamed copy whose column labels are shortened to their first
#three '-'-separated fields (up to the participant id); normal_FPKM itself is left untouched.
#NOTE(review): two aliquots from the same participant would end up with the same
#column label after this step - confirm there are no duplicates.

normal_FPKM_sample = normal_FPKM.rename(
    columns=lambda label: '-'.join(label.split('-')[:3])
)
normal_FPKM_sample.head()

Unnamed: 0,Gene_ensembl_id,Gene_symbol,TCGA-AA-3534,TCGA-AZ-6598,TCGA-A6-2678,TCGA-AA-3514,TCGA-A6-5667,TCGA-AZ-6603,TCGA-AA-3713,TCGA-AA-3655,...,TCGA-A6-5665,TCGA-A6-2684,TCGA-AZ-6599,TCGA-AA-3712,TCGA-AA-3697,TCGA-A6-2682,TCGA-AA-3522,TCGA-AA-3662,TCGA-AA-3520,TCGA-A6-2685
0,ENSG00000000003,TSPAN6,21.875041,25.857174,30.247957,35.900789,27.576637,18.098814,24.464751,36.113444,...,19.274104,23.841193,34.069906,22.731096,24.817074,26.814931,20.53871,43.168339,18.687427,23.1586
1,ENSG00000000005,TNMD,0.705639,0.735095,0.659471,0.956285,0.560075,0.467306,1.209251,0.415819,...,0.603394,0.576547,0.18354,0.341408,0.269357,0.712037,0.712653,0.52798,0.421246,0.902174
2,ENSG00000000419,DPM1,19.428661,20.021853,20.9532,22.699088,21.024849,21.407599,48.348166,31.3935,...,19.63069,19.945078,19.145061,22.067904,31.367703,22.735246,22.497485,26.527291,21.52756,21.845183
3,ENSG00000000457,SCYL3,2.815292,2.174283,2.885359,2.363508,2.719953,2.018772,3.232633,2.869292,...,2.702307,2.530088,2.10366,3.873146,2.265448,2.672642,2.923599,3.734924,2.676826,2.765696
4,ENSG00000000460,C1orf112,0.700505,0.886138,0.673826,1.144002,0.680031,0.791986,1.084764,0.777887,...,0.484447,0.514032,0.970638,0.72159,1.305029,0.723451,1.023061,0.630269,0.64407,1.159835


In [31]:
#Shorten the MAF 'Matched_Norm_Sample_Barcode' values to their first three '-'-separated
#fields (up to and including the participant id) so they can be compared with the
#normal FPKM column labels. Work on a copy so MAF_VEP_protein keeps the full barcodes.

MAF_VEP_protein_normal = MAF_VEP_protein.copy()
#Vectorised .str methods give the same values as a per-row lambda, but a missing barcode
#is propagated as NaN instead of raising AttributeError on a float.
MAF_VEP_protein_normal['Matched_Norm_Sample_Barcode'] = (
    MAF_VEP_protein_normal['Matched_Norm_Sample_Barcode']
    .str.split('-')
    .str[:3]
    .str.join('-')
)



In [32]:
#Confirm the matched-normal barcodes are now shortened to three fields (variant count per id)
MAF_VEP_protein_normal['Matched_Norm_Sample_Barcode'].value_counts()

TCGA-AA-A010    7267
TCGA-CA-6717    6899
TCGA-AZ-4315    5841
TCGA-AA-3984    4406
TCGA-AA-A00N    4181
                ... 
TCGA-CM-5863      32
TCGA-F4-6704      32
TCGA-A6-5664      29
TCGA-CA-5255      24
TCGA-AZ-4323      20
Name: Matched_Norm_Sample_Barcode, Length: 399, dtype: int64

In [33]:
#Matched-normal sample ids present in the MAF but without a column in the normal FPKM table.
#The first two FPKM columns are the gene id/symbol columns, so only the labels from
#position 2 onwards are sample ids. Sorted so the list is the same on every run, since
#the iteration order of a set of strings can change between kernel restarts.
non_matching_samples_normal = sorted(
    set(MAF_VEP_protein_normal['Matched_Norm_Sample_Barcode']) - set(normal_FPKM_sample.columns[2:])
)


In [34]:
#Remove the MAF rows whose matched-normal sample id has no FPKM column, using a boolean mask

has_normal_fpkm = ~MAF_VEP_protein_normal['Matched_Norm_Sample_Barcode'].isin(non_matching_samples_normal)
MAF_VEP_protein_normal = MAF_VEP_protein_normal.loc[has_normal_fpkm]


In [35]:
#Every remaining MAF matched-normal sample id should now have an FPKM column => expect 0

unmatched_normal_ids = set(MAF_VEP_protein_normal['Matched_Norm_Sample_Barcode']).difference(normal_FPKM_sample.columns)
len(unmatched_normal_ids)

0

In [36]:
#Columns to keep in the normal FPKM table: one per matched-normal sample id still
#in the MAF, followed by the two gene annotation columns

gene_annotation_cols = ['Gene_ensembl_id', 'Gene_symbol']
final_cols_normal = list(MAF_VEP_protein_normal['Matched_Norm_Sample_Barcode'].unique()) + gene_annotation_cols

In [37]:
#Drop (in place) every normal FPKM column whose label is not in the final list

normal_cols_to_remove = [
    label for label in normal_FPKM_sample.columns if label not in final_cols_normal
]
normal_FPKM_sample.drop(columns=normal_cols_to_remove, inplace=True)

In [38]:
#Preview the normal FPKM table after dropping the unmatched sample columns
normal_FPKM_sample.head()

Unnamed: 0,Gene_ensembl_id,Gene_symbol,TCGA-AZ-6598,TCGA-A6-5667,TCGA-AZ-6603,TCGA-AA-3713,TCGA-AA-3655,TCGA-AA-3489,TCGA-AA-3660,TCGA-AA-3663,...,TCGA-A6-2686,TCGA-F4-6704,TCGA-A6-5665,TCGA-A6-2684,TCGA-AZ-6599,TCGA-AA-3712,TCGA-AA-3697,TCGA-A6-2682,TCGA-AA-3662,TCGA-A6-2685
0,ENSG00000000003,TSPAN6,25.857174,27.576637,18.098814,24.464751,36.113444,35.956685,29.191332,18.200302,...,16.622676,21.215679,19.274104,23.841193,34.069906,22.731096,24.817074,26.814931,43.168339,23.1586
1,ENSG00000000005,TNMD,0.735095,0.560075,0.467306,1.209251,0.415819,0.477041,0.592175,0.727522,...,0.541646,0.913592,0.603394,0.576547,0.18354,0.341408,0.269357,0.712037,0.52798,0.902174
2,ENSG00000000419,DPM1,20.021853,21.024849,21.407599,48.348166,31.3935,39.133548,31.131119,21.327928,...,22.022668,28.874312,19.63069,19.945078,19.145061,22.067904,31.367703,22.735246,26.527291,21.845183
3,ENSG00000000457,SCYL3,2.174283,2.719953,2.018772,3.232633,2.869292,3.344046,3.27141,2.408914,...,2.700505,2.086699,2.702307,2.530088,2.10366,3.873146,2.265448,2.672642,3.734924,2.765696
4,ENSG00000000460,C1orf112,0.886138,0.680031,0.791986,1.084764,0.777887,0.94122,0.874086,0.811366,...,0.838983,1.328217,0.484447,0.514032,0.970638,0.72159,1.305029,0.723451,0.630269,1.159835


In [39]:
#Check the normal FPKM columns against the MAF matched-normal sample ids => the only
#labels left over should be the gene ensembl id and gene symbol columns

set(normal_FPKM_sample.columns).difference(MAF_VEP_protein_normal['Matched_Norm_Sample_Barcode'])

{'Gene_ensembl_id', 'Gene_symbol'}