In [39]:
import pandas as pd
import seaborn as sns 
import numpy as np
import glob
import matplotlib.pyplot as plt
%matplotlib inline
%config Completer.use_jedi = False

In [40]:
#Import eQTL 95 and FPKM files for required cancer type

#FPKM file is tab-separated (despite the .csv extension); the first two rows are
#read as a two-level column header and the first column as the row index
FPKM_file = '/Users/jake/OneDrive - University of Glasgow/Project/BRCA/GeneExpression_MAF/BRCA_FPKM_protein_all_matched.csv'
FPKM = pd.read_csv(FPKM_file,sep='\t',header=[0,1],index_col=[0])


#Add path to eQTL files
path = r'/Users/jake/OneDrive - University of Glasgow/Project/BRCA/Machine_learning_final/Pipeline/Data_parsing_4_eQTLs'

#glob returns files in arbitrary order; sort so every run reads them in the same order
all_files = sorted(glob.glob(path + "/*.csv"))

#Substring tags identifying the four required eQTL tables
eQTL_tags = ('cis_symbol_95','cis_entrez_95','trans_symbol_95','trans_entrez_95')

#Read each file whose name contains a tag (first matching tag wins per file;
#a later file with the same tag replaces an earlier one)
eQTL_tables = {}
for filename in all_files:
    for tag in eQTL_tags:
        if tag in filename:
            eQTL_tables[tag] = pd.read_csv(filename,sep='\t')
            break

#Stop here with a clear message instead of hitting a NameError in a later cell
missing_tags = [tag for tag in eQTL_tags if tag not in eQTL_tables]
if missing_tags:
    raise FileNotFoundError('No eQTL file found in ' + path + ' for: ' + ', '.join(missing_tags))

cis_symbol_95 = eQTL_tables['cis_symbol_95']
cis_entrez_95 = eQTL_tables['cis_entrez_95']
trans_symbol_95 = eQTL_tables['trans_symbol_95']
trans_entrez_95 = eQTL_tables['trans_entrez_95']

In [41]:
#Preview the first five rows of cis_symbol_95
cis_symbol_95.head()

Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,entrezgene_id,chromosome_name
1,CCT6P1,643253,1930,ENSG00000228409,,7.0
2,STAG3L4,64940,1805,ENSG00000106610,64940.0,7.0
3,STAG3L4,64940,1805,ENSG00000106610,101929736.0,7.0
4,ZNF117,51351,1668,ENSG00000152926,51351.0,7.0
5,ZNF117,51351,1668,ENSG00000152926,109504726.0,7.0


In [42]:
#Preview the first five rows of trans_symbol_95
trans_symbol_95.head()

Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,entrezgene_id,chromosome_name
1,FOLH1B,219595,6069,ENSG00000134612,219595.0,11.0
2,LOC647121,647121,1732,,,
3,BTN3A2,11118,634,ENSG00000186470,11118.0,6.0
4,RNF5P1,286140,592,ENSG00000253570,,8.0
5,ZNF322B,387328,582,,,


In [43]:
#Stack cis_entrez_95 on top of cis_symbol_95, discard rows with no Ensembl ID,
#then keep only the first row seen for each Ensembl ID

cis_eQTL_95 = (
    pd.concat([cis_entrez_95, cis_symbol_95])
    .dropna(subset=['ensembl_gene_id'])
    .drop_duplicates(subset=['ensembl_gene_id'], keep='first')
)

In [44]:
#Remove the hgnc_symbol and entrezgene_id columns
#=> keeping hg38 determined ensembl_id alongside original Gene_symbol (hg19)

cis_eQTL_95 = cis_eQTL_95.drop(columns=['hgnc_symbol', 'entrezgene_id'])

#Display only: the renumbered copy is not stored, cis_eQTL_95 keeps its current index
cis_eQTL_95.reset_index(drop=True)

Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,chromosome_name
0,STAG3L4,64940,1805,ENSG00000106610,7.0
1,ZNF117,51351,1668,ENSG00000152926,7.0
2,BTN3A2,11118,1377,ENSG00000186470,6.0
3,TYW1,55253,1273,ENSG00000198874,7.0
4,LOC84856,84856,1224,ENSG00000185904,10.0
...,...,...,...,...,...
567,PI4KAP2,375133,263,ENSG00000183506,22.0
568,SFTA3,253970,254,ENSG00000229415,14.0
569,SBDSP1,155370,214,ENSG00000225648,7.0
570,OR2A9P,441295,208,ENSG00000228960,7.0


In [45]:
#List rows whose Gene_symbol already appeared in an earlier row

cis_eQTL_95[cis_eQTL_95['Gene_symbol'].duplicated(keep='first')]

Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,chromosome_name
49,DDX11L2,84771,667,ENSG00000236397,2.0
352,SFTA3,253970,254,ENSG00000229415,14.0


In [46]:

#Show every row carrying the repeated symbol SFTA3
cis_eQTL_95[cis_eQTL_95['Gene_symbol'].eq('SFTA3')]

Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,chromosome_name
348,SFTA3,253970,254,ENSG00000257520,14.0
352,SFTA3,253970,254,ENSG00000229415,14.0


In [47]:
#Look up one of the two SFTA3 Ensembl IDs in the FPKM columns
#(selected on the 'Gene_ensembl_id' level of the column header)

FPKM.xs(key='ENSG00000257520', axis=1, level='Gene_ensembl_id')

Gene_symbol,SFTA3
TCGA-BH-A1FN,0.0
TCGA-E9-A1NG,0.0
TCGA-AC-A23H,0.0
TCGA-A7-A0DC,0.0
TCGA-BH-A0BA,0.0
...,...
TCGA-E9-A1N5,0.0
TCGA-BH-A1FG,0.0
TCGA-BH-A1EO,0.0
TCGA-BH-A0AY,0.0


In [48]:
#Drop any duplicate not matching FPKM
#  - ENSG00000229415: the SFTA3 Ensembl ID that was not the one looked up in FPKM
#  - DDX11L2: NOTE(review) this removes every row with that symbol, not only
#    the repeated one - confirm that is intended

cis_eQTL_95 = cis_eQTL_95.loc[
    lambda frame: frame['ensembl_gene_id'].ne('ENSG00000229415')
    & frame['Gene_symbol'].ne('DDX11L2')
]


In [58]:
#Reset index

#reset_index returns a new frame rather than modifying cis_eQTL_95, so assign it;
#otherwise the old, non-unique index left over from the concat is what gets
#carried forward (and written out by to_csv). Then display the result.
cis_eQTL_95 = cis_eQTL_95.reset_index(drop=True)
cis_eQTL_95


Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,chromosome_name
0,STAG3L4,64940,1805,ENSG00000106610,7.0
1,ZNF117,51351,1668,ENSG00000152926,7.0
2,BTN3A2,11118,1377,ENSG00000186470,6.0
3,TYW1,55253,1273,ENSG00000198874,7.0
4,LOC84856,84856,1224,ENSG00000185904,10.0
...,...,...,...,...,...
564,UPK3B,80761,306,ENSG00000243566,7.0
565,PI4KAP2,375133,263,ENSG00000183506,22.0
566,SBDSP1,155370,214,ENSG00000225648,7.0
567,OR2A9P,441295,208,ENSG00000228960,7.0


In [50]:
#Repeat process for trans eQTL

In [51]:
#Stack trans_entrez_95 on top of trans_symbol_95, discard rows with no Ensembl ID,
#then keep only the first row seen for each Ensembl ID

trans_eQTL_95 = (
    pd.concat([trans_entrez_95, trans_symbol_95])
    .dropna(subset=['ensembl_gene_id'])
    .drop_duplicates(subset=['ensembl_gene_id'], keep='first')
)

In [52]:
#Remove the hgnc_symbol and entrezgene_id columns
#=> keeping hg38 determined ensembl_id alongside original Gene_symbol (hg19)

trans_eQTL_95 = trans_eQTL_95.drop(columns=['hgnc_symbol', 'entrezgene_id'])

#Display only: the renumbered copy is not stored, trans_eQTL_95 keeps its current index
trans_eQTL_95.reset_index(drop=True)

Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,chromosome_name
0,FOLH1B,219595,6069,ENSG00000134612,11
1,LOC647121,647121,1732,ENSG00000231752,1
2,BTN3A2,11118,634,ENSG00000186470,6
3,TYW1B,441250,556,ENSG00000277149,7
4,ZFP57,346171,513,ENSG00000204644,6
...,...,...,...,...,...
290,RNF5P1,286140,592,ENSG00000253570,8
291,RPL23AP53,644128,127,ENSG00000223508,8
292,FKBP1AP1,2282,98,ENSG00000269304,19
293,CCT6P1,643253,59,ENSG00000228409,7


In [53]:
#List rows whose Gene_symbol already appeared in an earlier row

trans_eQTL_95[trans_eQTL_95['Gene_symbol'].duplicated(keep='first')]

Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,chromosome_name
35,PINX1,54984,167,ENSG00000258724,8


In [54]:

#Inspect the rows sharing the repeated Gene_symbol (uncomment to run)
#trans_eQTL_95[trans_eQTL_95['Gene_symbol'] == 'PINX1']

In [55]:
#Drop any duplicate not matching FPKM
#  - PINX1: NOTE(review) this removes every row with that symbol, not only
#    the repeated one - confirm that is intended
#  (no Ensembl-ID-level exclusion is applied for the trans table)

trans_eQTL_95 = trans_eQTL_95.loc[lambda frame: frame['Gene_symbol'].ne('PINX1')]

In [59]:
#Reset index

#reset_index returns a new frame rather than modifying trans_eQTL_95, so assign it;
#otherwise the old, non-unique index left over from the concat is what gets
#carried forward (and written out by to_csv). Then display the result.
trans_eQTL_95 = trans_eQTL_95.reset_index(drop=True)
trans_eQTL_95


Unnamed: 0,Gene_symbol,Entrez_ID,Value_count,ensembl_gene_id,chromosome_name
0,FOLH1B,219595,6069,ENSG00000134612,11
1,LOC647121,647121,1732,ENSG00000231752,1
2,BTN3A2,11118,634,ENSG00000186470,6
3,TYW1B,441250,556,ENSG00000277149,7
4,ZFP57,346171,513,ENSG00000204644,6
...,...,...,...,...,...
288,RNF5P1,286140,592,ENSG00000253570,8
289,RPL23AP53,644128,127,ENSG00000223508,8
290,FKBP1AP1,2282,98,ENSG00000269304,19
291,CCT6P1,643253,59,ENSG00000228409,7


In [57]:
#Write cis and trans dataframes to tab-separated files in the current working directory

#File-name prefix = sixth '/'-separated component of `path`
#('BRCA' for the path configured in this notebook)
cancer_type = path.split('/')[5]

for eQTL_kind, eQTL_frame in (('cis', cis_eQTL_95), ('trans', trans_eQTL_95)):
    eQTL_frame.to_csv(f'{cancer_type}_{eQTL_kind}_eQTL_95_ensembl.csv', sep='\t')