The purpose of this script is to prepare the mutation datasets to be used for finding RiboSPLitches.

The two datasets are from HGMD and ClinVar.

The relevant columns we are looking for are chromosome, start and end coordinates, mutID, WT base, MUT base and clinical signficance.

In [1]:
# Import relevant packages
import pandas as pd

In [2]:
# Get a list which contains all chromosomes -> valid chromosome list
chrom = range(1,23)
chrom.extend(['X','Y'])
chrom = [str(i) for i in chrom]
print chrom

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']


Let's start by processing the mutations from the database HGMD. We only want single base mutations which are in the valid chromosomes

In [3]:
#Read all the mutations in HGMD into a dataframe
mutations = pd.read_csv("../processed_data/HGMD_PRO_2016.3_hg38.bed",sep="\t",header=None,low_memory=False)
print mutations.shape
mutations.head()

(169524, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,942142,942143,CM1511864,.,C,G,.,CLASS=DM?;MUT=ALT;GENE=SAMD11;STRAND=+;DNA=NM_...
1,1,963937,963938,CD142720,.,CCT,C,.,CLASS=DM?;MUT=ALT;GENE=KLHL17;STRAND=+;DNA=NM_...
2,1,1014142,1014143,CM1411641,.,C,T,.,CLASS=DM;MUT=ALT;GENE=ISG15;STRAND=+;DNA=NM_00...
3,1,1014315,1014316,CI128669,.,C,CG,.,CLASS=DM;MUT=ALT;GENE=ISG15;STRAND=+;DNA=NM_00...
4,1,1014358,1014359,CM128668,.,G,T,.,CLASS=DM;MUT=ALT;GENE=ISG15;STRAND=+;DNA=NM_00...


In [4]:
# Get only mutations that are single base mutations [no insertions, deletions]
mutations_single = mutations[(mutations[5].isin(['A','G','T','C','N']))&(mutations[6].isin(['A','G','T','C','N']))]
print mutations_single.shape

(127445, 9)


In [5]:
# Check that all chromosomes are valid
set(mutations_single[0].drop_duplicates().values)==set(chrom)

True

In [6]:
# Grab gene name and strand information from column 8 and put into two separate columns
# This will be written to a new file 
mutations_towrite_data = {"chrom":mutations_single[0],"start":mutations_single[1],"end":mutations_single[2],"mutID":mutations_single[3],"gene_name":mutations_single[8].str.split(pat=';',expand=True)[2].str.split(pat='=',expand=True)[1] ,"strand":mutations_single[8].str.split(pat=';',expand=True)[3].str.split(pat='=',expand=True)[1],"WTbase":mutations_single[5],"MUTbase":mutations_single[6],"ClinSig":mutations_single[8].str.split(pat=';',expand=True)[0].str.split(pat='=',expand=True)[1]}
#mutations_towrite_data = {"chrom":mutations_single[0],"start":mutations_single[1],"end":mutations_single[2],"mutID":mutations_single[3]}
mutations_towrite_df = pd.DataFrame(mutations_towrite_data,columns=["chrom","start","end","mutID","gene_name","strand","WTbase","MUTbase","ClinSig"])
mutations_towrite_df.head()

Unnamed: 0,chrom,start,end,mutID,gene_name,strand,WTbase,MUTbase,ClinSig
0,1,942142,942143,CM1511864,SAMD11,+,C,G,DM?
2,1,1014142,1014143,CM1411641,ISG15,+,C,T,DM
4,1,1014358,1014359,CM128668,ISG15,+,G,T,DM
5,1,1022224,1022225,CM148517,AGRN,+,G,A,DM
6,1,1022312,1022313,CM148518,AGRN,+,A,T,DM


In [7]:
mutations_towrite_df.to_csv("../processed_data/HGMD_PRO_2016_3_hg38_SingleBaseMutations.bed",sep="\t",header=False,index=False)

Next's let's grab the mutations from the database ClinVar. We only want single base mutations which are in the valid chromosomes.

In [8]:
# Read all mutations in ClinVar into a dataframe
clinvar_muts = pd.read_csv("../processed_data/clinvar.bed",header=None,sep="\t",low_memory=False)
print clinvar_muts.shape
clinvar_muts.head()

(346093, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,1014041,1014042,475283,.,G,A,.,"ALLELEID=446939;CLNDISDB=MedGen:C4015293,OMIM:..."
1,1,1014142,1014143,183381,.,C,T,.,"ALLELEID=181485;CLNDISDB=MedGen:C4015293,OMIM:..."
2,1,1014216,1014217,475278,.,C,T,.,"ALLELEID=446987;CLNDISDB=MedGen:C4015293,OMIM:..."
3,1,1014227,1014228,402986,.,G,A,.,ALLELEID=389314;CLNDISDB=MedGen:CN169374;CLNDN...
4,1,1014315,1014316,161455,.,C,CG,.,"ALLELEID=171289;CLNDISDB=MedGen:C4015293,OMIM:..."


In [9]:
# Only get mutations that are single base mutations [so no INDELs]
clinvar_muts_single = clinvar_muts[(clinvar_muts[5].isin(['A','G','T','C','N']))&(clinvar_muts[6].isin(['A','G','T','C','N']))]
print clinvar_muts_single.shape
clinvar_muts_single.head()

(305277, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,1014041,1014042,475283,.,G,A,.,"ALLELEID=446939;CLNDISDB=MedGen:C4015293,OMIM:..."
1,1,1014142,1014143,183381,.,C,T,.,"ALLELEID=181485;CLNDISDB=MedGen:C4015293,OMIM:..."
2,1,1014216,1014217,475278,.,C,T,.,"ALLELEID=446987;CLNDISDB=MedGen:C4015293,OMIM:..."
3,1,1014227,1014228,402986,.,G,A,.,ALLELEID=389314;CLNDISDB=MedGen:CN169374;CLNDN...
5,1,1014358,1014359,161454,.,G,T,.,AF_EXAC=0.00001;ALLELEID=171288;CLNDISDB=MedGe...


In [10]:
# Check if all rows contain a clinical significant value
clinvar_muts_single[clinvar_muts_single[8].str.contains("CLNSIG")].shape == clinvar_muts_single.shape

True

In [11]:
# We now need to only get mutations that have a clinical significance of pathogenic or likely pathogenic
# Need to do some analysis on what kind of clinical significances we observe in this dataset
splitUpData = clinvar_muts_single[8].str.split(pat=';',expand=True)
storeCLNSIG = []
rowsDontContainCLNSIG = []
rowsContainCLNSIGINCL = []
exceptionRows = []
for index,row in splitUpData.iterrows():
    try:
        y = [i for i in row if i != None and "CLNSIG=" in i]
        z = [i for i in row if i != None and "CLNSIG" in i and "CLNSIG=" not in i]
        if len(y)==1:
            storeCLNSIG.append(row)
        elif len(z)==1:
            rowsContainCLNSIGINCL.append(row)
        else:
            rowsDontContainCLNSIG.append(row)
    except:
        exceptionRows.append(row)

In [12]:
# Number of rows with no CLNSIG in them
print len(storeCLNSIG)
print len(rowsContainCLNSIGINCL)
print len(rowsDontContainCLNSIG)
print len(exceptionRows)

304857
420
0
0


It looks like all entries of ClinVar have a clinical significance associated with them. We just need to extract the clinicial significance from each entry. 

In [13]:
# This function will extract the clinical significance from a given row
# Clinical significance can either be in the form "CLNSIG=" or "CLNSIGINCL="
def getClinicalSigForRow(rowofinterest):
    try:
        y = [i for i in rowofinterest if i != None and "CLNSIG=" in i]
        z = [i for i in rowofinterest if i != None and "CLNSIG" in i and "CLNSIG=" not in i]
        if len(y)==1:
            return y[0].split('=')[1]
        elif len(z)==1:
            # Sometimes there are multiple codes associated with CLNSIGINCL, but on cursory glance they appear to be
            # the same signficance but with difference codes, so just extracting the first clin sig
            return z[0].split("=")[1].split("|")[0].split(':')[1]
        else:
            print "Error in getting Clinical Sig for Row in try clause"
    except:
        print "Error in getting Clinical Sig for Row"

In [14]:
# Split up the extra information column of the clin var data and then extract the clinical significance 
splitUpData = clinvar_muts_single[8].str.split(pat=';',expand=True)
clinSigs = splitUpData.apply(getClinicalSigForRow,axis=1)

In [15]:
# This is the total number of clinSigs extracted 
print len(clinSigs)

305277


In [16]:
# This will be written to a new file 
clinvar_muts_towrite_data = {"chrom":clinvar_muts_single[0],"start":clinvar_muts_single[1],"end":clinvar_muts_single[2],"mutID":clinvar_muts_single[3],"WTbase":clinvar_muts_single[5],"MUTbase":clinvar_muts_single[6],"ClinSig":clinSigs}
#clinvar_muts_towrite_data = {"chrom":clinvar_muts_single[0],"start":clinvar_muts_single[1],"end":clinvar_muts_single[2],"mutID":clinvar_muts_single[3]}
clinvar_muts_towrite_df = pd.DataFrame(clinvar_muts_towrite_data,columns=["chrom","start","end","mutID","WTbase","MUTbase","ClinSig"])
clinvar_muts_towrite_df.head()

Unnamed: 0,chrom,start,end,mutID,WTbase,MUTbase,ClinSig
0,1,1014041,1014042,475283,G,A,Benign
1,1,1014142,1014143,183381,C,T,Pathogenic
2,1,1014216,1014217,475278,C,T,Benign
3,1,1014227,1014228,402986,G,A,Benign
5,1,1014358,1014359,161454,G,T,Pathogenic


In [17]:
clinvar_muts_towrite_df.to_csv("../processed_data/ClinVar_SingleBase_Mutations.bed",sep="\t",header=False,index=False)

We want to combine the ClinVar data with the HGMD mutation data, but remove any replicates.

In [18]:
!intersectBed -a ../processed_data/HGMD_PRO_2016_3_hg38_SingleBaseMutations.bed -b ../processed_data/ClinVar_SingleBase_Mutations.bed -wa -wb > ../processed_data/HGMD_ClinVar_Intersect.txt
!intersectBed -a ../processed_data/HGMD_PRO_2016_3_hg38_SingleBaseMutations.bed -b ../processed_data/ClinVar_SingleBase_Mutations.bed -v > ../processed_data/HGMD_SingleBaseMutations_NotInClinVar.txt
!intersectBed -a ../processed_data/ClinVar_SingleBase_Mutations.bed -b ../processed_data/HGMD_PRO_2016_3_hg38_SingleBaseMutations.bed -v > ../processed_data/ClinVar_SingleBaseMutations_NotInHGMD.txt

In [19]:
# Read mutations that are in common between HGMD and ClinVar: they have the same location
muts_Both_HGMD_ClinVar = pd.read_csv("../processed_data/HGMD_ClinVar_Intersect.txt",header=None,sep="\t",low_memory=False)
print muts_Both_HGMD_ClinVar.shape
muts_Both_HGMD_ClinVar.head()

(63021, 16)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,1014142,1014143,CM1411641,ISG15,+,C,T,DM,1,1014142,1014143,183381,C,T,Pathogenic
1,1,1014358,1014359,CM128668,ISG15,+,G,T,DM,1,1014358,1014359,161454,G,T,Pathogenic
2,1,1022224,1022225,CM148517,AGRN,+,G,A,DM,1,1022224,1022225,243036,G,A,Pathogenic
3,1,1022312,1022313,CM148518,AGRN,+,A,T,DM,1,1022312,1022313,243037,A,T,Pathogenic
4,1,1041581,1041582,CM126385,AGRN,+,C,T,DM,1,1041581,1041582,126556,C,T,Pathogenic


In [20]:
# Grab the mutations that have the same base changes in both HGMD and ClinVar
muts_Both_HGMD_ClinVar_SameBase = muts_Both_HGMD_ClinVar[(muts_Both_HGMD_ClinVar[6]==muts_Both_HGMD_ClinVar[13])&(muts_Both_HGMD_ClinVar[7]==muts_Both_HGMD_ClinVar[14])]
print muts_Both_HGMD_ClinVar_SameBase.shape
muts_Both_HGMD_ClinVar_SameBase.head()

(48273, 16)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,1014142,1014143,CM1411641,ISG15,+,C,T,DM,1,1014142,1014143,183381,C,T,Pathogenic
1,1,1014358,1014359,CM128668,ISG15,+,G,T,DM,1,1014358,1014359,161454,G,T,Pathogenic
2,1,1022224,1022225,CM148517,AGRN,+,G,A,DM,1,1022224,1022225,243036,G,A,Pathogenic
3,1,1022312,1022313,CM148518,AGRN,+,A,T,DM,1,1022312,1022313,243037,A,T,Pathogenic
4,1,1041581,1041582,CM126385,AGRN,+,C,T,DM,1,1041581,1041582,126556,C,T,Pathogenic


In [21]:
# Grab the mutations that have do not have the same base changes in both HGMD and ClinVar
muts_Both_HGMD_ClinVar_DiffBase_SameLoc = muts_Both_HGMD_ClinVar[(muts_Both_HGMD_ClinVar[6]!=muts_Both_HGMD_ClinVar[13])|(muts_Both_HGMD_ClinVar[7]!=muts_Both_HGMD_ClinVar[14])]
print muts_Both_HGMD_ClinVar_DiffBase_SameLoc.shape
muts_Both_HGMD_ClinVar_DiffBase_SameLoc.head()

(14748, 16)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
28,1,1806502,1806503,CM164924,GNB1,-,A,G,DM,1,1806502,1806503,391609,A,C,Pathogenic
30,1,1806502,1806503,CM164924,GNB1,-,A,G,DM,1,1806502,1806503,224715,A,T,Pathogenic
31,1,1806502,1806503,CM164922,GNB1,-,A,T,DM,1,1806502,1806503,391609,A,C,Pathogenic
32,1,1806502,1806503,CM164922,GNB1,-,A,T,DM,1,1806502,1806503,208722,A,G,Conflicting_interpretations_of_pathogenicity
40,1,2228824,2228825,CM165733,SKI,+,C,A,DM,1,2228824,2228825,409969,C,T,Uncertain_significance


In [29]:
# The problem above as we can notice is that using bedtools intersectBind, it's overlapping the same line in file
# with multiple lines in the other file and vice versa. 
# Quick way to check is if IDs found in dataset: same location, different base has any IDs in dataset: same base,
# same location
# Those that don't will be included as unique rows
#ClinVar_IDs_ThatDiffer = [i for i in list(muts_Both_HGMD_ClinVar_DiffBase_SameLoc[12].values) if i not in list(muts_Both_HGMD_ClinVar_SameBase[12].values)]
ClinVar_IDs_ThatDiffer = list(muts_Both_HGMD_ClinVar_DiffBase_SameLoc[~muts_Both_HGMD_ClinVar_DiffBase_SameLoc[12].isin(muts_Both_HGMD_ClinVar_SameBase[12])][12].values)
print len(ClinVar_IDs_ThatDiffer)
HGMD_IDs_ThatDiffer = ClinVar_IDs_ThatDiffer = list(muts_Both_HGMD_ClinVar_DiffBase_SameLoc[~muts_Both_HGMD_ClinVar_DiffBase_SameLoc[3].isin(muts_Both_HGMD_ClinVar_SameBase[3])][3].values)
print len(HGMD_IDs_ThatDiffer)

5561
5787


In [30]:
# Gather all mutations in both HGMD and ClinVar that have the same coordinates with similar named columns
muts_Both_SameBase = muts_Both_HGMD_ClinVar_SameBase.iloc[:,[0,1,2,3,6,7,8]]
muts_Both_SameBase.columns = ["Chrom","Start","End","MutID","WTbase","MUTbase","ClinSig"]
muts_Both_DiffBase_HGMD = muts_Both_HGMD_ClinVar_DiffBase_SameLoc[muts_Both_HGMD_ClinVar_DiffBase_SameLoc[3].isin(HGMD_IDs_ThatDiffer)].iloc[:,[0,1,2,3,6,7,8]]
muts_Both_DiffBase_HGMD.columns = ["Chrom","Start","End","MutID","WTbase","MUTbase","ClinSig"]
muts_Both_DiffBase_ClinVar = muts_Both_HGMD_ClinVar_DiffBase_SameLoc[muts_Both_HGMD_ClinVar_DiffBase_SameLoc[12].isin(ClinVar_IDs_ThatDiffer)].iloc[:,[9,10,11,12,13,14,15]]
muts_Both_DiffBase_ClinVar.columns = ["Chrom","Start","End","MutID","WTbase","MUTbase","ClinSig"]

In [31]:
# Read mutations that are only found in HGMD
muts_Only_HGMD = pd.read_csv("../processed_data/HGMD_SingleBaseMutations_NotInClinVar.txt",header=None,sep="\t",low_memory=False)
print muts_Only_HGMD.shape
muts_Only_HGMD.head()

(73798, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,942142,942143,CM1511864,SAMD11,+,C,G,DM?
1,1,1211916,1211917,CS060109,TNFRSF4,-,G,A,DP
2,1,1232512,1232513,CM1411602,B3GALT6,+,A,G,DM
3,1,1232753,1232754,CM148989,B3GALT6,+,C,A,DM
4,1,1232833,1232834,CM165283,B3GALT6,+,T,C,DM


In [32]:
# Read mutations that are only found in ClinVar
muts_Only_ClinVar = pd.read_csv("../processed_data/ClinVar_SingleBaseMutations_NotInHGMD.txt",header=None,sep="\t",low_memory=False)
print muts_Only_ClinVar.shape
muts_Only_ClinVar.head()

(252132, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,1,1014041,1014042,475283,G,A,Benign
1,1,1014216,1014217,475278,C,T,Benign
2,1,1014227,1014228,402986,G,A,Benign
3,1,1014450,1014451,475281,C,T,Benign
4,1,1014470,1014471,475282,G,C,Likely_benign


In [33]:
# Gather all mutations that are only in HGMD or ClinVar with similar named columns
muts_Only_HGMD_NoStrand = muts_Only_HGMD.iloc[:,[0,1,2,3,6,7,8]]
muts_Only_HGMD_NoStrand.columns = ["Chrom","Start","End","MutID","WTbase","MUTbase","ClinSig"]
muts_Only_ClinVar.columns = ["Chrom","Start","End","MutID","WTbase","MUTbase","ClinSig"]

In [34]:
# Append all the data frames together to get a total set of mutations to work with 
allmuts = muts_Both_SameBase.append(muts_Both_DiffBase_HGMD.append(muts_Both_DiffBase_ClinVar.append(muts_Only_HGMD_NoStrand.append(muts_Only_ClinVar))))
print allmuts.shape
allmuts.head()

(379990, 7)


Unnamed: 0,Chrom,Start,End,MutID,WTbase,MUTbase,ClinSig
0,1,1014142,1014143,CM1411641,C,T,DM
1,1,1014358,1014359,CM128668,G,T,DM
2,1,1022224,1022225,CM148517,G,A,DM
3,1,1022312,1022313,CM148518,A,T,DM
4,1,1041581,1041582,CM126385,C,T,DM


In [35]:
# Sort the data and add "chr" to the chromosome name
allmuts_sorted = allmuts.sort_values(["Chrom","Start"])
allmuts_sorted['Chrom'] = 'chr' + allmuts_sorted['Chrom'].astype(str)
print allmuts_sorted.shape
allmuts_sorted.head()

(379990, 7)


Unnamed: 0,Chrom,Start,End,MutID,WTbase,MUTbase,ClinSig
0,chr1,942142,942143,CM1511864,C,G,DM?
0,chr1,1014041,1014042,475283,G,A,Benign
0,chr1,1014142,1014143,CM1411641,C,T,DM
1,chr1,1014216,1014217,475278,C,T,Benign
2,chr1,1014227,1014228,402986,G,A,Benign


In [36]:
# Write the data to a new file
allmuts_sorted.to_csv("../processed_data/CombinedMutations_HGMDandClinVar.bed",sep="\t",header=False,index=False)

Let's separate out HGMD and ClinVar information into two different files. 

In [37]:
# This is for HGMD mut
muts_Both_SameBase = muts_Both_HGMD_ClinVar_SameBase.iloc[:,[0,1,2,3,5,6,7,8]]
muts_Both_SameBase.columns = ["Chrom","Start","End","MutID","Strand","WTbase","MUTbase","ClinSig"]
muts_Both_DiffBase_HGMD = muts_Both_HGMD_ClinVar_DiffBase_SameLoc[muts_Both_HGMD_ClinVar_DiffBase_SameLoc[3].isin(HGMD_IDs_ThatDiffer)].iloc[:,[0,1,2,3,5,6,7,8]]
muts_Both_DiffBase_HGMD.columns = ["Chrom","Start","End","MutID","Strand","WTbase","MUTbase","ClinSig"]
muts_Only_HGMD_Strand = muts_Only_HGMD.iloc[:,[0,1,2,3,5,6,7,8]]
muts_Only_HGMD_Strand.columns = ["Chrom","Start","End","MutID","Strand","WTbase","MUTbase","ClinSig"]
allmuts_HGMD = muts_Both_SameBase.append(muts_Only_HGMD_Strand.append(muts_Both_DiffBase_HGMD))
allmuts_HGMD_sorted = allmuts_HGMD.sort_values(["Chrom","Start"])
allmuts_HGMD_sorted['Chrom'] = 'chr' + allmuts_HGMD_sorted['Chrom'].astype(str)
print allmuts_HGMD_sorted.shape
allmuts_HGMD_sorted.head()

(127858, 8)


Unnamed: 0,Chrom,Start,End,MutID,Strand,WTbase,MUTbase,ClinSig
0,chr1,942142,942143,CM1511864,+,C,G,DM?
0,chr1,1014142,1014143,CM1411641,+,C,T,DM
1,chr1,1014358,1014359,CM128668,+,G,T,DM
2,chr1,1022224,1022225,CM148517,+,G,A,DM
3,chr1,1022312,1022313,CM148518,+,A,T,DM


In [38]:
# Write the data to a new file
allmuts_HGMD_sorted.to_csv("../processed_data/Mutations_HGMD.bed",sep="\t",header=False,index=False)

In [39]:
# This is the ClinVar data
allmuts_ClinVar = muts_Only_ClinVar.append(muts_Both_DiffBase_ClinVar)
allmuts_ClinVar_sorted = allmuts_ClinVar.sort_values(["Chrom","Start"])
allmuts_ClinVar_sorted['Chrom'] = 'chr' + allmuts_ClinVar_sorted['Chrom'].astype(str)
print allmuts_ClinVar_sorted.shape
allmuts_ClinVar_sorted.head()

(252132, 7)


Unnamed: 0,Chrom,Start,End,MutID,WTbase,MUTbase,ClinSig
0,chr1,1014041,1014042,475283,G,A,Benign
1,chr1,1014216,1014217,475278,C,T,Benign
2,chr1,1014227,1014228,402986,G,A,Benign
3,chr1,1014450,1014451,475281,C,T,Benign
4,chr1,1014470,1014471,475282,G,C,Likely_benign


In [40]:
# Write the data to a new file
allmuts_ClinVar_sorted.to_csv("../processed_data/Mutations_ClinVar.bed",sep="\t",header=False,index=False)