The purpose of this script is to prepare the common SNP datasets to be used for analysis.

The relevant columns we are looking for are chromosome, start and end coordinates, SNP id, major allele, minor allele and minor allele frequency.

In [None]:
%%script bash
# First convert vcf file to bed file 
vcf2bed < ../data/common_all_20170710.vcf > ../processed_data/common_all.bed

In [2]:
# Import relevant packages
import pandas as pd

The following functions are used to extract valid entries in the commonSNPs file that meet the following criteria:
- they are single mutations 
- they are found within genes
- they have a MAF that is greater than 0.05

In [None]:
def isfloat(value):
  try:
    float(value)
    return True
  except ValueError:
    return False

from operator import itemgetter
def getMAF_Index(rowofinterest):
    x =rowofinterest[8].split(';')
    y = [i for i in x if i is not None and "CAF=" in i]
    z = y[0].split('=')[1].split(',')
    z_p = [float(i) if isfloat(i) else 1 for i in z[1:]]
    min_index = min(enumerate(z_p), key=itemgetter(1))[0]
    return min_index

def getMAF(rowofinterest):
    x =rowofinterest[8].split(';')
    y = [i for i in x if i is not None and "CAF=" in i]
    z = y[0].split('=')[1].split(',')
    z_p = [float(i) for i in z[1:] if isfloat(i)]
    return min(z_p) 

def getRowRelevantForGroup(group):
    i = group["maf_index"].values[0]
    try:
        return group.iloc[i,:]
    except:
        print group

def getNumOfAlleles(rowofinterest):
    x =rowofinterest[8].split(';')
    y = [i for i in x if i is not None and "CAF=" in i]
    z = y[0].split('=')[1].split(',')
    return len(z)

def getGene(rowofinterest):
    x =rowofinterest[8].split(';')
    y = [i for i in x if i is not None and "GENEINFO=" in i]
    return y[0].split('=')[1]

def extractValidInfoCommonSNPs(snpfile,writefile):
    chrom = range(1,23)
    chrom.extend(['X','Y'])
    commonSNPs = pd.read_csv(snpfile,sep="\t",header=None)
    commonSNPs_single = commonSNPs[(commonSNPs[5].isin(['A','G','T','C','N']))&(commonSNPs[6].isin(['A','G','T','C','N']))]
    for chromosome in chrom:
        print chromosome
        # Get chromosomes that are for each chromosome
        commonSNPs_single_chrom = commonSNPs_single[commonSNPs_single[0]==chromosome]
        # Get number of alleles for each SNP
        numAllelesForChrom = commonSNPs_single_chrom.apply(getNumOfAlleles,1)
        # Get SNPs that have 2 alleles, one major and one minor allele
        dataForchrom_2alleles = commonSNPs_single_chrom[numAllelesForChrom == 2]
        # Get the minor allele frequency for SNPs that have 2 alleles
        mafForchrom_2alleles = dataForchrom_2alleles.apply(getMAF,1)
        # Get all SNPs that have greater than 2 alleles 
        not2alleles = commonSNPs_single_chrom[numAllelesForChrom > 2]
        # Get the index of the allele that has the minor allele frequency, this will 0,1,2 or 3
        index_MAF_not2alleles = not2alleles.apply(getMAF_Index,1)
        # Assign the index to the table that is for SNPs with more than 2 alleles
        not2alleles = not2alleles.assign(maf_index=index_MAF_not2alleles)
        # Group SNPs by SNP id and then get the row that has the index for the MAF
        dataForchrom_not2alleles = not2alleles.groupby([3]).apply(getRowRelevantForGroup)
        # get the minor allele frequency for that allele selected 
        maf_not2alleles = dataForchrom_not2alleles.apply(getMAF,1)
        dataForchrom = {"chrom":pd.concat([dataForchrom_2alleles[0],dataForchrom_not2alleles[0]]),
                        "start":pd.concat([dataForchrom_2alleles[1],dataForchrom_not2alleles[1]]),
                        "end":pd.concat([dataForchrom_2alleles[2],dataForchrom_not2alleles[2]]),
                        "snpID":pd.concat([dataForchrom_2alleles[3],dataForchrom_not2alleles[3]]),
                        "major":pd.concat([dataForchrom_2alleles[5],dataForchrom_not2alleles[5]]),
                        "minor":pd.concat([dataForchrom_2alleles[6],dataForchrom_not2alleles[6]]),
                        "MAF":pd.concat([mafForchrom_2alleles,maf_not2alleles])}
        #dfForchrom = pd.DataFrame(dataForchrom,columns=["chrom","start","end","snpID","major","minor","gene","MAF"])
        dfForchrom = pd.DataFrame(dataForchrom,columns=["chrom","start","end","snpID","major","minor","MAF"])
        dfForchrom.to_csv(writefile,sep="\t",header=False,index=False,mode="a")
        #dfForchrom_MAFvalid = dfForchrom[(dfForchrom["MAF"]>=minMAFval)&(dfForchrom["MAF"]<=(1-minMAFval))]
        #dfForchrom_MAFvalid['chrom'] = 'chr' + dfForchrom_MAFvalid['chrom'].astype(str)
        #dfForchrom_MAFvalid.to_csv(writefile,sep="\t",header=False,index=False,mode="a")

In [None]:
# Get the relevant SNPs 
#extractValidInfoCommonSNPs("../processed_data/common_all.bed","../processed_data/commonSNPs_all_processed.bed")

In [3]:
# Get the SNPs
allSNPs = pd.read_csv("../processed_data/commonSNPs_all_processed.bed",header=None,sep="\t")
print allSNPs.shape
allSNPs.head()

  interactivity=interactivity, compiler=compiler, result=result)


(33745944, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,1,10641,10642,rs558604819,G,A,0.004193
1,1,11007,11008,rs575272151,C,G,0.08806
2,1,11011,11012,rs544419019,C,G,0.08806
3,1,11062,11063,rs561109771,T,G,0.002995
4,1,13109,13110,rs540538026,G,A,0.02676


In [4]:
# Get SNPs that have MAF value greater than certain threshold 0.05
minMAFval=0.05
allSNPs_aboveMAF = allSNPs[(allSNPs[6]>=minMAFval)&(allSNPs[6]<=(1-minMAFval))]
print allSNPs_aboveMAF.shape
allSNPs_aboveMAF[0] = 'chr' + allSNPs_aboveMAF[0].astype(str)
allSNPs_aboveMAF.head()

(7105197, 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,0,1,2,3,4,5,6
1,chr1,11007,11008,rs575272151,C,G,0.08806
2,chr1,11011,11012,rs544419019,C,G,0.08806
5,chr1,13115,13116,rs62635286,T,G,0.09704
6,chr1,13117,13118,rs62028691,A,G,0.09704
7,chr1,13272,13273,rs531730856,G,C,0.09505


In [5]:
# Write the SNPs to a file
allSNPs_aboveMAF.to_csv("../processed_data/commonSNPs_all_processed_AboveMAF5Percent.bed",sep="\t",header=False,index=False)