## Genotype information encoder

**Input:** A gzipped VCF file containing all SNPs for all populations, and a CSV/TSV file mapping the sample names to their subpopulations/populations.

**Output:** A CSV file containing the encoded genotype information sequence for each SNP. This encoded sequence contains the genotype info for all samples/populations for the SNP in question.

In [22]:
'''Create the initial encoder, which converts genotype info into a character string and produces an
   order dictionary for the sequence. These are output as two CSV files. '''

def encoderseq(infile, outdb, outorder):
    """Encode the phased genotypes of every SNP in a gzipped VCF as a string.

    Each sample's genotype is translated to a single character
    ('1|0' -> 'a', '1|1' -> 'b', '0|1' -> 'c', '0|0' -> 'd').  Any other
    genotype (missing, unphased, multi-allelic, ...) is written as '-', so
    that character i of a sequence always belongs to sample i of the
    header line.

    Args:
        infile: Path to the gzipped VCF file.
        outdb: Path of the output file; one "rsID, sequence" line per SNP.
        outorder: Path of the output file; one "sample, position" line per
            sample column, with positions counted from 0 in header order.

    Notes:
        Blank lines and records whose ID column does not start with 'rs'
        are skipped.  If an rsID occurs more than once, the last record
        wins.  The elapsed time is printed when both files are written.
    """
    # import modules needed for function
    import gzip
    import time

    # dictionary for translating genotype info
    ginfo = {'1|0': 'a', '1|1': 'b', '0|1': 'c', '0|0': 'd'}

    # Fixed VCF layout: #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT samples...
    id_column = 2
    first_sample_column = 9

    # Time how long it takes to run
    start_time = time.time()

    # rsID -> encoded genotype sequence
    dictionary = {}

    # sample name -> position of the sample in the sequence (as a string)
    order = {}

    with gzip.open(infile, 'rt') as vcf:
        for line in vcf:
            stripped = line.strip()

            # Ignore blank lines and meta-information lines
            if not stripped or stripped.startswith("##"):
                continue

            # Only remove the line ending so that column positions are kept
            fields = line.rstrip('\r\n').split('\t')

            # Column heading line: record the order of the sample columns
            if stripped.startswith("#"):
                for position, sample in enumerate(fields[first_sample_column:]):
                    order[sample] = str(position)
                continue

            # Data line: skip records that carry no rsID instead of
            # attributing their genotypes to the previous SNP
            if len(fields) <= id_column:
                continue
            rs_id = fields[id_column]
            if not rs_id.startswith('rs'):
                continue

            # The genotype is the first ':'-separated subfield of a sample
            # column; one character is emitted per sample so positions in
            # the sequence stay aligned with the order dictionary.
            dictionary[rs_id] = ''.join(
                ginfo.get(sample_field.split(':', 1)[0], '-')
                for sample_field in fields[first_sample_column:]
            )

    # Write the seq dictionary as a csv file
    with open(outdb, 'w') as f:
        f.writelines("%s, %s\n" % (key, value)
                     for key, value in dictionary.items())

    # Write the order dictionary as a csv file
    with open(outorder, 'w') as p:
        p.writelines("%s, %s\n" % (key, value)
                     for key, value in order.items())

    print("Files created in --- %s seconds ---" % (time.time() - start_time))

In [23]:
# Run the first-stage encoder on the VCF: writes the per-SNP genotype
# sequences to dbseq.csv and the sample order to sorder.csv
encoderseq('anno_biallelic_ALL_5.vcf.gz', 'dbseq.csv', 'sorder.csv')

Files created in --- 595.3310840129852 seconds ---


In [111]:
''' Check that all of the samples in the order file (taken straight from the VCF) are in the index file.
    If any are missing, add them (their subpopulation has to be looked up manually).'''

def checkfiles(index_file, order_file, output_file, default_subpop='PEL'):
    """Complete the sample index with any samples that only occur in the VCF.

    Reads the sample names from the order file (comma separated,
    "sample, position") and the sample -> subpopulation mapping from the
    index file (tab separated, first line is a header).  Every sample that
    is in the order file but not in the index is added with
    ``default_subpop``, and the completed index is written to
    ``output_file`` as "sample, subpopulation" lines.

    Args:
        index_file: Path to the tab-separated sample/subpopulation panel.
        order_file: Path to the order CSV.
        output_file: Path of the completed index CSV to write.
        default_subpop: Subpopulation assigned to the missing samples.
            The default 'PEL' reflects the original author's manual check
            that the samples missing from their panel were all Peruvian.

    Returns:
        The list of samples that were missing from the index, in the order
        in which they appear in the order file.
    """
    import csv
    import time

    # Time how long it takes to run
    start_time = time.time()

    # sample -> position, taken from the order file (blank rows ignored)
    with open(order_file, 'r', newline='') as inp:
        order = {rows[0]: rows[1] for rows in csv.reader(inp) if rows}

    # sample -> subpopulation, taken from the tsv index file
    with open(index_file, 'r', newline='') as inp:
        next(inp, None)  # skip the header line; tolerate an empty file
        reader = csv.reader(inp, delimiter='\t')
        index = {rows[0]: rows[1] for rows in reader if rows}

    # Samples missing from the index, kept in order-file order so that the
    # return value and the output file are the same on every run
    differences = [sample for sample in order if sample not in index]

    # Add the missing samples to the index
    for sample in differences:
        index[sample] = default_subpop

    # Write the index dictionary as a csv file
    with open(output_file, 'w') as f:
        f.writelines("%s, %s\n" % (key, value) for key, value in index.items())

    print("File created in --- %s seconds ---" % (time.time() - start_time))

    return differences


In [112]:
# Add the samples of sorder.csv that are missing from panel_index.csv and
# write the completed sample index to sample_index.csv
checkfiles('panel_index.csv', 'sorder.csv', 'sample_index.csv')

File created in --- 0.0035381317138671875 seconds ---


['HG01565', 'HG01572', 'HG01566', 'HG01577', 'HG01578', 'HG01571']

In [70]:
# create dictionary of order from csv order file - used in second stage of encoding

def dictorder(file):
    """Load the order CSV as a dictionary keyed by sequence position.

    Each row of the file is "sample, position"; the returned dictionary
    maps the position (converted to int) to the sample name.
    """
    import csv

    position_to_sample = {}
    with open(file, 'r') as handle:
        for record in csv.reader(handle):
            sample, position = record[0], record[1]
            position_to_sample[int(position)] = sample

    return position_to_sample
    

In [101]:
# create dictionary of sample-subpopulations - used in second stage of encoding

def dictsubpop(file):
    """Load the sample index CSV as a sample -> subpopulation dictionary.

    The first column of each row is used as the key and the second column
    as the value, exactly as the csv reader returns them (no stripping).
    """
    import csv

    with open(file, 'r') as handle:
        pairs = [(record[0], record[1]) for record in csv.reader(handle)]

    return dict(pairs)


In [106]:
# create fully encoded rsID-sequence file

def encoded(order_file, index_file, sequence_file, output_file):
    """Re-encode the first-stage sequences so each character also names the subpopulation.

    For every "rsID, sequence" line of ``sequence_file``, the character at
    position i (one of 'a'-'d') is replaced by the character that
    ``nested_dictionary`` assigns to it for the subpopulation of sample i.
    The sample at position i comes from ``order_file`` and its
    subpopulation from ``index_file``.  The placeholder '-' (genotype that
    could not be translated in the first stage) is copied through
    unchanged.

    Args:
        order_file: Path to the order CSV ("sample, position" lines).
        index_file: Path to the sample index CSV ("sample, subpop" lines).
        sequence_file: Path to the first-stage CSV ("rsID, sequence" lines).
        output_file: Path of the CSV to write ("rsID, sequence" lines).

    Raises:
        KeyError: If a position has no sample in the order file, a sample
            has no entry in the index, its subpopulation is not one of
            those in ``nested_dictionary``, or a sequence contains a
            character other than 'a'-'d' and '-'.

    Notes:
        Blank lines and lines whose first field does not start with 'rs'
        are skipped.  The elapsed time is printed once the file is written.
    """
    import time

    # dictionaries of subpop-based encoding
    nested_dictionary = {'GBR': {'a': 'e', 'b': 'f', 'c': 'g', 'd': 'h'},
                         'PEL': {'a': 'i', 'b': 'j', 'c': 'k', 'd': 'l'},
                         'ESN': {'a': 'm', 'b': 'n', 'c': 'o', 'd': 'p'},
                         'BEB': {'a': 'q', 'b': 'r', 'c': 's', 'd': 't'},
                         'CHB': {'a': 'u', 'b': 'v', 'c': 'w', 'd': 'x'}}

    # Time how long it takes to run
    start_time = time.time()

    # Create dictionaries from the order and index files
    order = dictorder(order_file)    # position -> sample name
    index = dictsubpop(index_file)   # sample name -> subpopulation

    # position -> translation table of that sample's subpopulation; filled
    # on first use so the two lookups are not repeated for every SNP
    position_tables = {}

    # rsID -> fully encoded sequence
    output_dict = {}

    # open the initial encoding csv file
    with open(sequence_file, 'rt') as f:
        for line in f:
            # split into rsID and initial sequence
            rs_id, _, sequence = line.partition(',')
            rs_id = rs_id.strip()

            # skip blank/malformed lines rather than overwriting the
            # previous SNP with their content
            if not rs_id.startswith('rs'):
                continue

            new_seq = []
            for position, character in enumerate(sequence.strip()):
                # untranslatable genotype: keep the placeholder
                if character == '-':
                    new_seq.append(character)
                    continue

                table = position_tables.get(position)
                if table is None:
                    # use position to find the sample, then its subpop
                    sample = order[position]
                    subpop = index[sample].strip(' ')
                    table = nested_dictionary[subpop]
                    position_tables[position] = table

                new_seq.append(table[character])

            # Add sequence as value to dictionary
            output_dict[rs_id] = ''.join(new_seq)

    # Write the encoded seq dictionary as a csv file
    with open(output_file, 'w') as out:
        out.writelines("%s, %s\n" % (key, value)
                       for key, value in output_dict.items())

    print("File created in --- %s seconds ---" % (time.time() - start_time))


In [108]:
# Run the second-stage encoder: re-encodes the sequences of dbseq.csv using
# sorder.csv and sample_index.csv, and writes the result to encoded.csv
encoded('sorder.csv', 'sample_index.csv','dbseq.csv', 'encoded.csv')

File created in --- 119.58313822746277 seconds ---
