# Genotype Calculator v2

In [92]:
# Read in file and create dictionary of SNP names and genotype info
def gfdict(infile, outfile):
    """Write per-SNP phased genotype frequencies from a gzipped VCF.

    Every data record of ``infile`` is read using the fixed VCF column
    layout: ID in column 3, REF in column 4, ALT in column 5 and the
    per-sample genotype fields from column 10 onwards.  For each record
    whose ID starts with ``rs`` the fraction of samples carrying each phased
    genotype (``1|0``, ``0|1``, ``1|1``, ``0|0``) is computed and one line of
    the form ``<rsID>, [<list of six strings>]`` is written to ``outfile``.

    Parameters
    ----------
    infile : str
        Path to a gzip-compressed, tab-separated VCF file.
    outfile : str
        Path of the text file to create (overwritten if it exists).

    Returns
    -------
    None
        Results are written to ``outfile``; the elapsed time is printed.

    Notes
    -----
    * Header lines (starting with ``#``), blank lines and lines with fewer
      than five columns are skipped.
    * Records whose ID column does not start with ``rs`` are skipped, so a
      record without an rsID can never overwrite the entry of an earlier SNP.
    * If an rsID occurs more than once, the last record wins.
    * Only genotype fields starting with ``0|`` or ``1|`` count towards the
      sample total.  A record with no such field gets 0.0 for every
      frequency instead of raising ``ZeroDivisionError``.
    """
    import gzip
    import time
    from collections import Counter

    start_time = time.time()

    # Phased genotypes reported per SNP, in output order.
    het_first, het_second, hom_alt, hom_ref = '1|0', '0|1', '1|1', '0|0'

    frequencies_by_rsid = {}

    with gzip.open(infile, 'rt') as vcf:
        for line in vcf:
            # Covers both '##' meta lines and the '#CHROM' column header.
            if line.strip().startswith('#'):
                continue

            # Drop the line terminator so the last sample field is clean.
            items = line.rstrip('\r\n').split('\t')
            if len(items) < 5:
                continue

            rsID = items[2]
            if not rsID.startswith('rs'):
                continue

            refallele = items[3]
            altallele = items[4]

            # Compare on the first three characters because a sample field
            # may carry extra sub-fields after the genotype (e.g. '0|1:35').
            tally = Counter(
                field[:3]
                for field in items[9:]
                if field.startswith(('0|', '1|'))
            )
            sample_number = sum(tally.values())

            def fraction(genotype):
                # Guard against records without any phased genotype.
                if not sample_number:
                    return 0.0
                return tally[genotype] / sample_number

            frequencies_by_rsid[rsID] = [
                'REF = ' + refallele,
                'ALT = ' + altallele,
                altallele + '|' + refallele + ' = ' + str(fraction(het_first)),
                refallele + '|' + altallele + ' = ' + str(fraction(het_second)),
                altallele + '|' + altallele + ' = ' + str(fraction(hom_alt)),
                refallele + '|' + refallele + ' = ' + str(fraction(hom_ref)),
            ]

    with open(outfile, 'w') as f:
        for key, lstgf in frequencies_by_rsid.items():
            f.write("%s, %s\n" % (key, lstgf))

    print("File created in --- %s seconds ---" % (time.time() - start_time))

In [93]:
# Check if there are duplicates in a list of elements 

def checkIfDuplicates_1(listOfElems):
    """Report whether any element occurs more than once in ``listOfElems``.

    The elements must be hashable because they are collected into a set.

    Returns the string 'Duplicates Found' when at least one element is
    repeated, otherwise 'No Duplicates'.
    """
    total = len(listOfElems)
    distinct = len(set(listOfElems))
    # A set drops repeats, so a smaller distinct count means duplicates.
    return 'Duplicates Found' if total != distinct else 'No Duplicates'


In [94]:
# Create the genotype frequency file for the China subpopulation:
# reads the gzipped VCF and writes the per-rsID frequencies to China_GF.csv.
gfdict('anno_biallelic_china.vcf.gz','China_GF.csv')

File created in --- 143.3972499370575 seconds ---


In [95]:
# Create the genotype frequency file for the Bengali subpopulation:
# reads the gzipped VCF and writes the per-rsID frequencies to Bengali_GF.csv.
gfdict('anno_biallelic_bengali.vcf.gz','Bengali_GF.csv')


File created in --- 124.66317868232727 seconds ---


In [96]:
# Create the genotype frequency file for the GBR subpopulation:
# reads the gzipped VCF and writes the per-rsID frequencies to GBR_GF.csv.
gfdict('anno_biallelic_GBR.vcf.gz','GBR_GF.csv')

File created in --- 128.75586009025574 seconds ---


In [None]:
# Create the genotype frequency file for the Peru subpopulation:
# reads the gzipped VCF and writes the per-rsID frequencies to Peru_GF.csv.
gfdict('anno_biallelic_peru.vcf.gz','Peru_GF.csv')

In [None]:
# Create the genotype frequency file for the Esan_Nigeria subpopulation:
# reads the gzipped VCF and writes the per-rsID frequencies to
# Esan_Nigeria_GF.csv.
gfdict('anno_biallelic_esan_nigeria.vcf.gz','Esan_Nigeria_GF.csv')

In [None]:
# Create the genotype frequency file for the full chr22 VCF across all
# populations: reads the gzipped VCF and writes the per-rsID frequencies to
# Full_chr22_GF.csv.
gfdict('full_chr22.vcf.gz','Full_chr22_GF.csv')