---
# Count triplet frequencies in Human genome HG38
---

- To estimate the relative mutation rate of each triplet 
  we need to know how many of each triplet exist in the 
  human genome
  
- We simply walk through the genome counting each unique
  triplet.
  
- The raw counts will include triplets containing N's and lower-case letters

  - We ignore triplets containing N's and convert lower-case letters to upper case



In [1]:
import json
from Bio import SeqIO
import gzip 
from collections import defaultdict

In [2]:

# Maps each triplet string to its occurrence count; unseen keys start at 0.
trips = defaultdict(int)

In [4]:
# Display the current contents of the triplet counter.
trips

defaultdict(int, {})

In [None]:


# Count every overlapping triplet (3-mer) in each FASTA record of the
# reference. Keys are the raw triplet strings exactly as they appear in the
# file (case is preserved and no characters are filtered out here).
trips = defaultdict(int)

# Open the gzipped FASTA in text mode inside a context manager so the handle
# is closed even if parsing fails part-way through.
with gzip.open("/scratch/research/references/hsapiens/hg38/hg38.fa.gz", 'rt') as ref:
    for s in SeqIO.parse(ref, 'fasta'):
        # Convert once per record so each window below is a plain string
        # slice rather than a new Seq object that must then be stringified.
        seq = str(s.seq)
        # A sequence of length n has n - 2 triplets (start positions 0..n-3).
        # A bound of n - 3 would skip the final triplet of every record.
        for i in range(len(seq) - 2):
            triplet = seq[i:i+3]
            trips[triplet] += 1
            
    

In [15]:
from itertools import product

# Collapse the raw counts onto the 64 upper-case ACGT triplets: lower-case
# triplets are added to their upper-case form, and any triplet that is not
# pure ACGT after upper-casing (e.g. one containing N) is left out.
trip_data = {"".join(bases): 0 for bases in product("ACGT", repeat=3)}
for triplet, t_count in trips.items():
    canonical = triplet.upper()
    if canonical not in trip_data:
        continue
    trip_data[canonical] += t_count

# Print one "<triplet> :<tab> <count>" line per triplet, in sorted order.
triplets = sorted(trip_data)
for t in triplets:
    print(t, ":\t", trip_data[t])
        

AAA :	 116219865
AAC :	 45054039
AAG :	 60506716
AAT :	 74983114
ACA :	 61496684
ACC :	 35209715
ACG :	 7954793
ACT :	 49169423
AGA :	 67980623
AGC :	 42741732
AGG :	 53953871
AGT :	 49109578
ATA :	 62191925
ATC :	 40532560
ATG :	 55516564
ATT :	 75663351
CAA :	 57170968
CAC :	 45873394
CAG :	 62322498
CAT :	 55814087
CCA :	 55568638
CCC :	 39882676
CCG :	 8495526
CCT :	 54101070
CGA :	 6847361
CGC :	 7416430
CGG :	 8705912
CGT :	 8010016
CTA :	 38950865
CTC :	 51615280
CTG :	 61556952
CTT :	 61394655
GAA :	 61143571
GAC :	 28896461
GAG :	 51713795
GAT :	 41018890
GCA :	 44177624
GCC :	 36081859
GCG :	 7479830
GCT :	 42326258
GGA :	 48024396
GGC :	 36067367
GGG :	 40000766
GGT :	 35209635
GTA :	 34395292
GTC :	 28621456
GTG :	 46474021
GTT :	 44703218
TAA :	 62229283
TAC :	 34006723
TAG :	 39242680
TAT :	 62088324
TCA :	 59937972
TCC :	 46873740
TCG :	 7049574
TCT :	 67921009
TGA :	 59920239
TGC :	 43840071
TGG :	 56641534
TGT :	 61864773
TTA :	 62028934
TTC :	 61013121
TTG :	 58719052

**Copying the above into a string, converting it to a dictionary, and saving it**

In [3]:
# Triplet counts pasted in as one whitespace-separated string of
# "<triplet> : <count>" entries, to be parsed back into a dictionary.
# NOTE(review): this string ends with a TTT entry that is absent from the
# printed output shown in this file (which stops at TTG) - confirm its source.
TripletCountString = "AAA :	 116219865 AAC :	 45054039 AAG :	 60506716 AAT :	 74983114 ACA :	 61496684 ACC :	 35209715 ACG :	 7954793 ACT :	 49169423 AGA :	 67980623 AGC :	 42741732 AGG :	 53953871 AGT :	 49109578 ATA :	 62191925 ATC :	 40532560 ATG :	 55516564 ATT :	 75663351 CAA :	 57170968 CAC :	 45873394 CAG :	 62322498 CAT :	 55814087 CCA :	 55568638 CCC :	 39882676 CCG :	 8495526 CCT :	 54101070 CGA :	 6847361 CGC :	 7416430  CGG :	 8705912 CGT :	 8010016 CTA :	 38950865 CTC :	 51615280 CTG :	 61556952 CTT :	 61394655 GAA :	 61143571 GAC :	 28896461 GAG :	 51713795 GAT :	 41018890 GCA :	 44177624 GCC :	 36081859 GCG :	 7479830 GCT :	 42326258 GGA :	 48024396 GGC :	 36067367 GGG :	 40000766 GGT :	 35209635 GTA :	 34395292 GTC :	 28621456 GTG :	 46474021 GTT :	 44703218 TAA :	 62229283 TAC :	 34006723 TAG :	 39242680 TAT :	 62088324 TCA :	 59937972 TCC :	 46873740 TCG :	 7049574 TCT :	 67921009 TGA :	 59920239 TGC :	 43840071 TGG :	 56641534 TGT :	 61864773 TTA :	 62028934 TTC :	 61013121 TTG :	 58719052  TTT :	 117589720"


In [6]:
# Split on any whitespace and discard the bare ":" separators, leaving a flat
# list that alternates triplet, count, triplet, count, ...
rob_string_split = [token for token in TripletCountString.split() if token != ":"]
# Preview the first ten tokens as a sanity check.
print(rob_string_split[0:10])

['AAA', '116219865', 'AAC', '45054039', 'AAG', '60506716', 'AAT', '74983114', 'ACA', '61496684']


In [7]:
# Pair each even-indexed token (the triplet) with the token that follows it
# (its count) to build the triplet -> count mapping.
# NOTE(review): counts are kept as the split string tokens, not converted to
# int - confirm that consumers of the saved file expect strings.
triplet_counts_dict = {}
for i in range(0, len(rob_string_split), 2):
    triplet, count = rob_string_split[i], rob_string_split[i + 1]
    triplet_counts_dict[triplet] = count

In [8]:
# Serialise the triplet -> count mapping as JSON text and write it out
# (the output file carries a .txt extension but holds JSON).
serialized_counts = json.dumps(triplet_counts_dict)
with open('GenomeTripletCounts_2020_12_02_genomeWide.txt', 'w') as outfile:
    outfile.write(serialized_counts)