In [72]:
#imports ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from numpy.random import choice
import copy 
from tqdm import tqdm
import sys 
from datetime import datetime
import time
import timeit
import numpy as np 
import json
from multiprocessing import Pool
import pandas as pd

In [10]:

#defining funtions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def base_to_mutate(DNA, current_weights, indices): 
    """
    (DNA: str, weights: Dict) -> int
    
    Will return the index of a random choice of a base that will be mutated based on probabilities given by the
    weights dictionary (i.e. the model)
    ex. "ATCGTA" --> index 3 ("G") will mutate
    """

    # NORMALIZE THE POPULATION OF WEIGHTS    
    total_freq = sum(current_weights) - current_weights[0] - current_weights[-1] # remove the start and end weight
    normalized_weights = ["error"]*len(current_weights)
    for index, value in enumerate(current_weights):
        normalized_weights[index] = value/total_freq 
    
    normalized_weights[0] = 0
    normalized_weights[-1] = 0
    # DRAW THE INDEX OF THE BASE THAT WILL BE MUTATED
    base_index = choice(indices, p=normalized_weights) 
    

    # RETURN THE INDEX
    return base_index 


In [27]:
#define the simulation function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def intron_sim(trialN): 
    
    #set up for the simulation
    DNA  = choice(["A", "T", "C", "G"], size = dna_length)
    DNA = "".join(DNA)
    DNA_initial = copy.copy(DNA)

    #triplets
    triplets = []
    for i_1 in ["A", "T", "G", "C"]: 
        for i_2 in ["A", "T", "G", "C"]: 
            for i_3 in ["A", "T", "G", "C"]: 
                triplets.append(i_1+i_2+i_3)


    #mutability model 
    model = json.load(open("../Human_mutability_model/Model_2020_12_02_genomeWide.txt"))
    #triplet-count dict 
    triplet_chosen_count_dict = {}
    triplet_into_count_dict = {}
    for triplet in triplets: 
        triplet_chosen_count_dict[triplet] = [0,0,0]  
        triplet_into_count_dict[triplet] = [0,0,0]

    #making the weights 
    current_weights = []
    for i in range(1,len(DNA)-1): 
        triplet = DNA[i-1:i+2]
        current_weights.append(model[triplet][0])

    indices = [i for i in range(1, len(DNA)-1)]
    mut_indices = []

    #making the av muts lists
    #creating the black plotting lists 
    av_mutability = []
    intron_mut = []
    exon_mut = []

    #perfomr the simulation
    for i in (range((int(dna_length*prop_muts+1)))):
    #     s0 = time.time()
        base_index = base_to_mutate(DNA, current_weights, indices)

        mut_indices.append(base_index)

        #adding the count for "chosen to mutate" in coutns dict 
        c_triplet = DNA[base_index-1: base_index+2]
        c_triplet_left = DNA[base_index-2: base_index+1]
        c_triplet_right = DNA[base_index: base_index+3]    
        triplet_chosen_count_dict[c_triplet_left][0] += 1
        triplet_chosen_count_dict[c_triplet][1] += 1
        triplet_chosen_count_dict[c_triplet_right][2] += 1

        #accurate model INTO probability ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~OR ^ 
        curr_into_bases = []
        curr_into_bases_probs = []
        for into_base, prob in model[c_triplet][1].items(): 
            curr_into_bases.append(into_base)
            curr_into_bases_probs.append(prob)
        mb =  choice(curr_into_bases, p = curr_into_bases_probs)[1]

        #adding the count for "mutated into thiss" in the triplet dict 
        m_triplet = c_triplet[0]+mb+c_triplet[2]
        m_triplet_left = c_triplet_left[0:2]+mb
        m_triplet_right = mb+c_triplet_right[1:3]

        triplet_into_count_dict[m_triplet_left][0] += 1 
        triplet_into_count_dict[m_triplet][1] += 1 
        triplet_into_count_dict[m_triplet_right][2] += 1 

        #chanaging the dna 
        DNA = DNA[:base_index]+mb+DNA[base_index+1:]

        #updating the weights 
        current_weights[base_index-2]= model[m_triplet_left][0]    
        current_weights[base_index-1]= model[m_triplet][0]
        current_weights[base_index] = model[m_triplet_right][0]

        #appending the av mut lists 
        av_mutability.append(np.mean(current_weights))

        if i in list(range(0,dna_length*prop_muts+1,int(dna_length*prop_muts/nTimesToPrint))): 
            file = open("data/trip_eq/{t}_DNA_gen{i}.txt".format(t=trialN,i=i), "w")
            file.write(str(DNA))
            file.close()


    text_file = open("data/trip_eq/{t}_triplet_into_count_dict.txt".format(t=trialN), "w")
    n = text_file.write(json.dumps(triplet_into_count_dict))
    text_file.close()

    text_file = open("data/trip_eq/{t}_triplet_chosen_count_dict.txt".format(t=trialN), "w")
    n = text_file.write(json.dumps(triplet_chosen_count_dict))
    text_file.close()

    file = open("data/trip_eq/{t}_avMut.txt".format(t=trialN), "w")
    file.write(json.dumps(av_mutability))
    file.close()


**perfomring the paraellization**

In [29]:
prop_muts =2
nTimesToPrint =10
dna_length = 10000

if __name__ == '__main__':
    with Pool(10) as p:
        p.map(intron_sim,["tripletEq_trial"+str(i) for i in range(100)])

In [23]:
max(range((int((dna_length)*prop_muts+1))))

20000

In [28]:
!rm data/trip_eq/*

# **prepping results for plotting**

In [88]:
trips_32 = []
for base1 in ["A","T","C","G"]: 
    for base2 in ["C","T"]: 
        for base3 in  ["A","T","C","G"]: 
            trips_32.append(base1+base2+base3)

In [89]:
model = json.load(open("../Human_mutability_model/Model_2020_12_02_genomeWide.txt"))
def revComp(triplet): 
    conversion_dict = {"A":"T","T":"A","C":"G","G":"C"} #dictionary that contaisn the reverse complement of each base 
    if triplet[1] in ["A","G"]: # i personally like to remove A&Gs 
        rc_triplet = str()      #set up the new triplet ot be created 
        for base in triplet: 
            rc_triplet = conversion_dict[base]+rc_triplet #adds the reverse complemented bases to the rc_triplet in reverse order 
        return rc_triplet
    else: 
        return triplet    #if the middle base is C or T, keep the triplet as is 

In [90]:
gens_to_print = list(range(0,dna_length*prop_muts+1,int(dna_length*prop_muts/nTimesToPrint)))
gen_sum_dict = {}
ntrial = 100
for gen in gens_to_print: 
    gen_sum_dict["gen"+str(gen)] = {}
    for trial in range(ntrial): 
        gen_sum_dict["gen"+str(gen)]["trial"+str(trial)]={}

In [91]:
gens_to_print

[0, 2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000, 20000]

In [92]:
for gen in tqdm(gens_to_print): 
    for trial in range(ntrial): 
        DNA = open("data/trip_eq/tripletEq_trial{k}_DNA_gen{i}.txt".format(k=trial,i=int(gen))).readlines()[0]
        trip_list = []
        for i in range(1,len(DNA)-1): 
            triplet = DNA[i-1:i+2]
            trip_list.append(triplet)
        for triplet in model.keys(): 
            if triplet[1] in ["C","T"]: 
                tripAndRevComp_count = 0
                tripAndRevComp_count+= trip_list.count(triplet)
                tripAndRevComp_count+= trip_list.count(revComp(triplet))
            gen_sum_dict["gen"+str(gen)]["trial"+str(trial)][triplet]=tripAndRevComp_count
                
            

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:22<00:00,  2.00s/it]


In [121]:
trip_sum_dict = {}
for triplet in model.keys(): 
    if triplet[1] in ["C","T"]: 
        trip_sum_dict[triplet] = {}

In [122]:
for triplet in model.keys(): 
    if triplet[1] in ["C","T"]:
        trip_sum_dict[triplet]["mutability"]  = model[triplet][0]
        trip_sum_dict[triplet]["log10mut"] = np.log10(model[triplet][0])
        for gen in gens_to_print: 
            tripletTrial_list = []
            for trial_g in range(0,ntrial,10): 
                trial_g_counts = 0
                for trial_i in range(trial_g,trial_g+10): 
                    trial_g_counts += gen_sum_dict["gen"+str(gen)]["trial"+str(trial_i)][triplet]
                tripletTrial_list.append(trial_g_counts)
            trip_sum_dict[triplet]["meanCount_"+str((gen/dna_length))+"xMutCov"] = np.mean(tripletTrial_list)
            trip_sum_dict[triplet]["stderrCount_"+str((gen/dna_length))+"xMutCov"] = np.std(tripletTrial_list)/np.sqrt(10)

In [123]:
trip_sum_df = pd.DataFrame.from_dict(trip_sum_dict,orient="index")

In [125]:
trip_sum_df.head()

Unnamed: 0,mutability,log10mut,meanCount_0.0xMutCov,stderrCount_0.0xMutCov,meanCount_0.2xMutCov,stderrCount_0.2xMutCov,meanCount_0.4xMutCov,stderrCount_0.4xMutCov,meanCount_0.6xMutCov,stderrCount_0.6xMutCov,...,meanCount_1.2xMutCov,stderrCount_1.2xMutCov,meanCount_1.4xMutCov,stderrCount_1.4xMutCov,meanCount_1.6xMutCov,stderrCount_1.6xMutCov,meanCount_1.8xMutCov,stderrCount_1.8xMutCov,meanCount_2.0xMutCov,stderrCount_2.0xMutCov
GCG,0.000293,-3.533051,3204.6,40.04902,445.4,28.176657,271.6,8.04388,203.8,3.681847,...,198.0,6.723095,202.0,2.416609,215.4,5.33704,196.8,5.947773,206.8,5.672389
GTT,2.1e-05,-4.681856,3156.0,65.984241,3347.0,55.399819,3596.0,46.615877,3690.6,24.362348,...,4103.8,35.804413,4208.2,70.798559,4240.2,36.8971,4278.4,13.97369,4337.6,37.840507
TTG,1.7e-05,-4.77711,3062.4,35.945292,4455.8,32.030548,4664.4,27.647495,4707.8,43.533389,...,4947.4,35.673015,4986.6,48.331398,5054.4,65.719282,5051.4,76.845325,5071.8,37.515277
GTG,2.1e-05,-4.683584,3167.6,41.0198,3796.2,54.724364,3566.0,42.38679,3243.4,57.812836,...,2994.6,56.815526,2983.2,25.808836,2905.8,48.128536,2908.4,25.072375,3023.4,15.218541
ATA,3.8e-05,-4.416792,3169.2,53.023165,3630.2,57.797197,3960.6,40.976139,4163.4,61.975511,...,4458.2,54.789379,4458.8,25.964899,4495.8,47.63146,4448.0,14.634207,4532.8,11.48634


In [126]:
timestamp = datetime.now().strftime("%Y_%m_%d_%H")
trip_sum_df.to_csv("data/trip_eq/tripeq_tripSummaryDf_{}h.csv".format(timestamp))