# ** analysis of the sliding window for the paper ** 

In [1]:
import matplotlib.pyplot as plt
import numpy
import json
import numpy.random
import copy
import numpy as np 
from datetime import datetime #for plotting 
from tqdm import tqdm

In [2]:
model = json.load(open("../Human_mutability_model/Model_2020_12_02_genomeWide.txt"))

** calculating the mean mutability of the whole initial strand ** 

In [5]:
DNAI = open("data/same_DNA/Files_to_use_for_parallel_DNA_gen0_cds.txt").readlines()[0]

In [13]:
initial_muts = []
for i in range(1, len(DNAI)-1): 
    triplet = DNAI[i-1: i+2]
    initial_muts.append(model[triplet][0])

In [14]:
np.mean(initial_muts)

5.4395848326695644e-05

** calculating the mean mutability of all the final strnads ** 

In [25]:
DNAF_dataDict = {}
#note I want the dna string to be an entry in the list value fo the DNAF dictionary so i can add weigths later 
n_gens = 600000
nTrials = 10
for trial_num in range(1,nTrials+1):
    file_name = "data/same_DNA/Trial{t}_DNA_final_cds_invariant50_{g}.txt".format(t=trial_num, g=n_gens)
    DNAF_dataDict["Trial"+str(trial_num)] = open(file_name).readlines()[0]

In [26]:
final_av_muts = []
for trial_num in range(1,nTrials+1):
    cur_final_muts = []
    trial_name = "Trial{t}".format(t=trial_num)
    for i in range(1, len(DNAI)-1): 
        triplet = DNAF_dataDict[trial_name][i-1:i+2]
        cur_final_muts.append(model[triplet][0])
    final_av_muts.append(np.mean(cur_final_muts))

In [28]:
np.mean(final_av_muts)

3.201456438655606e-05

** calculating the mutability of the initial strand exons ** 

In [36]:
exon_inser_dict = json.load(open("data/same_DNA/Files_to_use_for_parallel_exon_insertion_dict.txt"))

In [40]:
initial_coding_mut = []
for exon_bounds in exon_inser_dict.values(): 
    start = exon_bounds[0]
    end = exon_bounds[1]
    length = end-start
    cur_exon_muts = []
    for i in range(1, length-1): 
        triplet = DNAI[i-1:i+2]
        mut = model[triplet][0]
        cur_exon_muts.append(mut)
    initial_coding_mut.append(np.mean(cur_exon_muts))

In [42]:
np.mean(initial_coding_mut)

5.9234892254934978e-05

** calculating the mutability of the initial strand non-coding ** 

In [43]:
exon_bounds_list = []
for exon_bounds in exon_inser_dict.values(): 
    exon_bounds_list.extend(exon_bounds)
exon_bounds_list.sort()

In [45]:
exon_bounds_list

[7141,
 11479,
 18621,
 23421,
 30563,
 35810,
 42952,
 54730,
 61872,
 68859,
 76001,
 80708,
 87850,
 100144]

In [46]:
intron_start = 1
intron_muts = []
for j in range(0, len(exon_bounds_list), 2): #the cds_pos dict was converted into a list, here here we want to iterate it by 2 
    start = exon_bounds_list[j] #start of the exon 
    end = exon_bounds_list[j+1] #end of the exon 
    for k in range(intron_start,start): #analyze triplets from previous exon end to this intron start 
        triplet = DNAI[k-1: k+2]
        intron_muts.append(model[triplet][0])
    intron_start = end #update the "intron starting value" with the end of the current intron
np.mean(intron_muts)

6.1015518031740047e-05

** how the exception exons chnaged (didnt decrease as much) ** 

In [56]:
exept_exons = {'ENST00000287097.6':[],'ENST00000338368.7':[]}

In [57]:
# initial mutability 
for exon_name,value in exept_exons.items(): 
    bounds = exon_inser_dict[exon_name]
    exon_mut = []
    for i in range(bounds[0], bounds[1]): 
        triplet = DNAI[i-1:i+2]
        mut = model[triplet][0]
        exon_mut.append(mut)
    value.append(exon_mut)

In [59]:
np.mean(exept_exons['ENST00000287097.6']), np.mean(exept_exons['ENST00000338368.7'])

(3.1645926001598445e-05, 3.8259818539031552e-05)

In [53]:
exon_inser_dict

{'ENST00000287097.6': [7141, 11479],
 'ENST00000293879.9': [30563, 35810],
 'ENST00000338368.7': [87850, 100144],
 'ENST00000346169.7': [18621, 23421],
 'ENST00000389722.7': [61872, 68859],
 'ENST00000437464.1': [42952, 54730],
 'ENST00000613019.4': [76001, 80708]}

In [57]:
# initial mutability 
for exon_name,value in exept_exons.items(): 
    bounds = exon_inser_dict[exon_name]
    exon_mut = []
    for i in range(bounds[0], bounds[1]): 
        triplet = DNAI[i-1:i+2]
        mut = model[triplet][0]
        exon_mut.append(mut)
    value.append(exon_mut)

In [61]:
len(DNAF_dataDict["Trial1"])

100145