In [None]:


import msprime
import tskit
import random
import numpy as np
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import math

# --- Demographic model ---

def history_of_mexicans(len_seq, T, t_mig, N_e, n ):
    """Simulate an admixed "MX" haploid sample and return its mutated tree sequence.

    The demography has eight populations, added in this order (so these are
    their integer ids): AF=0, EU=1, NA=2, MX=3, OUT_AF=4, AF_ANC=5, NEAND=6,
    ANCES=7.

    Args:
        len_seq: sequence length passed to ``msprime.sim_ancestry``.
        T: indexable of event times; ``T[0]`` is the AF_ANC/NEAND split,
            ``T[1]`` the AF/OUT_AF split, ``T[2]`` the EU/NA split and
            ``T[3]`` the admixture time that creates MX.
        t_mig: time of the 5% mass migration from OUT_AF into NEAND.
        N_e: initial size used for OUT_AF, AF_ANC and ANCES.
        n: number of haploid samples taken from each of EU, NA and AF.

    Returns:
        ``(ts, proportions)``: the first replicate that passes the ancestry
        filter, with mutations added, and the length-3 array of AF/EU/NA
        ancestry fractions measured from its migration records.

    Raises:
        RuntimeError: if none of the 15 replicates passes the ancestry filter.
    """
    demography = msprime.Demography()

    demography.add_population(name="AF", initial_size=500000)
    demography.add_population(name="EU", initial_size=300000)
    demography.add_population(name="NA", initial_size=40000)
    demography.add_population(name="MX", initial_size=40000)

    demography.add_population(name="OUT_AF", initial_size=N_e)
    demography.add_population(name="AF_ANC", initial_size=N_e)

    demography.add_population(name="NEAND", initial_size=30000)
    demography.add_population(name="ANCES", initial_size=N_e)

    demography.add_admixture(time= T[3] , derived="MX", ancestral=["AF", "EU", "NA"], proportions=[0.05, 0.5, 0.45])
    demography.add_population_split(time=T[2], derived=["EU", "NA"], ancestral="OUT_AF")
    demography.add_mass_migration(time=t_mig, source='OUT_AF', dest='NEAND', proportion=0.05)
    demography.add_population_split(time=T[1], derived=["AF", "OUT_AF"], ancestral="AF_ANC")
    demography.add_population_split(time=T[0], derived=["AF_ANC", "NEAND"], ancestral="ANCES")

    TS = msprime.sim_ancestry(
        samples=
        [
                msprime.SampleSet(1, ploidy=1, population='MX'),
                msprime.SampleSet(n, ploidy=1, population='EU'),
                msprime.SampleSet(n, ploidy=1, population='NA'),
                msprime.SampleSet(n, ploidy=1, population='AF'),
                msprime.SampleSet(25, ploidy=1, population='NEAND', time=2000)

        ],

        ploidy=1,
        sequence_length=len_seq,
        recombination_rate=2.5e-9,
        demography=demography,
        record_migrations=True,
        random_seed=54321299,
        num_replicates = 15

    )

    # For every replicate, measure how much sequence of one lineage moved into
    # AF (0), EU (1) and NA (2) at the admixture time T[3].
    prop = []
    for ts in TS:
        migrations = ts.tables.migrations

        # The tracked lineage is the node of the first admixture record that
        # goes into AF; replicates without such a record are skipped.
        individual = -1
        for row in migrations:
            if row.time == T[3] and row.dest == 0:
                individual = row.node
                break
        if individual == -1:
            continue

        proportions = np.zeros(3)  # sequence length assigned to AF, EU, NA
        for row in migrations:
            if row.time == T[3] and row.node == individual and row.dest in (0, 1, 2):
                proportions[row.dest] += row.right - row.left
        prop.append([proportions / proportions.sum(), ts])

    # Keep the first replicate with 1-10% AF, more than 30% EU and more than
    # 20% NA ancestry.
    for proportions, ts_af in prop:
        if 0.01 < proportions[0] < 0.1 and proportions[2] > 0.2 and proportions[1] > 0.3:
            break
    else:
        # Previously this fell through and failed later on an unbound `ts_af`.
        raise RuntimeError(
            "no simulated replicate satisfies the required AF/EU/NA ancestry proportions"
        )
    print('We generate mexican individual with Af-Eu-NA proportions = ', proportions)

    ts_af = msprime.sim_mutations(ts_af, rate=1.25e-8, random_seed=4321994)

    return ts_af, proportions


In [None]:

def clean_tracts(tractInit, CUT):
    """Rescale tracts, fuse tracts that touch, and order them by start.

    Works on a copy of ``tractInit`` (rows indexed as ``[start, end]``):
    the values are divided by ``CUT`` and truncated to int; whenever one row
    starts exactly where another row ends, the two are fused into the latter
    and the former is removed; finally the rows are ordered by ascending
    start.

    Returns:
        The processed integer array; ``tractInit`` itself is not modified.
    """
    tract = (np.copy(tractInit) / CUT).astype(int)

    # Fuse one touching pair at a time; the scan restarts after every fusion
    # because deleting a row shifts the indices.
    fused = True
    while fused:
        fused = False
        size = len(tract)
        for tail in range(size):
            for head in range(size):
                if tract[tail, 0] == tract[head, 1]:
                    tract[head, 1] = tract[tail, 1]
                    tract = np.delete(tract, tail, 0)
                    fused = True
                    break
            if fused:
                break

    # Exchange sort on the start column; only the first two columns of a row
    # are swapped.
    size = len(tract)
    for a in range(size):
        for b in range(a + 1, size):
            if tract[a, 0] > tract[b, 0]:
                tract[[a, b], :2] = tract[[b, a], :2]
    return tract

      

# A few helper functions
def connected(m):
    """Return True if some interval in ``m`` ends exactly where the next one starts.

    Only neighbouring entries (in list order) are compared.
    """
    return any(current[1] == following[0] for current, following in zip(m, m[1:]))
        
def remove_one(m):
    """Fuse neighbouring intervals of ``m`` that touch end-to-start.

    Whenever ``m[k][1] == m[k + 1][0]`` the two entries are replaced by a
    single interval running from ``m[k][0]`` to ``m[k + 1][1]``. The list is
    modified in place and the same list object is returned.
    """
    idx = 0
    while idx + 1 < len(m):
        if m[idx][1] == m[idx + 1][0]:
            # Absorb the neighbour and re-check the same slot: the grown
            # interval may now touch the next one as well.
            m[idx][1] = m[idx + 1][1]
            m.pop(idx + 1)
        else:
            idx += 1
    return m


# Input: ts, the population id, the individual (the one being dissected), and the ancestor time
def get_migrating_tracts_ind(ts, pop, ind, T_anc):
    """Collect genome intervals of ``ind`` that migrated into ``pop`` at ``T_anc``.

    Args:
        ts: tree sequence whose migration table is inspected.
        pop: destination population id to match in the migration records.
        ind: node id of the individual being examined.
        T_anc: time of the migration event to match.

    Returns:
        A sorted list of ``[left, right]`` intervals, with touching
        neighbours fused by ``remove_one``.
    """
    migrations = ts.tables.migrations
    tracts = []

    # Every tree corresponds to one stretch of the sequence.
    for tree in ts.trees():
        # Climb from the individual towards the past while the parent is not
        # older than T_anc.
        node = ind
        while tree.time(tree.parent(node)) <= T_anc:
            node = tree.parent(node)

        span = tree.interval
        # Look only at migration rows recorded for the node we stopped at and
        # keep the tree's interval when a matching row fully covers it.
        for row_id in np.where(migrations.node == node)[0]:
            row = migrations[row_id]
            is_target_event = row.time == T_anc and row.dest == pop
            covers_tree = span.left >= row.left and span.right <= row.right
            if is_target_event and covers_tree:
                tracts.append([span.left, span.right])

    # Drop zero-length intervals before fusing the rest.
    tracts = [tract for tract in tracts if tract[0] != tract[1]]

    fused = remove_one(tracts)
    fused.sort()
    return fused



# Do the two intervals intersect? -1 means no
def two_intersection(i1, i2):
    """Intersect two closed intervals given as ``[start, end]``.

    Returns:
        ``-1`` when the intervals do not overlap, otherwise the overlap as
        ``[start, end]`` (zero-length when the intervals merely touch).
    """
    disjoint = i1[0] > i2[1] or i1[1] < i2[0]
    if disjoint:
        return -1
    # For overlapping intervals the common part runs from the later start to
    # the earlier end, whichever way the two intervals are arranged.
    return [max(i1[0], i2[0]), min(i1[1], i2[1])]
    
    
# Find the intersection of two sets of intervals
def intersections(l1 , l2):
    """Return all non-empty pairwise overlaps between two interval lists.

    Every interval of ``l1`` is intersected with every interval of ``l2``
    using ``two_intersection``; disjoint pairs and zero-length overlaps are
    left out. Results are ordered by ``l1`` first, then ``l2``.
    """
    overlaps = (two_intersection(first, second) for first in l1 for second in l2)
    return [piece for piece in overlaps if piece != -1 and piece[0] != piece[1]]

# Does the point p lie inside the interval i?
def point_in_interval(p, i):
    """Return True when ``p`` lies in the closed interval ``[i[0], i[1]]``."""
    return bool(i[0] <= p <= i[1])
    
# Returns the state coloring at each position, the tracts, and [proportions of modern DNA in the European, in the American]

def coloring(ts, ind,  t_mex, t_neand_mig=None):
    """Collect the ancestry tracts of one individual.

    Args:
        ts: tree sequence to inspect.
        ind: node id of the individual.
        t_mex: time of the admixture event into populations 0 (African),
            1 (European) and 2 (Native American).
        t_neand_mig: time of the migration into population 6 (Neanderthal).
            When omitted, the module-level ``t_mig`` is used, which is what
            this function always read before the parameter existed.

    Returns:
        ``(mig_neand, mig_eu, mig_na, mig_af)``: four sorted lists of
        ``[left, right]`` intervals.
    """
    if t_neand_mig is None:
        # Backward compatibility: fall back to the global this function used
        # to depend on implicitly.
        t_neand_mig = t_mig

    mig_neand = get_migrating_tracts_ind(ts, 6, ind, t_neand_mig)  # 6 is the Neanderthal population id
    mig_eu = get_migrating_tracts_ind(ts, 1, ind, t_mex)  # Europeans
    mig_na = get_migrating_tracts_ind(ts, 2, ind, t_mex)  # Native Americans
    mig_af = get_migrating_tracts_ind(ts, 0, ind, t_mex)  # Africans

    for tracts in (mig_eu, mig_na, mig_af, mig_neand):
        tracts.sort()

    return mig_neand, mig_eu, mig_na, mig_af

In [None]:
# Does the point p lie inside the interval i?
def point_in_interval(p, i):
    """Tell whether ``p`` falls inside the closed interval ``[i[0], i[1]]``."""
    lower, upper = i[0], i[1]
    return bool(lower <= p <= upper)
    
    
# Find the intersection of two sets of intervals
def intersections(l1 , l2):
    """Pairwise-intersect two interval lists, keeping only non-empty overlaps.

    Uses ``two_intersection`` for each pair; pairs that do not overlap
    (``-1``) and overlaps of zero length are skipped. The output follows the
    order of ``l1``, then ``l2``.
    """
    candidates = (two_intersection(a, b) for a in l1 for b in l2)
    return [found for found in candidates if found != -1 and found[0] != found[1]]
def coloring_by_snps(sites, mig_neand, mig_eu, mig_na, mig_af):
    """Assign a state code to every site from the ancestry tracts.

    State codes, in order of precedence when several tract sets cover a site:
    4 = African tract, 3 = Neanderthal tract inside a Native American tract,
    1 = Neanderthal tract inside a European tract, 0 = European tract,
    2 = Native American tract, -1 = not covered by any tract.

    Args:
        sites: sequence of site positions.
        mig_neand, mig_eu, mig_na, mig_af: lists of ``[left, right]`` tracts.

    Returns:
        A two-element list ``[sites, states]`` where ``sites`` is the object
        passed in and ``states`` is the list of state codes.
    """
    # Neanderthal-in-European / Neanderthal-in-American segments are the
    # overlaps of the Neanderthal tracts with the respective tract set.
    neand_eu = intersections(mig_neand, mig_eu)
    neand_na = intersections(mig_neand, mig_na)

    def covered(position, tracts):
        return any(point_in_interval(position, tract) for tract in tracts)

    states = []
    for position in sites:
        if covered(position, mig_af):
            state = 4
        elif covered(position, neand_na):
            state = 3
        elif covered(position, neand_eu):
            state = 1
        elif covered(position, mig_eu):
            state = 0
        elif covered(position, mig_na):
            state = 2
        else:
            state = -1
        states.append(state)

    return [sites, states]
            
        
        

In [None]:
# Array of SNP positions
def snps_positions(ts):
    """Return the positions of all variant sites of ``ts`` as a list of ints.

    Each ``variant.site.position`` is truncated with ``int``. An empty list
    is returned when the tree sequence has no variants.

    The previous version returned from inside the conversion loop, so only
    the first position was converted and an empty input yielded ``None``.
    """
    return [int(variant.site.position) for variant in ts.variants()]


In [None]:
def frequences(ts):
    """Compute per-variant genotype sums divided by ``n`` for four sample groups.

    Sample 0 is skipped; the four groups are the consecutive genotype slices
    ``[1:n+1]``, ``[n+1:2n+1]``, ``[2n+1:3n+1]`` and ``[3n+1:4n+1]``, where
    ``n`` is read from module scope.
    NOTE(review): the last slice assumes its group also holds ``n`` samples -
    confirm against how the samples were drawn.

    Returns:
        A numpy array with one row per group and one column per variant.
    """
    freq = []
    print(type(freq))
    group_bounds = [(1 + k * n, 1 + (k + 1) * n) for k in range(4)]
    for variant in ts.variants():
        genotypes = variant.genotypes
        freq.append([float(sum(genotypes[lo:hi])) / n for lo, hi in group_bounds])
    return np.array(freq).transpose()

In [None]:
def emission_probabilities(mu, frq, snps, t, t_africa):
    """Build the emission table for a three-state model.

    For a frequency ``f`` and a factor ``s = exp(-2 * mu * time)`` the pair
    ``[(1 - f) * s + f * (1 - s), f * s + (1 - f) * (1 - s)]`` is produced.
    States 0 and 1 use rows 0 and 1 of ``frq`` with ``time = t``; state 2
    uses row 2 with ``time = t_africa``.

    Args:
        mu: rate multiplied into the exponent.
        frq: 2-D array indexed as ``frq[row, site]``.
        snps: sequence whose length gives the number of sites.
        t, t_africa: times used for the two factors.

    Returns:
        Array of shape ``(len(snps), 3, 2)``; also prints ``len(snps)``.
    """
    keep_recent = math.exp(-mu * 2 * t)
    keep_africa = math.exp(-mu * 2 * t_africa)

    def allele_pair(freq, keep):
        return [(1 - freq) * keep + freq * (1 - keep),
                freq * keep + (1 - freq) * (1 - keep)]

    print(len(snps))

    table = [
        [
            allele_pair(frq[0, site], keep_recent),
            allele_pair(frq[1, site], keep_recent),
            allele_pair(frq[2, site], keep_africa),
        ]
        for site in range(len(snps))
    ]
    return np.array(table)
        
def observations(ts):
    """Return the genotype of sample 0 at every variant as a numpy array.

    Sample 0 is the observed Mexican individual.
    """
    return np.array([variant.genotypes[0] for variant in ts.variants()])

In [None]:

# Module-level model dimensions.
# N is the number of hidden states.
N = 5 #number of hidden states
# K is the number of possible observed allele values (0 or 1).
K = 2 # 0 or 1 alleles

# o is the array of observations (zeros and ones); return the observation at position t
def choose_state(o,t):
    """Return the observation stored at index ``t`` of ``o``."""
    observed = o[t]
    return observed

# This piece returns the transition matrix; d is the distance between SNPs.
# r_m, r_a - the recent and the ancient recombination rates
# p_ee - probability of moving from European to European, p_ea - probability of moving from European to American
# (1 - p_ea - p_ee) - probability of moving from European to African
# p_e_m - probability of being on a modern segment while inside a European segment
# (1 - p_e_m) - probability of being on an ancient segment
def transition_matrix(d, r_a, r_m, p_ee, p_e_m, p_a_m, p_ea, p_ae, p_aa, p_afe, p_afa):
    """Return the 5x5 transition matrix for two sites a distance ``d`` apart.

    State order: 0 = European/modern, 1 = European/ancient,
    2 = American/modern, 3 = American/ancient, 4 = African.

    Args:
        d: distance between the two sites.
        r_a, r_m: ancient and recent recombination rates.
        p_ee, p_ea: probability of going from European to European / American
            after a recent recombination (``1 - p_ee - p_ea`` goes to African).
        p_ae, p_aa: the same from American to European / American.
        p_afe, p_afa: the same from African to European / American.
        p_e_m, p_a_m: probability of being on a modern segment inside a
            European / American segment (the complement is ancient).

    Returns:
        A ``(5, 5)`` float array; every row sums to 1.

    Two entries of row 3 were copy/paste slips in the earlier version
    (``(1 - et_m) * et_m`` instead of ``(1 - et_a) * et_m`` in ``[3, 2]``, and
    the European probabilities instead of the American ones in ``[3, 4]``),
    which stopped that row from summing to 1.
    """
    et_a = math.exp(-r_a * d)
    et_m = math.exp(-r_m * d)

    stay = et_a * et_m          # no recombination of either kind
    jump = 1 - et_m             # a recent recombination happened
    # Ancestry is unchanged but the modern/ancient label is redrawn.
    within_eu = (1 - et_a) * et_m + jump * p_ee
    within_na = (1 - et_a) * et_m + jump * p_aa

    eu_to_na = jump * p_ea
    na_to_eu = jump * p_ae
    eu_to_af = jump * (1 - p_ea - p_ee)
    na_to_af = jump * (1 - p_ae - p_aa)

    tr_mat = np.array([
        # from European/modern
        [stay + within_eu * p_e_m, within_eu * (1 - p_e_m),
         eu_to_na * p_a_m, eu_to_na * (1 - p_a_m), eu_to_af],
        # from European/ancient
        [within_eu * p_e_m, stay + within_eu * (1 - p_e_m),
         eu_to_na * p_a_m, eu_to_na * (1 - p_a_m), eu_to_af],
        # from American/modern
        [na_to_eu * p_e_m, na_to_eu * (1 - p_e_m),
         stay + within_na * p_a_m, within_na * (1 - p_a_m), na_to_af],
        # from American/ancient
        [na_to_eu * p_e_m, na_to_eu * (1 - p_e_m),
         within_na * p_a_m, stay + within_na * (1 - p_a_m), na_to_af],
        # from African
        [jump * p_afe * p_e_m, jump * p_afe * (1 - p_e_m),
         jump * p_afa * p_a_m, jump * p_afa * (1 - p_a_m),
         et_m + jump * (1 - p_afe - p_afa)],
    ], dtype=float)

    return tr_mat

# log-Viterbi algo for our matrices a-> many parameters
def log_Viterbi(o, p,  b, pos):
    """Decode the most likely hidden-state path with the Viterbi algorithm in log space.

    The transition matrix is rebuilt for every pair of neighbouring sites
    from their distance. Its remaining parameters (``r_a``, ``r_m``, ``p_ee``,
    ``p_e_m``, ``p_a_m``, ``p_ea``, ``p_ae``, ``p_aa``, ``p_afe``, ``p_afa``)
    and the number of states ``N`` are read from module scope.

    Args:
        o: observations, indexable by site.
        p: initial state probabilities, indexable by state.
        b: emission table indexed as ``b[site, state, observation]``.
        pos: site positions; ``len(pos)`` is the number of sites.

    Returns:
        A list with one state index per site (empty when ``pos`` is empty).

    Raises:
        ValueError: from ``math.log`` if a required probability is zero.
    """
    M = len(pos)
    if M == 0:
        return []

    delta = np.zeros((N, M))          # best log-score of a path ending in a state
    psi = np.zeros((N, M), dtype=int)  # predecessor state achieving that score

    for i in range(N):
        delta[i, 0] = math.log(b[0, i, choose_state(o, 0)]) + math.log(p[i])

    for t in range(1, M):
        # The transition matrix depends on the distance between the sites.
        a = np.array(transition_matrix(pos[t]-pos[t-1], r_a, r_m, p_ee, p_e_m, p_a_m, p_ea, p_ae, p_aa, p_afe, p_afa))

        for j in range(N):
            scores = [delta[i, t-1] + math.log(a[i, j]) for i in range(N)]
            best = int(np.argmax(scores))
            psi[j, t] = best
            delta[j, t] = scores[best] + math.log(b[t, j, choose_state(o, t)])

    q = [0] * M
    q[M-1] = np.argmax(delta[:, M-1])

    # Backtrack down to and including site 0; stopping at site 1, as the
    # earlier version did, left q[0] stuck at state 0.
    for t in range(M-2, -1, -1):
        q[t] = psi[q[t+1], t+1]

    return q



In [None]:
def emission_probabilities(mu, frq, snps, t, t_africa, t_neand_mig):
    """Build the emission table for the five-state model.

    For a frequency ``f`` and a factor ``s = exp(-2 * mu * time)`` the pair
    ``[(1 - f) * s + f * (1 - s), f * s + (1 - f) * (1 - s)]`` is produced.

    States and the inputs they use:
        0: ``frq[0]`` with ``t``
        1: ``frq[3]`` with ``t_neand_mig``
        2: ``frq[1]`` with ``t``
        3: ``frq[3]`` with ``t_neand_mig``
        4: ``frq[2]`` with ``t_africa``

    State 4 previously used ``t``, so the ``t_africa`` factor was computed
    and never used. It now uses ``t_africa``, matching the three-state
    version of this function, which pairs ``frq[2]`` with ``t_africa``.

    Args:
        mu: rate multiplied into the exponent.
        frq: 2-D array indexed as ``frq[row, site]`` with at least four rows.
        snps: sequence whose length gives the number of sites.
        t, t_africa, t_neand_mig: times used for the three factors.

    Returns:
        Array of shape ``(len(snps), 5, 2)``; also prints ``len(snps)``.
    """
    sc_factor = math.exp(-mu * 2 * t)
    sc_factor2 = math.exp(-mu * 2 * t_africa)
    sc_factor3 = math.exp(-mu * 2 * t_neand_mig)

    def allele_pair(freq, keep):
        # Probabilities of observing allele 0 and allele 1 respectively.
        return [(1 - freq) * keep + freq * (1 - keep),
                freq * keep + (1 - freq) * (1 - keep)]

    theta = frq
    print(len(snps))

    b = [
        [
            allele_pair(theta[0, tt], sc_factor),
            allele_pair(theta[3, tt], sc_factor3),
            allele_pair(theta[1, tt], sc_factor),
            allele_pair(theta[3, tt], sc_factor3),
            allele_pair(theta[2, tt], sc_factor2),
        ]
        for tt in range(len(snps))
    ]

    return np.array(b)