In [None]:
import msprime
import tskit
import random
import numpy as np
import sklearn
import math
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import random 
from random import randint, randrange
from scipy.stats import poisson

In [None]:
# --- Demographic model ---

def history_of_mexicans(len_seq, T, t_mig, N_e, n ):
    """Simulate one admixed ("MX") haplotype together with reference panels.

    The demography, read backwards in time, is:
      * at ``T[3]`` MX is formed as an admixture of AF (5%), EU (50%) and NA (45%);
      * at ``T[2]`` EU and NA merge into OUT_AF;
      * at ``t_mig`` 5% of OUT_AF lineages move into NEAND (Neanderthal introgression);
      * at ``T[1]`` AF and OUT_AF merge into AF_ANC;
      * at ``T[0]`` AF_ANC and NEAND merge into ANCES.

    Forty ancestry replicates are simulated with a fixed seed. For each replicate
    the realised AF/EU/NA ancestry shares of the admixed lineage are measured
    from the migration records at time ``T[3]``; the first replicate whose shares
    satisfy 0.01 < AF < 0.1, EU > 0.3 and NA > 0.2 is kept and mutations are added.

    Parameters
    ----------
    len_seq : sequence length passed to ``msprime.sim_ancestry``.
    T : indexable of four event times ``[T0, T1, T2, T3]`` as described above.
    t_mig : time of the OUT_AF -> NEAND mass migration.
    N_e : initial size used for the ancestral populations OUT_AF, AF_ANC, ANCES.
    n : number of haploid samples taken from each of EU, NA and AF.

    Returns
    -------
    (ts, proportions) : the mutated tree sequence and a length-3 array with the
        realised AF, EU, NA ancestry shares of the admixed lineage.

    Raises
    ------
    RuntimeError
        If none of the replicates satisfies the ancestry-share filter.
    """
    demography = msprime.Demography()

    # The code below (and the callers) address populations by integer id and
    # rely on ids following insertion order: AF=0, EU=1, NA=2, MX=3,
    # OUT_AF=4, AF_ANC=5, NEAND=6, ANCES=7.
    demography.add_population(name="AF", initial_size=500000)
    demography.add_population(name="EU", initial_size=300000)
    demography.add_population(name="NA", initial_size=40000)
    demography.add_population(name="MX", initial_size=40000)

    demography.add_population(name="OUT_AF", initial_size=N_e)
    demography.add_population(name="AF_ANC", initial_size=N_e)

    demography.add_population(name="NEAND", initial_size=30000)
    demography.add_population(name="ANCES", initial_size=N_e)

    demography.add_admixture(time= T[3] , derived="MX", ancestral=["AF", "EU", "NA"], proportions=[0.05, 0.5, 0.45])
    demography.add_population_split(time=T[2], derived=["EU", "NA"], ancestral="OUT_AF")
    demography.add_mass_migration(time=t_mig, source='OUT_AF', dest='NEAND', proportion=0.05)
    demography.add_population_split(time=T[1], derived=["AF", "OUT_AF"], ancestral="AF_ANC")
    demography.add_population_split(time=T[0], derived=["AF_ANC", "NEAND"], ancestral="ANCES")

    replicates = msprime.sim_ancestry(
        samples=[
            msprime.SampleSet(1, ploidy=1, population='MX'),
            msprime.SampleSet(n, ploidy=1, population='EU'),
            msprime.SampleSet(n, ploidy=1, population='NA'),
            msprime.SampleSet(n, ploidy=1, population='AF'),
            msprime.SampleSet(25, ploidy=1, population='NEAND', time=2000),
        ],
        ploidy=1,
        sequence_length=len_seq,
        recombination_rate=2.5e-9,
        demography=demography,
        record_migrations=True,
        random_seed=5289,
        num_replicates = 40
    )

    admixture_time = T[3]
    candidates = []
    for ts in replicates:
        migrations = ts.tables.migrations

        # The admixed lineage is identified by the node of the first
        # admixture-time migration record that goes into AF (id 0);
        # replicates without any such record are skipped.
        focal_node = -1
        for row in migrations:
            if row.time == admixture_time and row.dest == 0:
                focal_node = row.node
                break
        if focal_node == -1:
            continue

        # Total genome length that the focal node sends to AF, EU, NA.
        # A float accumulator avoids truncating the interval lengths.
        tract_length = np.zeros(3)
        for row in migrations:
            if row.time == admixture_time and row.node == focal_node and row.dest in (0, 1, 2):
                tract_length[row.dest] += row.right - row.left
        candidates.append([tract_length / tract_length.sum(), ts])

    ts_af = None
    proportions = None
    for shares, ts in candidates:
        if 0.01 < shares[0] < 0.1 and shares[2] > 0.2 and shares[1] > 0.3:
            ts_af = ts
            proportions = shares
            break
    if ts_af is None:
        raise RuntimeError(
            "No simulated replicate has AF/EU/NA ancestry shares inside the accepted range; "
            "change the seed, the number of replicates or the filter."
        )
    print('We generate mexican individual with Af-Eu-NA proportions = ', proportions)

    ts_af = msprime.sim_mutations(ts_af, rate=1.25e-8, random_seed=4321994, discrete_genome=False)

    return ts_af, proportions

In [None]:

def clean_tracts(tractInit, CUT):
    """Rescale tracts to integer units of ``CUT``, merge touching tracts, sort by start.

    ``tractInit`` is a 2-D array-like whose first two columns are tract start
    and end. The input is not modified; a new integer array is returned.
    """
    scaled = (np.copy(tractInit) / CUT).astype(int)

    # Merge phase: repeatedly take the first (row, other) pair, in row-major
    # scan order, where ``row`` starts exactly where ``other`` ends; extend
    # ``other`` to the end of ``row`` and drop ``row``.
    merged = True
    while merged:
        merged = False
        size = len(scaled)
        for row in range(size):
            for other in range(size):
                if scaled[row, 0] == scaled[other, 1]:
                    scaled[other, 1] = scaled[row, 1]
                    scaled = np.delete(scaled, row, 0)
                    merged = True
                    break
            if merged:
                break

    # Exchange sort on the start coordinate; only start/end columns are swapped.
    size = len(scaled)
    for lo in range(size):
        for hi in range(lo + 1, size):
            if scaled[lo, 0] > scaled[hi, 0]:
                scaled[[lo, hi], :2] = scaled[[hi, lo], :2]
    return scaled

      

# a few helper functions
def connected(m):
    """Return True if some interval in ``m`` ends exactly where the next one starts."""
    return any(m[k][1] == m[k + 1][0] for k in range(len(m) - 1))
        
def remove_one(m):
    """Merge neighbouring intervals that touch end-to-start, in place.

    ``m`` is a list of mutable ``[left, right]`` pairs. Whenever an interval's
    right end equals the next interval's left end, the two are fused. The same
    list object is modified and returned.
    """
    pos = 0
    while pos < len(m) - 1:
        if m[pos][1] == m[pos + 1][0]:
            # Absorb the successor, then re-test the widened interval
            # against its new successor.
            m[pos][1] = m[pos + 1][1]
            m.pop(pos + 1)
        else:
            pos += 1
    return m


# Input: ts, population id, the individual (node) being dissected, ancestor time
def get_migrating_tracts_ind(ts, pop, ind, T_anc):
    """Collect genome intervals of ``ind`` whose ancestor migrated into ``pop`` at ``T_anc``.

    For every tree of ``ts`` the function climbs from node ``ind`` towards the
    root while the parent's time is <= ``T_anc``. It then looks for migration
    records of the node reached that have time exactly ``T_anc``, destination
    ``pop`` and that fully cover the tree's interval; each hit contributes the
    tree's ``[left, right]`` interval.

    Returns a sorted list of ``[left, right]`` pairs in which zero-length
    intervals are removed and adjacent intervals are fused by ``remove_one``.

    NOTE(review): the climb calls ``tree.time(tree.parent(anc_node))`` without
    checking whether ``anc_node`` is already a root; presumably every root in
    these simulations is older than ``T_anc`` — confirm.
    """
    mig = ts.tables.migrations
    migration_int = []

    for tree in ts.trees():  # iterate over all trees; each tree corresponds to one stretch of the genome
        anc_node = ind # start from the requested individual
        while tree.time( tree.parent(anc_node) ) <= T_anc : # walk back in time along the ancestors until the next parent is older than T_anc
            anc_node = tree.parent(anc_node)
        migs = np.where(mig.node == anc_node)[0] # indices of all migration rows that belong to this node

        # scan the migration rows of anc_node and keep those that cover this tree's interval
        for i in migs:

            stroka = mig[i]
            if stroka.time == T_anc and stroka.dest == pop and tree.interval.left >= stroka.left and tree.interval.right <= stroka.right:
                migration_int.append([tree.interval.left, tree.interval.right])

    # drop zero-length intervals
    migration_int2 = []
    for i in range(len(migration_int)):
        if migration_int[i][0] != migration_int[i][1]:
            migration_int2.append(migration_int[i])
    migration_int = migration_int2
    
    # fuse intervals that touch end-to-start, then order by left coordinate
    mi = remove_one(migration_int)
    mi.sort()  

    return mi



# do two intervals intersect? -1 == no
def two_intersection(i1, i2):
    """Return the overlap ``[left, right]`` of two closed intervals, or -1 if disjoint.

    Intervals that merely touch yield a zero-length overlap such as ``[2, 2]``.
    """
    if i2[1] < i1[0] or i2[0] > i1[1]:
        return -1
    # For overlapping intervals every case reduces to the larger of the two
    # starts and the smaller of the two ends.
    return [max(i1[0], i2[0]), min(i2[1], i1[1])]
    
    
# intersection of two collections of intervals
def intersections(l1 , l2):
    """Return every non-empty pairwise overlap between intervals of ``l1`` and ``l2``.

    Pairs are visited with ``l1`` as the outer loop; disjoint pairs and
    zero-length overlaps are skipped.
    """
    overlaps = (two_intersection(first, second) for first in l1 for second in l2)
    return [piece for piece in overlaps if piece != -1 and piece[0] != piece[1]]

# does point p lie inside interval i?
def point_in_interval(p, i):
    """Return True when ``i[0] <= p <= i[1]`` (both ends inclusive), else False."""
    return bool(i[0] <= p <= i[1])
    
# returns the ancestry tracts of one individual, one sorted list per source population
def coloring(ts, ind,  t_mex, t_nd=None):
    """Return the Neanderthal, EU, NA and AF ancestry tracts of node ``ind``.

    Parameters
    ----------
    ts : tree sequence with recorded migrations.
    ind : node id of the individual to analyse.
    t_mex : time of the AF/EU/NA admixture event.
    t_nd : time of the Neanderthal introgression. When omitted, the
        notebook-level variable ``t_mig`` is used, which is what this function
        always did before the argument existed.

    Returns
    -------
    (mig_neand, mig_eu, mig_na, mig_af) : four sorted lists of
        ``[left, right]`` intervals as produced by ``get_migrating_tracts_ind``.

    Population ids are hard-coded: 0 = AF, 1 = EU, 2 = NA, 6 = Neanderthal.
    """
    if t_nd is None:
        # Backward-compatible fallback to the global defined in the parameter cell.
        t_nd = t_mig

    mig_neand = get_migrating_tracts_ind(ts, 6, ind, t_nd)  # Neanderthal
    mig_eu = get_migrating_tracts_ind(ts, 1, ind, t_mex)    # Europeans
    mig_na = get_migrating_tracts_ind(ts, 2, ind, t_mex)    # Native Americans
    mig_af = get_migrating_tracts_ind(ts, 0, ind, t_mex)    # Africans

    for tracts in (mig_eu, mig_na, mig_af, mig_neand):
        tracts.sort()

    return mig_neand, mig_eu, mig_na, mig_af

# converts haplotype-level HMM states into population-level states
def haplo_states_to_eu_am_af_states(m, haplos_per_pop=None, neand_haplos=None):
    """Map haplotype state indices to population labels 0, 1, 2, 3.

    States ``0 .. 3*haplos_per_pop - 1`` are split into three consecutive
    groups of ``haplos_per_pop`` states, labelled 0, 1 and 2; the following
    ``neand_haplos`` states are labelled 3.

    Parameters
    ----------
    m : iterable of integer state indices.
    haplos_per_pop : states per modern population; defaults to the
        notebook-level ``n_haplo``.
    neand_haplos : number of Neanderthal states; defaults to the
        notebook-level ``n_neand``.

    Returns
    -------
    list of int, same length as ``m``.

    Raises
    ------
    ValueError
        If a state lies outside ``0 .. 3*haplos_per_pop + neand_haplos - 1``.
        Such states used to be skipped silently, which shortened the output
        and shifted every later label.
    """
    if haplos_per_pop is None:
        haplos_per_pop = n_haplo
    if neand_haplos is None:
        neand_haplos = n_neand

    n_modern = 3 * haplos_per_pop
    n_states = n_modern + neand_haplos

    labels = []
    for state in m:
        if 0 <= state < n_modern:
            labels.append(int(state // haplos_per_pop))
        elif n_modern <= state < n_states:
            labels.append(3)
        else:
            raise ValueError(
                "state %r is outside the valid range 0..%d" % (state, n_states - 1)
            )
    return labels

def coloring_by_snps_haplo(sites, mig_neand, mig_eu, mig_na, mig_af):
    """Label every SNP position with the ancestry tract that contains it.

    Returns ``[sites, labels]`` where ``sites`` is the object passed in and
    ``labels`` is a list with one entry per site:
    3 = inside a ``mig_neand`` tract, 2 = inside a ``mig_af`` tract (this
    overrides 3), 0 = inside a ``mig_eu`` tract, 1 = inside a ``mig_na`` tract;
    0 and 1 are only assigned to sites that are still unlabelled. Sites in no
    tract keep -1.
    """
    def inside_any(position, tracts):
        return any(point_in_interval(position, tract) == True for tract in tracts)

    labels = []
    for idx in range(len(sites)):
        position = sites[idx]
        label = -1
        if inside_any(position, mig_neand):
            label = 3
        if inside_any(position, mig_af):
            label = 2
        if label == -1 and inside_any(position, mig_eu):
            label = 0
        if label == -1 and inside_any(position, mig_na):
            label = 1
        labels.append(label)

    return [sites, labels]

In [None]:
# choosing haplotypes for the HMM states; it matters that they do not repeat

# returns True if the array contains a repeated value, otherwise False
def repetitions(arr):
    """Return True when at least one element occurs more than once in ``arr``."""
    return any(arr.count(item) > 1 for item in arr)


# m is a list of lists of indices. Returns a list of True/False flags telling,
# for each of the first three sub-lists, whether it contains repeated values.
def repetitions2(m):
    """Return ``[repetitions(m[0]), repetitions(m[1]), repetitions(m[2])]``.

    Only the first three sub-lists are inspected; any further ones are ignored.
    """
    return [repetitions(group) for group in (m[0], m[1], m[2])]

# returns the list of haplotype (sample) indices used as HMM states
def haplotypes_random_numbers(n_haplo, n,n_neand):
    """Pick distinct reference haplotypes for the HMM.

    Sample indices are assumed to be laid out as three consecutive panels of
    ``n`` haplotypes each (indices ``1..n``, ``n+1..2n``, ``2n+1..3n``) followed
    by the Neanderthal haplotypes starting at ``3n+1``; index 0 is left out.

    ``n_haplo`` distinct indices are drawn without replacement from each of the
    three panels. For the Neanderthal panel the candidate range has exactly
    ``n_neand`` indices, so choosing ``n_neand`` distinct ones means taking all
    of ``3n+1 .. 3n+n_neand``.

    Drawing without replacement replaces the earlier draw-and-retry loop, which
    could spin for a very long time when ``n_haplo`` is close to ``n`` and
    which never checked the Neanderthal indices for repeats.

    Returns
    -------
    list of int : the three sorted panel selections followed by the
        Neanderthal indices, flattened into one list of length
        ``3*n_haplo + n_neand``.

    Raises
    ------
    ValueError
        If ``n_haplo`` is larger than ``n`` (raised by ``random.sample``).
    """
    panel_ranges = [range(1, n + 1), range(n + 1, 2 * n + 1), range(2 * n + 1, 3 * n + 1)]
    m = [sorted(random.sample(panel, n_haplo)) for panel in panel_ranges]
    m.append(list(range(3 * n + 1, 3 * n + 1 + n_neand)))

    print('Индексы индивидуумов', m)
    return [index for group in m for index in group]

In [None]:
# array with the genomic position of every SNP
def snps_positions(ts):
    """Return a NumPy array holding ``site.position`` of each variant of ``ts``, in iteration order."""
    return np.array([variant.site.position for variant in ts.variants()])

In [None]:
def observations_haplo_window(ts, L,   hap_nums):
    """Count, per window of length ``L``, the mismatches between sample 0 and each reference haplotype.

    Window ``w`` covers positions ``[w*L, (w+1)*L)``. For every variant the
    absolute genotype difference between sample 0 and each sample listed in
    ``hap_nums`` is added to the window that contains the variant's position.

    Windows are addressed by ``position // L``, so a window without any SNP
    gets a row of zeros and row ``w`` always refers to window ``w``. Advancing
    the boundary by one step per SNP, as was done before, shifted every window
    after a gap.

    Parameters
    ----------
    ts : object exposing ``variants()``; each variant needs ``site.position``
        and ``genotypes``.
    L : window length, in the same units as the positions.
    hap_nums : sample indices of the reference haplotypes.

    Returns
    -------
    Integer array of shape ``(n_windows, len(hap_nums))`` where ``n_windows``
    is the index of the window containing the last SNP plus one; shape
    ``(0, len(hap_nums))`` when there are no variants.
    """
    reference = np.asarray(hap_nums, dtype=int)
    n_refs = len(reference)

    per_window = {}
    for v in ts.variants():
        genotypes = np.asarray(v.genotypes, dtype=np.int64)
        mismatches = np.abs(genotypes[reference] - genotypes[0])
        window = int(v.site.position // L)
        if window in per_window:
            per_window[window] += mismatches
        else:
            per_window[window] = mismatches

    if not per_window:
        return np.zeros((0, n_refs), dtype=np.int64)

    counts = np.zeros((max(per_window) + 1, n_refs), dtype=np.int64)
    for window, total in per_window.items():
        counts[window] = total
    return counts
  

In [None]:
def emission_haplo_window(o, L, t_mx,  t_nd, mu, n_haplo, n_nd):
    """Build the emission matrix of the windowed haplotype HMM.

    Row ``s`` holds the Poisson probabilities of observing ``0 .. max(o)``
    mismatches in one window. The first ``3*n_haplo`` rows use rate
    ``mu * L * t_mx``; the remaining ``n_nd`` rows use rate ``mu * L * t_nd``.

    Returns an array of shape ``(3*n_haplo + n_nd, max(o) + 1)``.
    """
    # the largest observed count fixes the number of columns
    largest = np.max(np.array(o))
    counts = np.arange(largest + 1)

    modern_row = poisson.pmf(k=counts, mu=mu * L * t_mx)
    neand_row = poisson.pmf(k=counts, mu=mu * L * t_nd)

    rows = [modern_row] * (3 * n_haplo) + [neand_row] * n_nd
    return np.array(rows)
    

In [None]:
def choose_state(o,t,i):
    """Return the entry of ``o`` at row ``t``, column ``i``."""
    row = o[t]
    return row[i]

def log_Viterbi(o, p,  b, a):
    """Most likely state path of the HMM, computed in log space.

    Parameters
    ----------
    o : array-like of shape ``(M, N)`` with integer observations; ``o[t][s]``
        is the observation scored against state ``s`` at step ``t``.
    p : length-``N`` initial state probabilities.
    b : ``(N, K)`` emission probabilities, ``b[s, k] = P(observation k | state s)``.
    a : ``(N, N)`` transition probabilities, ``a[i, j] = P(j at t | i at t-1)``.

    Returns
    -------
    list of ``M`` ints : the decoded state for every step.

    Notes
    -----
    * The number of states is taken from ``p`` rather than from notebook globals.
    * Backtracking runs down to step 0; stopping at step 1 left the first
      state fixed at 0.
    * Zero probabilities become ``-inf`` scores instead of raising.

    Raises
    ------
    ValueError
        If ``o`` does not provide one observation per state at every step.
    """
    obs = np.asarray(o)
    n_steps = len(obs)
    if n_steps == 0:
        return []

    with np.errstate(divide='ignore'):
        # Work in float64 so float32 inputs are scored with double precision.
        log_p = np.log(np.asarray(p, dtype=float))
        log_b = np.log(np.asarray(b, dtype=float))
        log_a = np.log(np.asarray(a, dtype=float))

    n_states = log_p.shape[0]
    if obs.ndim != 2 or obs.shape[1] != n_states:
        raise ValueError(
            "o must have shape (M, %d): one observation per state at every step" % n_states
        )
    states = np.arange(n_states)

    delta = np.empty((n_states, n_steps))
    psi = np.zeros((n_states, n_steps), dtype=int)

    delta[:, 0] = log_b[states, obs[0]] + log_p

    for t in range(1, n_steps):
        # scores[i, j]: best log-probability of being in i at t-1 and moving to j
        scores = delta[:, t - 1][:, None] + log_a
        psi[:, t] = np.argmax(scores, axis=0)
        delta[:, t] = scores[psi[:, t], states] + log_b[states, obs[t]]

    path = [0] * n_steps
    path[-1] = int(np.argmax(delta[:, -1]))
    for t in range(n_steps - 2, -1, -1):
        path[t] = int(psi[path[t + 1], t + 1])

    return path


In [None]:
def tr_mat_4x4(r,L,t_nd, t_mx, p_eu, p_na, p_nd):
    """Return the 4x4 float32 transition matrix between the four ancestry states.

    Row and column order is 0, 1, 2, 3. Off-diagonal entries are products of a
    time (``t_mx`` or ``t_nd``), ``r``, ``L`` and a proportion term; each
    diagonal entry is one minus the other three entries of its row.
    """
    to_0 = t_mx * r * L * p_eu * (1-p_nd)
    to_1 = t_mx * r * L * p_na * (1-p_nd)
    to_2 = t_mx * r * L * (1-p_eu-p_na)
    to_3 = t_nd * r * L * (p_eu+p_na)* p_nd

    # Row 3 uses t_nd for its moves into states 0 and 1.
    leaving = [
        [None, to_1, to_2, to_3],
        [to_0, None, to_2, to_3],
        [to_0, to_1, None, to_3],
        [t_nd * r * L * p_eu * (1-p_nd), t_nd * r * L * p_na * (1-p_nd), to_2, None],
    ]

    a = np.zeros((4, 4), dtype='f')
    for src in range(4):
        stay = 1
        for dst in range(4):
            if dst == src:
                continue
            a[src][dst] = leaving[src][dst]
            # subtract the stored (float32) value, column by column
            stay = stay - a[src][dst]
        a[src][src] = stay

    return a

def transition_matrix_haplo(r, L, N_haplo, n_neand, t_nd, t_mx, p_eu, p_na, p_nd):
    """Expand the 4x4 ancestry transition matrix to haplotype-level states.

    States are ordered as three consecutive groups of ``N_haplo`` haplotypes
    (ancestry 0, 1, 2) followed by ``n_neand`` Neanderthal haplotypes
    (ancestry 3). The probability of moving from ancestry ``i`` to ancestry
    ``j`` is taken from ``tr_mat_4x4`` and shared equally among the haplotypes
    of the destination group, so every row sums to the corresponding row sum
    of the 4x4 matrix.

    The Neanderthal -> modern entries are divided by ``N_haplo``, the size of
    the destination group. Dividing by ``n_neand``, as was done before, made
    those rows sum to something other than 1 whenever ``N_haplo != n_neand``.

    Returns a ``(3*N_haplo + n_neand, 3*N_haplo + n_neand)`` float array.
    """
    AA = tr_mat_4x4(r,L,t_nd, t_mx, p_eu, p_na, p_nd)

    n_modern = 3 * N_haplo
    n_states = n_modern + n_neand
    tr_mat_haplo = np.zeros((n_states, n_states))

    def group(k):
        # Slice of the states that belong to ancestry k.
        if k < 3:
            return slice(k * N_haplo, (k + 1) * N_haplo)
        return slice(n_modern, n_states)

    group_size = [N_haplo, N_haplo, N_haplo, n_neand]
    for src in range(4):
        for dst in range(4):
            # Empty groups have no entries to fill (and would divide by zero).
            if group_size[src] > 0 and group_size[dst] > 0:
                tr_mat_haplo[group(src), group(dst)] = AA[src][dst] / group_size[dst]

    return tr_mat_haplo

In [None]:
def initial(n_haplo, n_neand):
    """Return the float32 initial-state distribution of the haplotype HMM.

    Each of the first ``3*n_haplo`` states gets ``0.25 / n_haplo``; each of the
    following ``n_neand`` states gets ``0.25 / n_neand``.
    """
    n_modern = 3 * n_haplo
    start = np.zeros(n_modern + n_neand, dtype='f')
    if n_modern > 0:
        start[:n_modern] = 0.25 / n_haplo
    if n_neand > 0:
        start[n_modern:] = 0.25 / n_neand
    return start


In [None]:
def vit_st_window_to_snp_st(snps, L, vit_states_window):
    """Give every SNP the decoded state of the window that contains it.

    Window ``w`` covers positions ``[w*L, (w+1)*L)``; the window of a SNP is
    ``position // L``. Computing the index directly keeps SNPs aligned with
    their window even when one or more windows between two SNPs are empty,
    which stepping the index by one per SNP did not.

    Parameters
    ----------
    snps : iterable of SNP positions.
    L : window length.
    vit_states_window : sequence with one state per window.

    Returns
    -------
    list with one state per SNP, in the order of ``snps``.

    Raises
    ------
    IndexError
        If a SNP falls in a window for which no state was supplied.
    """
    n_windows = len(vit_states_window)
    per_snp = []
    for position in snps:
        window = int(position // L)
        if window >= n_windows:
            raise IndexError(
                "SNP at position %r lies in window %d but only %d window states were given"
                % (position, window, n_windows)
            )
        per_snp.append(vit_states_window[window])
    return per_snp

# quick sanity check on a toy example; expected output: [2, 2, 3, 3, 2, 2, 5, 5]
print(vit_st_window_to_snp_st([5,6,15,19,21,24,35,36], 10, [2,3,2,5]))

In [None]:
# Demographic history and true ancestry tracts of the focal individual.
# This cell sets the model parameters, runs the simulation and extracts, from
# the recorded migrations, the tracts inherited through each source population.

time_units = 1000 / 25 # number of generations per 1000 years (25 years per generation)
# Event times in generations, consumed by history_of_mexicans:
#   T[0] AF_ANC/NEAND split, T[1] AF/OUT_AF split, T[2] EU/NA split, T[3] MX admixture.
T = [600 * time_units, 70 * time_units, 30 * time_units,  time_units] 
t_mig = 50 * time_units # time of the OUT_AF -> NEAND mass migration (Neanderthal introgression)

r = 2.5e-9 # recombination rate used by the HMM; same value is hard-coded in history_of_mexicans
mu=1.25e-8 # mutation rate used by the HMM; same value is hard-coded in history_of_mexicans
N_e = 50000 # initial size of the ancestral populations OUT_AF, AF_ANC and ANCES
n = 250 # haploid samples taken from each of EU, NA and AF
ind_number = 0 # node id of the individual analysed below; presumably the single MX sample, listed first in the sample sets — confirm
len_sequence = 12e7 # simulated sequence length
t_mex = T[3] # time of the MX admixture

ts, proportions = history_of_mexicans(len_sequence, T, t_mig, N_e, n)

# True tracts per source population. The second argument is the integer population id:
# 6 = NEAND, 1 = EU, 2 = NA, 0 = AF (assumes ids follow the order in which the
# populations are added in history_of_mexicans).
tractsND = get_migrating_tracts_ind(ts, 6, ind_number, t_mig)
tractsEU = get_migrating_tracts_ind(ts,1, ind_number, t_mex)
tractsAS = get_migrating_tracts_ind(ts,2, ind_number, t_mex)
tractsAF =get_migrating_tracts_ind(ts,0, ind_number, t_mex)

In [None]:
# --- HMM configuration: replace every `...` below before running this cell ---
n_haplo = ...  # reference haplotypes drawn from each of the three modern panels; at most n
N_haplo = ...  # must equal n_haplo (the transition matrix is sized with N_haplo, emissions and initial distribution with n_haplo)
n_neand = ...  # Neanderthal reference haplotypes; at most 25, the number sampled in history_of_mexicans
L = ...        # window length, in the same units as the SNP positions

# Fail early with a clear message instead of an obscure error further down.
if any(setting is Ellipsis for setting in (n_haplo, N_haplo, n_neand, L)):
    raise ValueError("Set n_haplo, N_haplo, n_neand and L before running this cell.")
if N_haplo != n_haplo:
    raise ValueError("N_haplo must be equal to n_haplo.")

haplotypes_numbers = haplotypes_random_numbers(n_haplo, n, n_neand)
O = observations_haplo_window(ts, L,   haplotypes_numbers)


In [None]:
#detecting only neanderthal states

b = emission_haplo_window(O, L, t_mex,  t_mig, mu, n_haplo, n_neand)
p=initial(n_haplo,n_neand)
A = transition_matrix_haplo(r,L,N_haplo, n_neand, t_mig, t_mex, 0.5, 0.45, 0.05)  
vit_states_window = log_Viterbi(O, p,  b, A)
states_window = haplo_states_to_eu_am_af_states(vit_states_window)
state_vits_snp = vit_st_window_to_snp_st(snps, L, states_window)


real_states = coloring_by_snps_haplo(snps, tractsND, tractsEU, tractsAS, tractsAF)[1]
print(sklearn.metrics.classification_report(real_states,state_vits_snp,  digits=5))
print(sklearn.metrics.confusion_matrix(real_states,state_vits_snp))