In [1]:
import numpy as np
import pandas as pd
import pysam
import random
from convert import convert_hap_samples_to_dataframe
import os
import math
from copy import deepcopy
import numba
import os
import rpy2
import rpy2.robjects as robjects
from rpy2.robjects import numpy2ri
# from rpy2.robjects.packages import importr

import warnings
warnings.filterwarnings('ignore')

def get_file(data_path):
    '''从文件夹中，得到各文件的路径'''
    for filename in os.listdir(data_path):
        if filename.endswith('.gz'):
            hap_file = os.path.join(data_path,filename)
        if filename.endswith('.sample'):
            samples_file = os.path.join(data_path,filename)
        if filename.endswith('.vcf'):
            vcf_file = os.path.join(data_path,filename)
        if filename.endswith('.ped'):
            ped_file = os.path.join(data_path,filename)
    assert (hap_file and samples_file and vcf_file and ped_file),'missing data file'
    return  hap_file,samples_file,vcf_file,ped_file

In [2]:
def calcul_XY(df_select):
    '''遍历选择的位点计算X,Y和c, 返回logX,logY and c'''
    log_mu = -8 # 默认突变概率为1e-8
    log2 = np.log10(2)

    n = df_select.shape[0]
    log_x = 0
    log_y = 0
    c = []
    
    for i in range(n):
        father = df_select['faGT'][i]
        mother = df_select['moGT'][i]
        child = df_select['chGT'][i]
        if child == (0,0):
            if mother == (0,1) or mother == (1,0):
                log_y = log_y - log2
                c.append(0)
                if father == (0,1) or father == (1,0):
                    log_x = log_x - 2*log2
                elif father == (0,0):
                    log_x = log_x - log2
                else:
                    log_x = log_x + log_mu - log2

            elif mother == (0,0):
                c.append(0)
                if father == (0,1) or father == (1,0):
                    log_x = log_x - log2
                elif father == (0,0):
                    pass
                else:
                    log_x = log_x + log_mu

            else:
                log_y = log_y + log_mu
                c.append(0)
                if father == (0,1) or father == (1,0):
                    log_x = log_x + log_mu -log2
                elif father == (0,0):
                    log_x = log_x + log_mu
                else:
                    log_x = log_x + 2*log_mu

        elif child == (0,1) or child == (1,0):
            if mother == (0,1) or mother == (1,0):
                assert 1==0,'mother and child are both heteGT'

            elif mother == (0,0):
                c.append(1)
                if father == (0,1) or father == (1,0):
                    log_x = log_x - log2
                elif father == (0,0):
                    log_x = log_x + log_mu
                else:
                    pass

            else:
                c.append(0)
                if father == (0,1) or father == (1,0):
                    log_x = log_x - log2
                elif father == (0,0):
                    pass
                else:
                    log_x = log_x + log_mu

        else:
            if mother == (0,1) or mother == (1,0):
                log_y = log_y - log2
                c.append(1)
                if father == (0,1) or father == (1,0):
                    log_x = log_x - 2*log2
                elif father == (0,0):
                    log_x = log_x + log_mu - log2
                else:
                    log_x = log_x -log2

            elif mother == (0,0):
                log_y = log_y + log_mu
                c.append(1)
                if father == (0,1) or father == (1,0):
                    log_x = log_x + log_mu - log2
                elif father == (0,0):
                    log_x = log_x + 2* log_mu
                else:
                    log_x = log_x + log_mu

            else:
                c.append(1)
                if father == (0,1) or father == (1,0):
                    log_x = log_x - log2
                elif father == (0,0):
                    log_x = log_x + log_mu
                else:
                    pass
    c = np.array(c)
    return log_x,log_y,c

In [3]:
def calcul_pi_ind(df_select,log_x,log_y,c,c_rand):
    '''计算独立情况的亲权系数'''
    f0 = df_select['f0'].values
    pr_ind = (np.log10(np.where(c==0,f0,1-f0))).sum() # type: ignore
    pr_ind_rand = (np.log10(np.where(c_rand==0,f0,1-f0))).sum() # type: ignore
    return log_x - log_y - pr_ind, pr_ind_rand

In [4]:
@numba.jit(nopython=True)
def freq2(x,y):
    '''返回bool列表,True表示x,y对应位置同时为0'''
    return (x==0)&(y==0)

def calcul_fij(df_data,df_select,n):
    num_row = df_data.shape[0]
    fi =  df_select['f0'].values
    locus_list = df_select.index.to_list()
    fij = np.zeros((n,n))
    for i in range(n):  # 因为fij对称，只遍历上三角矩阵
        col1 = df_data[locus_list[i]].to_numpy().astype(int)   # 这里类型的变换，是为了使用numba加速
        for j in range(i+1,n):
            col2 = df_data[locus_list[j]].to_numpy().astype(int)
            fij[i,j] = freq2(col1,col2).sum()/num_row  # 统计两列数据同时为０的概率
            fij[j,i] = fij[i,j]
    for i in range(n):
        fij[i,i] = fi[i]
    return fij

def cMat(fi,fij,n):
    cmat = fij - fi*(fi.reshape(n,1))
    print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
    cmat = np.where(cmat>1e-5,cmat,0)
    return cmat


In [5]:
def inv_eig(cmat):
    D,V = np.linalg.eig(cmat)
    lambda_inv= np.diag(list(map(lambda x: x.real/(0.001+x.real**2), D)))
    inv_C2 = np.dot(np.dot(V, lambda_inv), V.T).real
    return inv_C2

def invc_glassoR(cmat):
    np.savetxt('/home/anran/paternity/version4/cmat.txt',cmat)
    np.savetxt('/home/anran/paternity/version4/invc.txt',inv_eig(cmat))
    robjects.r('.libPaths("/home/anran/miniconda3/lib/R/library")')
    robjects.r.source("/home/anran/paternity/version4/invc.R",encoding = "UTF-8")  # type: ignore
    invc = pd.read_csv("/home/anran/paternity/version4/invc_glasso.csv",sep=' ').values
    robjects.r('rm(list = ls())')  # 清除R中所有环境变量
    # if(os.path.isfile("/home/anran/paternity/version4/cmat.txt")):
    #     os.remove("/home/anran/paternity/version4/cmat.txt")
    #     os.remove("/home/anran/paternity/version4/invc.txt")
    #     os.remove("/home/anran/paternity/version4/invc_glasso.csv")
    return invc


def calcul_e(invc,fi,n):
    res = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            a = 2*fi[i]*fi[j]
            delta = 1-4*a*invc[i, j]
            if delta<0 or fi[i]==0 or fi[j]==0:
                res[i,j] = -invc[i,j]
            elif delta==0:
                res[i,j] =  -1/(2*a)
            else:
                s1 = (-1+math.sqrt(delta))/(2*a)
                s2 = (-1+math.sqrt(delta))/(2*a)
                if abs(s1-invc[i,j])>abs(s2-invc[i,j]):  # 当有2实根，选择距invc较近的根
                    res[i,j] = s2
                else:
                    res[i,j] = s1
    return res


def margin(fi,emat,n):
    e = deepcopy(emat)
    for i in range(n):  # h计算时，对　j!=i　的eij项操作
        e[i,i] = 0
    h = np.arctanh(fi) + (e**2 * fi.reshape(n,1) * (1-fi**2)).sum(axis=1) - (e*fi).sum(axis=1)
    return h


In [6]:
def get_b(k):
    '''根据k生成b, 维度为(2^k,k), 一行是一个样本'''
    num_rows = 2**k  # 矩阵b的行数
    b = np.zeros((num_rows, k), dtype=int)
    for i in range(num_rows):
        binary_representation = format(i, '0' + str(k) + 'b')  # 将整数i转换为k位的二进制字符串
        for j in range(k):
            b[i, j] = int(binary_representation[j])
    return b

def select_emat(seq,j,emat,k):
    '''对于第j个位点, 只考虑它及其前面的k-1个位点'''
    s = np.tile(seq,(k,1))
    mask = np.logical_not(s ^ s.T)
    mask = np.where(mask==0,-1,mask)  # mask的维度是(k,k)
    return np.triu(emat[j-k+1:j+1,j-k+1:j+1]*mask,1)

def get_addition(j,k,b,e,h):
    '''返回E向量, 维度(2**k,)'''
    coef = np.zeros(2**k)
    for i in range(2**k):
        seq = b[i,:]
        mask = np.where(np.logical_not(seq[-1]^seq[:-1]),1,-1)
        coef[i] = (e[j,j-k+1:j] * mask).sum() + h[j]*(-1)**seq[-1]
    return coef

def calcul_Z(e,h,k,n):
    z = np.zeros((2**k,n-k+1))
    b = get_b(k)

    for i in range(2**k):  # 计算ｚ的第一列数据z[0]
        seq = b[i,:]
        z[i,0] = (h[:k]*np.where(seq==0,1,-1)).sum()+select_emat(seq,k-1,e,k).sum()
    
    for l in range(1,n-k+1): # 循环计算z[1]到z[n-k], l指向z的index, 对应e/h的index为 l+(k-1)
        coef = get_addition(l+k-1,k,b,e,h)
        for i in range(2**k):
            seq = b[i,:]
            l0 = (i-seq[-1]) // 2  # 二进制数右移移位，最高位为0
            l1 = (l0 + 2**(k-1)) # 二进制数右移移位，最高位为１
            z0 = min(z[l0,l-1],z[l1,l-1])
            z1 = max(z[l0,l-1],z[l1,l-1])
            if z1-z0>200:
                z[i,l] = coef[i] + z1
            else: 
                z[i,l] = coef[i] + z0 + np.log(np.exp(1)+np.exp(z1-z0))
    z_min = z[:,-1].min()
    log10_z = np.log10(np.exp(z[:,-1]-z_min).sum()) + np.log10(np.exp(1))*z_min
    return log10_z


# def calcul_num(c, e, h, n, k):
#     '''输入e维度N*N
#     返回10的指数部分'''
#     nums = np.zeros(n-k+1)

#     nums[0] = (h[:k]*np.where(c[:k]==0,1,-1)).sum()+select_emat(c[:k],k-1,e,k).sum()

#     for j in range(k,n):
#         l = j-k+1  # nums中的index
#         mask = np.where(np.logical_not(c[j]^c[j-k+1:j]),1,-1)  # 使用c_j与其前k-1个位点计算mask
#         coef = (e[j,j-k+1:j] * mask).sum() + h[j]*(-1)**c[j]
#         nums[l] = coef + nums[l-1]
#     return nums[-1]*np.log10(np.exp(1))
def calcul_num(c, e, h, n, k):
    '''输入e维度N*N
    返回10的指数部分'''
    nums = np.zeros(n-k+1)
    sum_h = (h*np.where(c==0,1,-1)).sum()

    nums[0] = select_emat(c[:k],k-1,e,k).sum()

    for j in range(k,n):
        l = j-k+1  # nums中的index
        mask = np.where(np.logical_not(c[j]^c[l:j]),1,-1)  # 使用c_j与其前k-1个位点计算mask
        coef = (e[j,l:j] * mask).sum()
        nums[l] = coef + nums[l-1]
    return (nums[-1]+sum_h)*np.log10(np.exp(1))

def calcul_pi_nind(i,j,df_select, df_data, n, k, log_x, log_y, c,c_rand):
    fi =  df_select['f0'].values
    fij = calcul_fij(df_data,df_select,n)

    cmat = cMat(fi,fij,n)  
    print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
    invc = invc_glassoR(cmat)
    print(invc[0,:k+2])

    if (np.abs(invc)).max()>20:
        save_locus = df_select["f0"]
        file_name = f'/home/anran/paternity/version4/locus_{i}{j}.csv'
        save_locus.to_csv(file_name)
        assert 1==0,'invc max value too large'


    emat = calcul_e(invc,fi,n) 
    h = margin(fi,emat,n)  
    log_num = calcul_num(c,emat,h,n,k)
    log_num_rand = calcul_num(c_rand,emat,h,n,k)
    print('\t log_num:',log_num)
    log_z = calcul_Z(emat,h,k,n)
    print('\t log_z,:',log_z)

    pi_nind = log_x-log_y-log_num + log_z

    return pi_nind,log_num_rand


In [7]:
def select_locus(vcf, n, father, mother, child):
    '''从vcf中随机抽出父母和孩子n行的数据'''
    df = pd.DataFrame(columns=['locus','moGT','faGT', 'chGT','f0'])
    for record in vcf:   # 先将所有数据读进df
        locus = (record.chrom,str(record.pos))
        f0 = 1-record.info['AF'][0]
        if 0.2<f0<0.8:  # 除去f0=1的情况，不然会导致之后 arctanh(1)=inf
            mo = record.samples[mother]['GT']
            ch = record.samples[child]['GT']
            if (mo!=(0,1) and mo!=(1,0)) or (ch!=(0,1) and ch!=(1,0)):
                fa = record.samples[father]['GT']
                df.loc[len(df.index)] = [locus,mo,fa,ch,f0] # type: ignore
    num_row = df.shape[0]
    assert num_row>=n,'not enough locus to select'
    df_select = df.sample(n,replace=False,axis=0)
    df_select.set_index('locus',drop=True,inplace=True)
    return df_select

In [9]:
data_path = "/home/anran/paternity/family-data/sim_seg1" 
hap_file,samples_file,vcf_file,ped_file=get_file(data_path)
persons = pd.read_table(ped_file)
df_data = convert_hap_samples_to_dataframe(hap_file,samples_file)
df_data.drop(['ID','REF','ALT'],axis=0,inplace=True)

In [9]:
N=1000
K=10

- 计算互信息

In [45]:
def calcul_MI(fi,fij):
    n = fi.shape[0]
    mi = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            if i==j:
                mi[i,j] = 0
            else:
                fi0 = fi[i]
                fi1 = 1-fi0
                fj0 = fi[j]
                fj1 = 1-fj0
                fij00 = fij[i,j]
                fij01 = fi0-fij00
                fij10 = fj0 - fij00
                fij11 = 1 + fij00 - fi0 - fj0
                mi[i,j] = fij00*np.log(fij00/(fi0*fj0)) + fij01*np.log(fij01/(fi0*fj1)) + fij10*np.log(fij10/(fi1*fj0)) + fij11*np.log(fij11/(fi1*fj1))
                # mi00 = fij[i,j]*np.log(fij[i,j]/(fi[i]*fi[j]))
                # mi01 = (fi[i]-fij[i,j])*np.log((fi[i]-fij[i,j])/(fi[i]*(1-fi[j]))) 
                # mi10 = (fi[j]-fij[i,j])*np.log((fi[j]-fij[i,j])/((1-fi[i])*fi[j]))
                # mi11 = (1+fij[i,j]-fi[i]-fi[j])*np.log((1+fij[i,j]-fi[i]-fi[j])/((1-fi[i])*(1-fi[j])))
                # mi[i,j] = mi00+mi01+mi10+mi11
                # print(mi00,mi01,mi10,mi11,mi[i,j])
        #     break
        # break
    return mi

In [11]:
fi = np.load('/home/anran/paternity/version4/hypara/fi.npy')
fij = np.load('/home/anran/paternity/version4/hypara/fij.npy')

In [46]:
mi = calcul_MI(fi,fij)

In [58]:
mask_nan = np.where(np.isnan(mi))

In [67]:
mask_nan[0][0],mask_nan[1][0]

(6, 761)

In [65]:
mi[6,761]

nan

In [68]:
i=6
j=761
fi0 = fi[i]
fi1 = 1-fi0
fj0 = fi[j]
fj1 = 1-fj0
fij00 = fij[i,j]
fij01 = fi0-fij00
fij10 = fj0 - fij00
fij11 = 1 + fij00 - fi0 - fj0
print(fi0,fj0,fij00,fij01,fij10,fij11)

0.7710810005664825 0.7687380015850067 0.5398188632104934 0.2312621373559891 0.22891913837451328 -1.3894099581968078e-07


- 调整glasso超参，压缩h到0~1

In [32]:
vcf = pysam.VariantFile(vcf_file)
df_select = select_locus(vcf,N,persons.father[0],persons.mother[0],persons.child[0])
print(persons.father[0],persons.mother[0],persons.child[0])
log_x,log_y,c = calcul_XY(df_select)

HG00099 HG00097 child_HG00099_HG00097


In [33]:
fi =  df_select['f0'].values
fij = calcul_fij(df_data,df_select,N)

In [34]:
np.save('/home/anran/paternity/version4/hypara/fi.npy',fi)
np.save('/home/anran/paternity/version4/hypara/fij.npy',fij)

In [47]:
fi = np.load('/home/anran/paternity/version4/hypara/fi.npy')
fij = np.load('/home/anran/paternity/version4/hypara/fij.npy')

In [11]:
def cMat(fi,fij,n):
    cmat = fij - fi*(fi.reshape(n,1))
    print('c的最小特征值',np.min(np.linalg.eigvals(cmat)))
    cmat = np.where(cmat>1e-3,cmat,0)
    return cmat

In [12]:
def freq_pc(fi,fij,pc):
    #fij = calcul_fij(df_data,locus_list,fi,n)
    return (1-pc)*fi + pc/2,(1-pc)*fij + pc/4

In [13]:
import subprocess

In [None]:
df = pd.DataFrame(columns=['fi_pc','rho1','rho2','diag','e_min','e_max','e_var','h_min','h_max','argmax_h','h_var'])
for pc in [1e-3,1e-2,5e-2,1e-1]: # c中最小非零阈值
    print("-----------------------------alpha=",pc)
    fi2,fij2 = freq_pc(fi,fij,pc)
    cmat = cMat(fi2,fij2,N)
    c_path = '/home/anran/paternity/version4/hypara/cmat.txt'
    invc_path = '/home/anran/paternity/version4/hypara/invc.txt'
    np.savetxt(c_path,cmat)
    np.savetxt(invc_path,inv_eig(cmat))
    for rho1 in [0.1,0.2,0.5]:  # 对角线上罚项系数
        for rho2 in [1e-3,5e-3,1e-2,2e-2,5e-2]:    # 类对角线罚项系数
            print('-------------------rho1,rho2=',rho1,rho2) 
            r_path = f"/home/anran/paternity/version4/invc.R"
            r_command = ['Rscript',r_path]
            r_command.extend(map(str,[c_path, invc_path, rho1, rho2]))
            subprocess.run(r_command,check=True)
            invc = pd.read_csv("/home/anran/paternity/version4/hypara/invc_glasso.csv",sep=' ').values
            diag = ((invc-np.diag(np.diagonal(invc)))==0).all()  # 判断逆矩阵是否为对角矩阵
            emat = calcul_e(invc,fi2,N) 
            h = margin(fi2,emat,N)
            print('h最大值',np.argmax(h))
            # print('\t emat min,max,var:',emat.min(),emat.max(),np.var(emat))
            print('\t h,min,max,var:',h.min(),h.max(),np.var(h))
            df = df._append({'fi_pc':pc,
                             'rho1':rho1,
                             'rho2':rho2,
                             'diag':diag,
                             'e_min':emat.min(),
                             'e_max':emat.max(),
                             'e_var':np.var(emat),
                             'h_min':h.min(),
                             'h_max':h.max(),
                             'argmax_h':np.argmax(h),
                             'h_var':np.var(h)},ignore_index=True)
df.to_csv('/home/anran/paternity/version4/hypara/result4.csv')

In [14]:
df = pd.DataFrame(columns=['diag','e_min','e_max','e_var','h_min','h_max','argmax_h','h_var'])
pc = 0.001
rho1 = 0.5
rho2 = 0.1
for i in range(3):
    vcf = pysam.VariantFile(vcf_file)
    df_select = select_locus(vcf,N,persons.father[0],persons.mother[0],persons.child[0])
    log_x,log_y,c = calcul_XY(df_select)
    fi0 =  df_select['f0'].values
    fij0 = calcul_fij(df_data,df_select,N)
    fi,fij = freq_pc(fi0,fij0,pc)

    cmat = cMat(fi,fij,N)
    c_path = '/home/anran/paternity/version4/hypara/cmat.txt'
    invc_path = '/home/anran/paternity/version4/hypara/invc.txt'
    np.savetxt(c_path,cmat)
    np.savetxt(invc_path,inv_eig(cmat))

    r_path = f"/home/anran/paternity/version4/invc.R"
    r_command = ['Rscript',r_path]
    r_command.extend(map(str,[c_path, invc_path, rho1, rho2]))
    subprocess.run(r_command,check=True)
    invc = pd.read_csv("/home/anran/paternity/version4/hypara/invc_glasso.csv",sep=' ').values
    diag = ((invc-np.diag(np.diagonal(invc)))==0).all()  # 判断逆矩阵是否为对角矩阵
    emat = calcul_e(invc,fi,N) 
    h = margin(fi,emat,N)
    print('h最大值',np.argmax(h))
    # print('\t emat min,max,var:',emat.min(),emat.max(),np.var(emat))
    print('\t h,min,max,var:',h.min(),h.max(),np.var(h))
    
    df = df._append({'diag':diag,
                    'e_min':emat.min(),
                    'e_max':emat.max(),
                    'e_var':np.var(emat),
                    'h_min':h.min(),
                    'h_max':h.max(),
                    'argmax_h':np.argmax(h),
                    'h_var':np.var(h)},ignore_index=True)
df.to_csv('/home/anran/paternity/version4/hypara/result_final.csv')

c的最小特征值 (-4.696284958144072e-07-2.2910963476416514e-18j)
h最大值 236
	 h,min,max,var: 0.1982954154035611 1.0976936129720012 0.06333981806673544
c的最小特征值 (-4.6962849584193677e-07+0j)
h最大值 411
	 h,min,max,var: 0.18519958394124256 1.0976936129720012 0.060884869399292336
c的最小特征值 (-4.696284957902065e-07+0j)
h最大值 532
	 h,min,max,var: 0.18917621932968537 1.0968269174254268 0.06213459899485613


In [28]:
cmat = cMat(fi,fij,N,alpha)   # type: ignore
print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))

	 c的最小特征值 (-7.434551922059885e-07+0j)
	 c的最小特征值 (-0.005211388131469439+0j)


In [30]:
c_path = '/home/anran/paternity/version4/hypara/cmat.txt'
invc_path = '/home/anran/paternity/version4/hypara/invc.txt'
np.savetxt(c_path,cmat)
np.savetxt(invc_path,inv_eig(cmat))

In [40]:
robjects.r('.libPaths("/home/anran/miniconda3/lib/R/library")')
robjects.r.source(f"/home/anran/paternity/version4/invc.R {cpath} {invc_path} {rho1} {rho0}",encoding = "UTF-8")  # type: ignore
invc = pd.read_csv("/home/anran/paternity/version4/hypara/invc_glasso.csv",sep=' ').values
robjects.r('rm(list = ls())')

In [42]:
emat = calcul_e(invc,fi,N) 
h = margin(fi,emat,N) # type: ignore

	 emat min,max,var: -1.99937568446395 0.3420401982175919 0.0038864875195840564
	 h,min,max,var: 0.018603996645236376 4.728874871293823 0.7793322371276544


|对角线|次对角线|其他|invc|e_min|e_max|e_var|h_min|h_max|h_var|
|-|-|-|-|-|-|-|-|-|-|
|0.1|0.01|1|diag|-10.0 |2.13| 0.09|-2.09| 4.38| 2.14|
|0.1|0.5|1|diag|-1.99|0.34| 0.004|0.02| 4.73| 0.78|
|0.5|0|1|NoDiag|-1.99| 0.35| 0.003|-0.01| 4.73| 0.79|

- 验证num计算

In [10]:
vcf = pysam.VariantFile(vcf_file)
df_select00 = select_locus(vcf,N,persons.father[0],persons.mother[0],persons.child[0])
print(persons.father[0],persons.mother[0],persons.child[0])
log_x00,log_y00,c00 = calcul_XY(df_select00)

HG00099 HG00097 child_HG00099_HG00097
X,Y,len_c: -71.9461689636915 -30.404029562062103 2840


In [11]:
c_rand = np.random.randint(2,size=N)
log_pi_ind00,pr_ind_rand00 = calcul_pi_ind(df_select00,log_x00,log_y00,c00,c_rand)
print(log_pi_ind00,pr_ind_rand00)

60.037378576528 -4048.902942626874


In [12]:
fi =  df_select00['f0'].values
fij = calcul_fij(df_data,df_select00,N)

cmat = cMat(fi,fij,N)   # type: ignore
print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
invc = invc_glassoR(cmat)

	 c的最小特征值 (-6.864641899179187e-05+0j)


In [14]:
persons

Unnamed: 0,child,father,mother
0,child_HG00099_HG00097,HG00099,HG00097
1,child_HG00100_HG00108,HG00100,HG00108
2,child_HG00103_HG00105,HG00103,HG00105
3,child_HG00102_HG00101,HG00102,HG00101
4,child_HG00106_HG00096,HG00106,HG00096


- 相同的一段

In [1]:
def get_fi(vcf_file,n):
    '''select N locus from vcf_file'''
    vcf = pysam.VariantFile(vcf_file)
    f0=[]
    for record in vcf:
        f1 = record.info['AF'][0]
        locus = (record.chrom,str(record.pos))
        if f1<1 and f1>0:
            f0.append((locus,1-f1))
    f = random.sample(f0, n)
    freq0 = dict(f)
    fi = pd.Series(freq0)
    return fi


def get_data(vcf,locus_list, father, mother, child):
    df = pd.DataFrame(columns=['locus','moGT','faGT', 'chGT','f0'])
    for record in vcf:   # 先将所有数据读进df
        locus = (record.chrom,str(record.pos))
        f0 = 1-record.info['AF'][0]
        if locus in locus_list:  # 除去f0=1的情况，不然会导致之后 arctanh(1)=inf
            mo = record.samples[mother]['GT']
            ch = record.samples[child]['GT']
            fa = record.samples[father]['GT']
            df.loc[len(df.index)] = [locus,mo,fa,ch,f0] # type: ignore
    df.set_index('locus',drop=True,inplace=True)
    return df

In [11]:
fi_serie = get_fi(vcf_file,N)
locus_list = fi_serie.index.to_list()

In [14]:
i = 0
j = 0
print('三联体：',persons.father[i],persons.mother[j],persons.child[j])
vcf = pysam.VariantFile(vcf_file)
df_select = get_data(vcf,locus_list,persons.father[i],persons.mother[j],persons.child[j])

三联体： HG00099 HG00097 child_HG00099_HG00097


In [22]:
vcf = pysam.VariantFile(vcf_file)
df_select1 = get_data(vcf,locus_list,persons.father[i],persons.mother[1],persons.child[1])

In [23]:
df_select1.head(3)

Unnamed: 0_level_0,moGT,faGT,chGT,f0
locus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(chr6, 19721072)","(0, 0)","(0, 0)","(0, 0)",0.988757
"(chr6, 19721377)","(0, 0)","(0, 0)","(0, 0)",0.999688
"(chr6, 19721812)","(0, 0)","(0, 0)","(0, 0)",0.989226


In [24]:
fi = df_select['f0']
# print('fi:', fi.min(),fi.max())
log_x,log_y,c = calcul_XY(df_select)
log_x1,log_y1,c1 = calcul_XY(df_select1)
c_rand = np.random.randint(2,size=N)
log_pi_ind,pr_ind_rand = calcul_pi_ind(df_select,log_x,log_y,c,c_rand)
log_pi_ind1,pr_ind_rand1 = calcul_pi_ind(df_select1,log_x1,log_y1,c1,c_rand)
print('00 \t pi_ind,pi_ind_rand:',log_pi_ind,pr_ind_rand)
print('01 \t pi_ind,pi_ind_rand:',log_pi_ind1,pr_ind_rand1)

X,Y,len_c: -61.41011911545217 -18.06179973983887 2888
X,Y,len_c: -195.0018791501403 -19.265919722494797 2905
00 	 pi_ind,pi_ind_rand: 35.86325316769033 -4009.861894375218
01 	 pi_ind,pi_ind_rand: -97.02368964985658 -4009.861894375218


In [25]:
fi =  df_select['f0'].values
fij = calcul_fij(df_data,df_select,N)

cmat = cMat(fi,fij,N)   # type: ignore
print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
invc = invc_glassoR(cmat)

	 c的最小特征值 (-2.7163672093222447e-08+0j)


In [26]:
emat = calcul_e(invc,fi,N) 
h = margin(fi,emat,N) # type: ignore
log_z = calcul_Z(emat,h,K,N) 

	 emat: -10.543918372004335 2.780606698023711
	 h: -1.216363316223822 7.440060415829609


In [27]:
log_num = calcul_num(c,emat,h,N,K)
log_num1 = calcul_num(c1,emat,h,N,K)
log_num_rand = calcul_num(c_rand,emat,h,N,K)

In [None]:
pi_nind = log_x-log_y-log_num + log_z
pi_nind1 = log_x1-log_y1-log_num1 + log_z


In [30]:
print('00 \t ',pi_nind)
print('01 \t ',pi_nind1)

00 	  65.33704368407962
01 	  -66.90891533295326


- 不同的两段

In [29]:
i=0
for j in range(2):
    print('---------------------',i,j)
    vcf = pysam.VariantFile(vcf_file)
    df_select3 = select_locus(vcf,N,persons.father[i],persons.mother[j],persons.child[j])
    log_x3,log_y3,c3 = calcul_XY(df_select3)
    c_rand = np.random.randint(2,size=N)
    log_pi_ind3,pr_ind_rand3 = calcul_pi_ind(df_select3,log_x3,log_y3,c3,c_rand)
    print('pi_ind,pi_ind_rand:',log_pi_ind3,pr_ind_rand3)
    log_pi_nind3,log_num_rand3 = calcul_pi_nind(i,j,df_select3,df_data,N,K,log_x3,log_y3,c3,c_rand)
    print("pi_nind,pi_nind_rand:",log_pi_nind3,log_num_rand3)
    file_name = f'/home/anran/paternity/version4/df_select_{i}{j}.csv'
    df_select.to_csv(file_name)

--------------------- 0 0
X,Y,len_c: -29.50093957507016 -9.030899869919436 2886
pi_ind,pi_ind_rand: 35.86325316769033 -4009.861894375218
	 c的最小特征值 (-3.387350499310996e-05+0j)
	 emat: -11.38122512947847 3.002813286844906
	 h: -1.7529015076342453 8.852378052484466
	 log_num: 4539.651909824559
	 log_z,: 4660.191754712258
pi_nind,pi_nind_rand: 100.06980518254841 31.781126686189136
--------------------- 0 1
X,Y,len_c: -34.061799739838875 0 2908
pi_ind,pi_ind_rand: 35.86325316769033 -4009.861894375218
	 c的最小特征值 (-3.610589786391214e-05+0j)
	 emat: -9.98441567737025 2.5005372967630115
	 h: -1.9910166038420685 6.998126618981804
	 log_num: 4590.988896684733
	 log_z,: 4683.307381806428
pi_nind,pi_nind_rand: 58.25668538185619 9.106272640057018


In [31]:
for i in range(6):
    print('**************************')
    print('---------------------',0,0)
    vcf = pysam.VariantFile(vcf_file)
    df_select00 = select_locus(vcf,N,persons.father[0],persons.mother[0],persons.child[0])
    print(persons.father[0],persons.mother[0],persons.child[0])
    log_x00,log_y00,c00 = calcul_XY(df_select00)
    c_rand = np.random.randint(2,size=N)
    log_pi_ind00,pr_ind_rand00 = calcul_pi_ind(df_select00,log_x00,log_y00,c00,c_rand)
    log_pi_nind00,log_num_rand00 = calcul_pi_nind(0,0,df_select00,df_data,N,K,log_x00,log_y00,c00,c_rand)
    print('pi_ind,pi_ind_rand:',log_pi_ind00,pr_ind_rand00)
    print("pi_nind,pi_nind_rand:",log_pi_nind00,log_num_rand00)
    print('---------------------',0,1)
    vcf = pysam.VariantFile(vcf_file)
    df_select01 = select_locus(vcf,N,persons.father[0],persons.mother[1],persons.child[1])
    print(persons.father[0],persons.mother[1],persons.child[1])
    log_x01,log_y01,c01 = calcul_XY(df_select01)
    log_pi_ind01,pr_ind_rand01 = calcul_pi_ind(df_select01,log_x01,log_y01,c01,c_rand)
    log_pi_nind01,log_num_rand01 = calcul_pi_nind(0,1,df_select01,df_data,N,K,log_x01,log_y01,c01,c_rand)
    print('pi_ind,pi_ind_rand:',log_pi_ind01,pr_ind_rand01)
    print("pi_nind,pi_nind_rand:",log_pi_nind01,log_num_rand01)
    if log_pi_ind00<=log_pi_ind01 or log_pi_nind00<= log_pi_nind01:
        file_name00 = f'/home/anran/paternity/version4/df_select00_{i}.csv'
        file_name01 = f'/home/anran/paternity/version4/df_select01_{i}.csv'
        df_select00.to_csv(file_name00)
        df_select01.to_csv(file_name01)

**************************
--------------------- 0 0
HG00099 HG00097 child_HG00099_HG00097
X,Y,len_c: -32.81226952737395 -3.3113299523037933 2984
	 c的最小特征值 (-2.324417386451752e-05+0j)
	 emat: -9.98442309783204 2.3365599782701767
	 h: -1.4873980977933055 5.1391562315308965
	 log_num: 4662.27885181469
	 log_z,: 4708.596723763737
pi_ind,pi_ind_rand: 31.496916839255455 -4095.878940291688
pi_nind,pi_nind_rand: 16.81693237397667 146.83532874219367
--------------------- 0 1
HG00099 HG00108 child_HG00100_HG00108
X,Y,len_c: -284.07818924986873 -30.705059557726084 2960
	 c的最小特征值 (-9.795178068691069e-05+0j)
	 emat: -10.6461621462701 2.9575155267244915
	 h: -2.6222193499173643 5.878858789479525
	 log_num: 4502.101846953369
	 log_z,: 4586.61213874528
pi_ind,pi_ind_rand: -175.5471631327178 -3921.0889608148627
pi_nind,pi_nind_rand: -168.86283790023117 166.27540914275508
**************************
--------------------- 0 0
HG00099 HG00097 child_HG00099_HG00097
X,Y,len_c: -19.265919722494797 -1.5051499

In [9]:
def select_locus2(vcf,n,father0,mother0,child0,father1,mother1,child1):
    '''从vcf中随机抽出<<连续的父母>>和孩子n行的数据'''
    df = pd.DataFrame(columns=['locus','moGT0','faGT0', 'chGT0','moGT1','faGT1','chGT1','f0'])
    for record in vcf:   # 先将所有数据读进df
        locus = (record.chrom,str(record.pos))
        f0 = 1-record.info['AF'][0]
        if f0<1:  # 除去f0=1的情况，不然会导致之后 arctanh(1)=inf
            mo0 = record.samples[mother0]['GT']
            ch0 = record.samples[child0]['GT']
            mo1 = record.samples[mother1]['GT']
            ch1 = record.samples[child1]['GT']
            if (mo0!=(0,1) and mo0!=(1,0)) or (ch0!=(0,1) and ch0!=(1,0)):
                if (mo1!=(0,1) and mo1!=(1,0)) or (ch1!=(0,1) and ch1!=(1,0)):
                    fa = record.samples[father0]['GT']
                    fa1 = record.samples[father1]['GT']
                    df.loc[len(df.index)] = [locus,mo0,fa,ch0,mo1,fa1,ch1,f0] # type: ignore
    num_row = df.shape[0]
    assert num_row>=n,'not enough locus to select'
    i = random.randint(n, num_row)
    df_select = df.iloc[i-n:i,:]
    df_select.set_index('locus',drop=True,inplace=True)
    return df_select

In [None]:
for i in range(6):
    print('**************************',i)
    vcf = pysam.VariantFile(vcf_file)
    father0,mother0,child0 = persons.father[0],persons.mother[0],persons.child[0]
    mother1,child1 = persons.mother[1],persons.child[1]
    df_select = select_locus2(vcf,N,father0,mother0,child0,mother1,child1)
    df_select00 = df_select[['moGT0','faGT', 'chGT0','f0']]
    df_select00 = df_select00.rename(columns={'moGT0':'moGT', 'chGT0':'chGT'})
    df_select01 = df_select[['moGT1','chGT1','faGT','f0']]
    df_select01 = df_select01.rename(columns={'moGT1':'moGT', 'chGT1':'chGT'})

    log_x00,log_y00,c00 = calcul_XY(df_select00)
    c_rand = np.random.randint(2,size=N)
    log_pi_ind00,pr_ind_rand00 = calcul_pi_ind(df_select00,log_x00,log_y00,c00,c_rand)

    log_x01,log_y01,c01 = calcul_XY(df_select01)
    log_pi_ind01,pr_ind_rand01 = calcul_pi_ind(df_select01,log_x01,log_y01,c01,c_rand)

    # 计算连锁时的fij,emat,h 和z
    fi =  df_select['f0'].values
    fij = calcul_fij(df_data,df_select,N)
    cmat = cMat(fi,fij,N)   # type: ignore
    print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
    invc = invc_glassoR(cmat)
    emat = calcul_e(invc,fi,N) 
    h = margin(fi,emat,N)   # type: ignore
    log_z = calcul_Z(emat,h,K,N)
    print('\t log_z,:',log_z)
    # 计算完毕

    log_num00 = calcul_num(c00,emat,h,N,K)
    log_num_rand = calcul_num(c_rand,emat,h,N,K)
    log_num01 = calcul_num(c01,emat,h,N,K)
    print('\t log_num00,log_num01:',log_num00,log_num01)

    log_pi_nind00 = log_x00 - log_y00 - log_num00 + log_z
    log_pi_nind01 = log_x01 - log_y01 - log_num01 + log_z

    print('pr_ind_rand,pr_nind_rand:',pr_ind_rand01,log_num_rand-log_z)
    print('pi_ind00,pi_ind01:',log_pi_ind00,log_pi_ind01)
    print("pi_nind00,pi_nind10:",log_pi_nind00,log_pi_nind01)
    if log_pi_ind00<=log_pi_ind01 or log_pi_nind00<= log_pi_nind01:
        file_name = f'/home/anran/paternity/version4/df_select{i}.csv'
        df_select.to_csv(file_name)

In [8]:
data_path = "/home/anran/paternity/family-data/real_seg1" 
hap_file,samples_file,vcf_file,ped_file=get_file(data_path)
persons = pd.read_table(ped_file)
df_data = convert_hap_samples_to_dataframe(hap_file,samples_file)
df_data.drop(['ID','REF','ALT'],axis=0,inplace=True)
N=3000
K=10

In [11]:
print('真实数据集上')
for i in range(5):
    print('**************************',i)
    vcf = pysam.VariantFile(vcf_file)
    father0,mother0,child0 = persons.father[0],persons.mother[0],persons.child[0]
    father1,mother1,child1 = persons.father[1],persons.mother[1],persons.child[1]
    df_select = select_locus2(vcf,N,father0,mother0,child0,father1,mother1,child1)
    df_select00 = df_select[['moGT0','faGT0', 'chGT0','f0']]
    df_select00 = df_select00.rename(columns={'moGT0':'moGT', 'chGT0':'chGT','faGT0':'faGT'})
    df_select01 = df_select[['moGT1','chGT1','faGT0','f0']]
    df_select01 = df_select01.rename(columns={'moGT1':'moGT', 'chGT1':'chGT','faGT0':'faGT'})

    log_x00,log_y00,c00 = calcul_XY(df_select00)
    c_rand = np.random.randint(2,size=N)
    log_pi_ind00,pr_ind_rand00 = calcul_pi_ind(df_select00,log_x00,log_y00,c00,c_rand)

    log_x01,log_y01,c01 = calcul_XY(df_select01)
    log_pi_ind01,pr_ind_rand01 = calcul_pi_ind(df_select01,log_x01,log_y01,c01,c_rand)

    # 计算连锁时的fij,emat,h 和z
    fi =  df_select['f0'].values
    fij = calcul_fij(df_data,df_select,N)
    cmat = cMat(fi,fij,N)   # type: ignore
    print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
    invc = invc_glassoR(cmat)
    emat = calcul_e(invc,fi,N) 
    h = margin(fi,emat,N)   # type: ignore
    log_z = calcul_Z(emat,h,K,N)
    print('\t log_z,:',log_z)
    # 计算完毕

    log_num00 = calcul_num(c00,emat,h,N,K)
    log_num_rand = calcul_num(c_rand,emat,h,N,K)
    log_num01 = calcul_num(c01,emat,h,N,K)
    print('\t log_num00,log_num01:',log_num00,log_num01)

    log_pi_nind00 = log_x00 - log_y00 - log_num00 + log_z
    log_pi_nind01 = log_x01 - log_y01 - log_num01 + log_z

    print('pr_ind_rand,pr_nind_rand:',pr_ind_rand01,log_num_rand-log_z)
    print('pi_ind00,pi_ind01:',log_pi_ind00,log_pi_ind01)
    print("pi_nind00,pi_nind10:",log_pi_nind00,log_pi_nind01)
    if log_pi_ind00<=log_pi_ind01 or log_pi_nind00<= log_pi_nind01:
        file_name = f'/home/anran/paternity/version4/df_select_real{i}.csv'
        df_select.to_csv(file_name)

真实数据集上
************************** 0
X,Y,len_c: -34.317419505693856 -10.23501985257536 2935
X,Y,len_c: -221.62874945799766 -13.546349804879155 2953
	 c的最小特征值 (-1.629283769412441e-05+0j)
	 emat: -9.98441194007582 1.8876715179009624
	 h: -1.1379489318963727 4.739073816508676
	 log_z,: 4728.537149510334
	 log_num00,log_num01: 4643.7783378080485 4666.316452179802
pr_ind_rand,pr_nind_rand: -4293.055518593701 -4766.694299472704
pi_ind00,pi_ind01: 29.495286060680222 -162.99486668731242
pi_nind00,pi_nind10: 60.676412049167084 -145.861702322587
************************** 1
X,Y,len_c: -55.08848920650856 -11.740169830895267 2880
X,Y,len_c: -654.4410189853683 -27.09269960975831 2849
	 c的最小特征值 (-5.223866902232964e-05+0j)
	 emat: -10.8037662194585 3.004747135046737
	 h: -1.4946438777763706 5.705218278929137
	 log_z,: 4611.769726344533
	 log_num00,log_num01: 4456.290356421181 4481.92960014067
pr_ind_rand,pr_nind_rand: -4154.0505854436615 -4658.390374966572
pi_ind00,pi_ind01: 76.07537829162214 -526.644

In [12]:
print('真实数据集上')
for i in range(5):
    print('**************************',i)
    vcf = pysam.VariantFile(vcf_file)
    father0,mother0,child0 = persons.father[0],persons.mother[0],persons.child[0]
    father1,mother1,child1 = persons.father[2],persons.mother[2],persons.child[2]
    df_select = select_locus2(vcf,N,father0,mother0,child0,father1,mother1,child1)
    df_select00 = df_select[['moGT0','faGT0', 'chGT0','f0']]
    df_select00 = df_select00.rename(columns={'moGT0':'moGT', 'chGT0':'chGT','faGT0':'faGT'})
    df_select01 = df_select[['moGT1','chGT1','faGT0','f0']]
    df_select01 = df_select01.rename(columns={'moGT1':'moGT', 'chGT1':'chGT','faGT0':'faGT'})

    log_x00,log_y00,c00 = calcul_XY(df_select00)
    c_rand = np.random.randint(2,size=N)
    log_pi_ind00,pr_ind_rand00 = calcul_pi_ind(df_select00,log_x00,log_y00,c00,c_rand)

    log_x01,log_y01,c01 = calcul_XY(df_select01)
    log_pi_ind01,pr_ind_rand01 = calcul_pi_ind(df_select01,log_x01,log_y01,c01,c_rand)

    # 计算连锁时的fij,emat,h 和z
    fi =  df_select['f0'].values
    fij = calcul_fij(df_data,df_select,N)
    cmat = cMat(fi,fij,N)   # type: ignore
    print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
    invc = invc_glassoR(cmat)
    emat = calcul_e(invc,fi,N) 
    h = margin(fi,emat,N)   # type: ignore
    log_z = calcul_Z(emat,h,K,N)
    print('\t log_z,:',log_z)
    # 计算完毕

    log_num00 = calcul_num(c00,emat,h,N,K)
    log_num_rand = calcul_num(c_rand,emat,h,N,K)
    log_num01 = calcul_num(c01,emat,h,N,K)
    print('\t log_num00,log_num01:',log_num00,log_num01)

    log_pi_nind00 = log_x00 - log_y00 - log_num00 + log_z
    log_pi_nind01 = log_x01 - log_y01 - log_num01 + log_z

    print('pr_ind_rand,pr_nind_rand:',pr_ind_rand01,log_num_rand-log_z)
    print('pi_ind00,pi_ind01:',log_pi_ind00,log_pi_ind01)
    print("pi_nind00,pi_nind10:",log_pi_nind00,log_pi_nind01)
    if log_pi_ind00<=log_pi_ind01 or log_pi_nind00<= log_pi_nind01:
        file_name = f'/home/anran/paternity/version4/df_select_real02{i}.csv'
        df_select.to_csv(file_name)

真实数据集上
************************** 0
X,Y,len_c: -61.10908911978819 -34.61844950135784 2902
X,Y,len_c: -259.8225694840137 -9.331929865583417 2917
	 c的最小特征值 (-3.91358000440015e-05+0j)
	 emat: -10.0338891063974 2.380490127786847
	 h: -2.18177923120965 5.884206061352682
	 log_z,: 4689.923522672774
	 log_num00,log_num01: 4553.948327808481 4568.5301355556785
pr_ind_rand,pr_nind_rand: -4238.669597798449 -4717.364901119864
pi_ind00,pi_ind01: 56.262181285267815 -173.1978796104178
pi_nind00,pi_nind10: 109.4845552458628 -129.09725250133488
************************** 1
X,Y,len_c: -61.71114911111615 -23.480339661790534 2879
X,Y,len_c: -1076.2883987859093 -46.057589336589125 2790
	 c的最小特征值 (-8.14812616373147e-05+0j)
	 emat: -10.6981257260695 2.982880301859352
	 h: -2.477821887585051 6.963420562024372
	 log_z,: 4562.183657506501
	 log_num00,log_num01: 4427.543575153238 4339.98316666962
pr_ind_rand,pr_nind_rand: -3990.8004260048374 -4525.923153202911
pi_ind00,pi_ind01: 73.84404708850849 -876.4733292557

In [9]:
def select_data(vcf,father0,mother0,child0,father1,mother1,child1):
    '''从vcf中抽出<<连续的父母>>和孩子所有的数据'''
    df = pd.DataFrame(columns=['locus','moGT0','faGT0', 'chGT0','moGT1','faGT1','chGT1','f0'])
    for record in vcf:   # 先将所有数据读进df
        locus = (record.chrom,str(record.pos))
        f0 = 1-record.info['AF'][0]
        if f0<1:  # 除去f0=1的情况，不然会导致之后 arctanh(1)=inf
            mo0 = record.samples[mother0]['GT']
            ch0 = record.samples[child0]['GT']
            mo1 = record.samples[mother1]['GT']
            ch1 = record.samples[child1]['GT']
            if (mo0!=(0,1) and mo0!=(1,0)) or (ch0!=(0,1) and ch0!=(1,0)):
                if (mo1!=(0,1) and mo1!=(1,0)) or (ch1!=(0,1) and ch1!=(1,0)):
                    fa = record.samples[father0]['GT']
                    fa1 = record.samples[father1]['GT']
                    df.loc[len(df.index)] = [locus,mo0,fa,ch0,mo1,fa1,ch1,f0] # type: ignore
    # num_row = df.shape[0]
    # assert num_row>=n,'not enough locus to select'
    # i = random.randint(n, num_row)
    # df_select = df.iloc[i-n:i,:]
    # df_select.set_index('locus',drop=True,inplace=True)
    # return df_select
    return df

In [10]:
data_path = "/home/anran/paternity/family-data/sim_seg1" 
hap_file,samples_file,vcf_file,ped_file=get_file(data_path)
persons = pd.read_table(ped_file)
df_data = convert_hap_samples_to_dataframe(hap_file,samples_file)
df_data.drop(['ID','REF','ALT'],axis=0,inplace=True)
# vcf = pysam.VariantFile(vcf_file)
# father0,mother0,child0 = persons.father[0],persons.mother[0],persons.child[0]
# father1,mother1,child1 = persons.father[1],persons.mother[1],persons.child[1]
# df_select = select_data(vcf,N,father0,mother0,child0,father1,mother1,child1)
# df_select.to_csv('/home/anran/paternity/version4/data_family01.csv')

- 比较突变是否是区分亲子关系的必要因素

In [16]:
N=3000
K=10

In [17]:
df_res = pd.DataFrame(columns=['N','K','times','F','C','X00','Y00','log_PI_ind00','log_PI_nind00','X01','Y01','log_PI_ind01','log_PI_nind01'])
for i in range(2):
    for j in range(2):
        if i!=j:
            vcf = pysam.VariantFile(vcf_file)
            father0,mother0,child0 = persons.father[i],persons.mother[i],persons.child[i]
            father1,mother1,child1 = persons.father[j],persons.mother[j],persons.child[j]
            df_select = select_data(vcf,N,father0,mother0,child0,father1,mother1,child1)
            for k in range(3):
                l = random.randint(0,df_select.shape[0]-N)
                df_select = df_select[l:l+N,:]
                df_select00 = df_select[['moGT0','faGT0', 'chGT0','f0']]
                df_select00 = df_select00.rename(columns={'moGT0':'moGT', 'chGT0':'chGT','faGT0':'faGT'})
                df_select01 = df_select[['moGT1','chGT1','faGT0','f0']]
                df_select01 = df_select01.rename(columns={'moGT1':'moGT', 'chGT1':'chGT','faGT0':'faGT'})

                log_x00,log_y00,c00 = calcul_XY(df_select00)
                c_rand = np.random.randint(2,size=N)
                log_pi_ind00,pr_ind_rand00 = calcul_pi_ind(df_select00,log_x00,log_y00,c00,c_rand)

                log_x01,log_y01,c01 = calcul_XY(df_select01)
                log_pi_ind01,pr_ind_rand01 = calcul_pi_ind(df_select01,log_x01,log_y01,c01,c_rand)

                # 计算连锁时的fij,emat,h 和z
                fi =  df_select['f0'].values
                fij = calcul_fij(df_data,df_select,N)
                cmat = cMat(fi,fij,N)   # type: ignore
                print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
                invc = invc_glassoR(cmat)
                emat = calcul_e(invc,fi,N) 
                h = margin(fi,emat,N)   # type: ignore
                log_z = calcul_Z(emat,h,K,N)
                print('\t log_z,:',log_z)
                # 计算完毕

                log_num00 = calcul_num(c00,emat,h,N,K)
                log_num_rand = calcul_num(c_rand,emat,h,N,K)
                log_num01 = calcul_num(c01,emat,h,N,K)
                print('\t log_num00,log_num01:',log_num00,log_num01)

                log_pi_nind00 = log_x00 - log_y00 - log_num00 + log_z
                log_pi_nind01 = log_x01 - log_y01 - log_num01 + log_z
                print('pr_ind_rand,pr_nind_rand:',pr_ind_rand01,log_num_rand-log_z)
                print('pi_ind00,pi_ind01:',log_pi_ind00,log_pi_ind01)
                print("pi_nind00,pi_nind10:",log_pi_nind00,log_pi_nind01)
                file_name = f'/home/anran/paternity/version4/result_debug/F{i}C{j}_t{k}.csv'
                df_select.to_csv(file_name)
                df_res = df_res._append({'F':i,
                                         'times':k,
                                         'N':N,
                                         'K':K,
                                         'C':j,
                                         'X00':log_x00,
                                         'Y00': log_y00,
                                         'log_PI_ind00': log_pi_ind00,
                                         'log_PI_nind00': log_pi_nind00,
                                         'X01':log_x01,
                                         'Y01': log_y01,
                                         'log_PI_ind01': log_pi_ind01,
                                         'log_PI_nind01': log_pi_nind01}, ignore_index=True) # type: ignore
        else:
            for k in range(3):
                vcf = pysam.VariantFile(vcf_file)
                df_select = select_locus(vcf,N,persons.father[i],persons.mother[j],persons.child[j])
                fi = df_select['f0']
                log_x,log_y,c = calcul_XY(df_select)
                c_rand = np.random.randint(2,size=N)
                log_pi_ind,pr_ind_rand = calcul_pi_ind(df_select,log_x,log_y,c,c_rand)
                print('pi_ind,pi_ind_rand:',log_pi_ind,pr_ind_rand)
                log_pi_nind,log_num_rand = calcul_pi_nind(i,j,df_select,df_data,N,K,log_x,log_y,c,c_rand)
                print("pi_nind,pi_nind_rand:",log_pi_nind,log_num_rand)
                file_name = f'/home/anran/paternity/version4/result_debug/F{i}C{i}_t{k}.csv'
                df_select.to_csv(file_name)
                df_res = df_res._append({'F':i,
                                        'times':k,
                                        'N':N,
                                        'K':K,
                                        'C':j,
                                        'X':log_x,
                                        'Y': log_y,
                                        'log_PI_ind00': log_pi_ind,
                                        'log_PI_nind00': log_pi_nind,
                                        'Pr_ind_rand00': pr_ind_rand,
                                        'log_num_rand00': log_num_rand}, ignore_index=True) # type: ignore


pi_ind,pi_ind_rand: 52.86119171401394 -4189.834776798675
	 c的最小特征值 (-7.414574546187543e-05+0j)
[9.83168958e+00 3.01360149e-05 9.36680309e-03 9.36680309e-03
 1.99642883e-05 1.48132986e-05 2.11988321e-03 2.64459114e-02
 9.36680309e-03 4.01792547e-05 0.00000000e+00 0.00000000e+00]
	 emat: -11.9728217048332 2.3131459676042274
	 h: -2.239476387893023 10.064801018164356
	 log_num: 4513.8996231958845
	 log_z,: 4645.09508372868
pi_nind,pi_nind_rand: 94.7708310574535 -62.90947742938967
pi_ind,pi_ind_rand: 42.603745592041946 -4150.565937710382
	 c的最小特征值 (-3.5238374379151856e-05+0j)
[9.93796884e+00 2.10723042e-05 1.14614366e-03 3.50477505e-05
 2.74457560e-03 2.82222103e-03 2.10723042e-05 8.36174619e-05
 2.63889389e-03 1.24675105e-04 0.00000000e+00 0.00000000e+00]
	 emat: -10.7126012527239 2.5983387507700595
	 h: -1.9553125906544433 5.712211575822206
	 log_num: 4557.075549293411
	 log_z,: 4650.773102878496
pi_nind,pi_nind_rand: 81.05429376719712 2.5282215644646016
pi_ind,pi_ind_rand: 44.9446673930

TypeError: select_data() takes 7 positional arguments but 8 were given

- 比较孩子０和父亲１的系数

In [8]:
data_path = "/home/anran/paternity/family-data/sim_seg1" 
hap_file,samples_file,vcf_file,ped_file=get_file(data_path)
persons = pd.read_table(ped_file)
df_data = convert_hap_samples_to_dataframe(hap_file,samples_file)
df_data.drop(['ID','REF','ALT'],axis=0,inplace=True)

In [10]:
N=3000
K=10
for i in range(3):
    print('**************************',i)
    vcf = pysam.VariantFile(vcf_file)
    father0,mother0,child0 = persons.father[0],persons.mother[0],persons.child[0]
    father1,mother1,child1 = persons.father[1],persons.mother[1],persons.child[1]
    df_select = select_locus2(vcf,N,father0,mother0,child0,father1,mother1,child1)
    df_select00 = df_select[['moGT0','faGT1', 'chGT0','f0']]
    df_select00 = df_select00.rename(columns={'moGT0':'moGT', 'chGT0':'chGT','faGT1':'faGT'})
    df_select01 = df_select[['moGT1','chGT1','faGT1','f0']]
    df_select01 = df_select01.rename(columns={'moGT1':'moGT', 'chGT1':'chGT','faGT1':'faGT'})

    log_x00,log_y00,c00 = calcul_XY(df_select00)
    c_rand = np.random.randint(2,size=N)
    log_pi_ind00,pr_ind_rand00 = calcul_pi_ind(df_select00,log_x00,log_y00,c00,c_rand)

    log_x01,log_y01,c01 = calcul_XY(df_select01)
    log_pi_ind01,pr_ind_rand01 = calcul_pi_ind(df_select01,log_x01,log_y01,c01,c_rand)

    # 计算连锁时的fij,emat,h 和z
    fi =  df_select['f0'].values
    fij = calcul_fij(df_data,df_select,N)
    cmat = cMat(fi,fij,N)   # type: ignore
    print('\t c的最小特征值',np.min(np.linalg.eigvals(cmat)))
    invc = invc_glassoR(cmat)
    emat = calcul_e(invc,fi,N) 
    h = margin(fi,emat,N)   # type: ignore
    log_z = calcul_Z(emat,h,K,N)
    print('\t log_z,:',log_z)
    # 计算完毕

    log_num00 = calcul_num(c00,emat,h,N,K)
    log_num_rand = calcul_num(c_rand,emat,h,N,K)
    log_num01 = calcul_num(c01,emat,h,N,K)
    print('\t log_num00,log_num01:',log_num00,log_num01)

    log_pi_nind00 = log_x00 - log_y00 - log_num00 + log_z
    log_pi_nind01 = log_x01 - log_y01 - log_num01 + log_z

    print('pr_ind_rand,pr_nind_rand:',pr_ind_rand01,log_num_rand-log_z)
    print('pi_ind00,pi_ind01:',log_pi_ind00,log_pi_ind01)
    print("pi_nind00,pi_nind10:",log_pi_nind00,log_pi_nind01)
    if log_pi_ind00<=log_pi_ind01 or log_pi_nind00<= log_pi_nind01:
        file_name = f'/home/anran/paternity/version4/df_select01_{i}.csv'
        df_select.to_csv(file_name)

************************** 0
X,Y,len_c: -94.5318394449896 -21.975189683470628 2993
X,Y,len_c: -18.663859731166834 -2.1072099696478683 2990
	 c的最小特征值 (-3.7448591322134905e-05+0j)
	 emat: -10.7527872441522 2.276845744034853
	 h: -2.1938331665687665 4.731170259269297
	 log_z,: 4691.914909539268
	 log_num00,log_num01: 4626.209505687194 4627.097878579139
pr_ind_rand,pr_nind_rand: -4156.7682854868035 -4625.139444266227
pi_ind00,pi_ind01: -28.366431686772003 21.556952511924624
pi_nind00,pi_nind10: -6.851245909445424 48.26038119860914
************************** 1
X,Y,len_c: -202.1357785864579 -39.73595942764552 2923
X,Y,len_c: -74.35440892900336 -15.954589770191005 2962
	 c的最小特征值 (-8.081724947358958e-05+0j)
	 emat: -10.6461621785028 2.9594222604659315
	 h: -2.6222193499173643 5.110516713899704
	 log_z,: 4617.837612643539
	 log_num00,log_num01: 4531.249632461859 4539.551841233411
pr_ind_rand,pr_nind_rand: -4136.372143812877 -4620.471260137297
pi_ind00,pi_ind01: -116.01859164815272 12.2350688020

# 重连锁

In [23]:
import random
import argparse
import pysam

def parse_args():
    parser = argparse.ArgumentParser(description="Generate trios from a multi-sample VCF file")
    parser.add_argument("vcf", help="Input VCF file")   # 输入VCF文件路径
    parser.add_argument("output_vcf", help="Output VCF file with trios")   # 输出VCF文件路径
    parser.add_argument("output_ped", help="Output PED file with family relationships") # 输出PED文件路径
    parser.add_argument("-n", "--num_trios", type=int, default=1, help="Number of trios to simulate (default: 1)")  # 模拟的三代家庭数量
    parser.add_argument("-r", "--num_recombinations", type=int, default=1, help="Number of recombinations per trio (default: 1)")   # 每个家庭模拟的重组事件数量

    return parser.parse_args()

def generate_recombination_intervals(num_recombinations, num_variants):
    '''生成给定数量的重组间隔，即随机选择一定数量的变异位点作为重组点，形成相邻两个变异位点之间的重组区间。'''
    recombination_points = sorted(random.sample(range(1, num_variants), 2 * num_recombinations))
    recombination_intervals = [(recombination_points[i], recombination_points[i + 1]) for i in range(0, len(recombination_points), 2)]
    return recombination_intervals

def is_variant_in_recombination_intervals(variant_pos, recombination_intervals):
    '''检查一个变异位点是否在给定的重组区间内'''
    for start, end in recombination_intervals:
        if start <= variant_pos <= end:
            return True
    return False

def read_vcf(filename):
    '''读取输入VCF文件,获取样本列表(samples)、原始变异位点列表(variants)和输入VCF头信息 (header)'''
    vcf = pysam.VariantFile(filename)  
    samples = list(vcf.header.samples)  # 样本列表
    variants = list(vcf.fetch())        # 变异位点列表

    return samples, variants, vcf.header

def write_vcf(out_vcf, new_header, new_variants):
    '''将生成的新变异位点写入输出的VCF文件。'''
    out_vcf =  pysam.VariantFile(out_vcf, "w", header=new_header)
    for variant in new_variants:
        out_vcf.write(variant)
    out_vcf.close()

def write_ped(filename, trios):
    '''将生成的三代家庭关系写入输出的PED文件'''
    with open(filename, 'w') as f:
        f.write('child\tfather\tmother\n')
        for k, v in trios.items():
            f.write(f"{k}\t{v[0]}\t{v[1]}\n")
        

def simulate_trio(num_recombinations, out_samples, variants, out_header, trios):
    '''模拟三联家庭的遗传关系'''
    recombination_intervals = generate_recombination_intervals(num_recombinations, len(variants))
    new_variants = []   # 用于存储模拟生成的新变异位点
    print(recombination_intervals)
    # parent1, parent2 = random.sample(samples, 2)

    # child_name = f"child_{parent1}_{parent2}"
    # samples.append(child_name)
    # out_header = in_header.deepcopy()
    new_header = pysam.VariantHeader()  # 存储新的VCF文件的头信息
    for record in out_header.records:   # 循环复制原始VCF文件头信息，并将其添加到new_header中
        new_header.add_line(str(record))
    print(list(new_header.samples))
    # original_samples = list(out_header.samples)
    # for sample in original_samples:
    #     # if sample not in out_samples:
    #     out_header.samples.remove(sample)
    for sample in out_samples:
        new_header.add_sample(sample)
    print(list(new_header.samples))

    for variant in variants:    # 对每个原始变异位点进行遍历，并为每个样本生成新的基因型信息
        new_variant = new_header.new_record()
        new_variant.chrom = variant.chrom
        new_variant.pos = variant.pos
        new_variant.ref = variant.ref
        new_variant.alts = variant.alts
        new_variant.id = variant.id
        new_variant.qual = variant.qual

        for key, value in variant.info.items():
            new_variant.info[key] = value

        for sample in out_samples:
            if "child" in sample:   # 样本是子代,
                gt1 = variant.samples[trios[sample][0]]['GT']   # 从父母中获取基因型信息
                gt2 = variant.samples[trios[sample][1]]['GT']

                if is_variant_in_recombination_intervals(variant.pos, recombination_intervals):
                    print("in")
                    gt1 = gt1[::-1] # 在重组区间内发生重组时对基因型进行翻转
                    gt2 = gt2[::-1]

                child_gt = (gt1[0], gt2[0])
                new_variant.samples[sample]['GT'] = child_gt
                new_variant.samples[sample].phased = True
            else:   # 如果样本不是子代，则直接复制原始变异位点的基因型信息
                new_variant.samples[sample]['GT'] = variant.samples[sample]['GT']
                new_variant.samples[sample].phased = True

        new_variants.append(new_variant)

    return new_variants, new_header

def get_unique_pairs(samples, previous_pairs):
    while True:
        pair = tuple(sorted(random.sample(samples, 2)))
        if pair not in previous_pairs:
            previous_pairs.add(pair)
            return pair

In [24]:
vcf = '/home/anran/paternity/family-data/sim_seg2/sim.5ped.bisnp.chr6-49720000-53130000.dedup.vcf'

In [26]:
samples, variants, in_header = read_vcf(vcf)

In [27]:
trios = {}  # 用于存储三代家庭的关系信息
out_samples = []    # 用于存储输出的样本列表
previous_pairs = set()

In [30]:
parent1, parent2 = random.sample(samples, 2)
print(samples)
print(parent1,parent2)


['HG00100', 'HG00107', 'child_HG00100_HG00107', 'HG00099', 'HG00097', 'child_HG00099_HG00097', 'HG00108', 'HG00106', 'child_HG00108_HG00106', 'HG00101', 'HG00102', 'child_HG00101_HG00102', 'HG00103', 'HG00096', 'child_HG00103_HG00096']
HG00107 HG00100


In [31]:
out_samples.append(parent1)
out_samples.append(parent2)

In [32]:
for k in range(2):
    out_samples.append(f"child{k}_{parent1}_{parent2}")
    trios[f"child{k}_{parent1}_{parent2}"] = (parent1, parent2, f"child{k}_{parent1}_{parent2}")

In [33]:
print(out_samples)
print(trios)

['HG00107', 'HG00100', 'child0_HG00107_HG00100', 'child1_HG00107_HG00100']
{'child0_HG00107_HG00100': ('HG00107', 'HG00100', 'child0_HG00107_HG00100'), 'child1_HG00107_HG00100': ('HG00107', 'HG00100', 'child1_HG00107_HG00100')}


In [35]:
num_recombinations = 1000
print('len(variant)',len(variants))
recombination_points = sorted(random.sample(range(1, len(variants)), 2 * num_recombinations))
print('--------------------')
print(len(recombination_points))
print(recombination_points[:5])
recombination_intervals = [(recombination_points[i], recombination_points[i + 1]) for i in range(0, len(recombination_points), 2)]
print('--------------------')
print(len(recombination_intervals))
print(recombination_intervals[:5])

len(variant) 72661
--------------------
2000
[30, 39, 82, 89, 92]
--------------------
1000
[(30, 39), (82, 89), (92, 154), (206, 211), (258, 295)]


In [36]:
new_variants = []
out_header = in_header
new_header = pysam.VariantHeader()
for record in out_header.records:   # 循环复制原始VCF文件头信息，并将其添加到new_header中
    new_header.add_line(str(record))
print(list(new_header.samples))

[]


In [37]:
for sample in out_samples:
    new_header.add_sample(sample)
print(list(new_header.samples))

['HG00107', 'HG00100', 'child0_HG00107_HG00100', 'child1_HG00107_HG00100']


In [41]:
variant = variants[0]
print(variant)
new_variant = new_header.new_record()
new_variant.chrom = variant.chrom
new_variant.pos = variant.pos
new_variant.ref = variant.ref
new_variant.alts = variant.alts
new_variant.id = variant.id
new_variant.qual = variant.qual
for key, value in variant.info.items():
    new_variant.info[key] = value

chr6	49720113	6:49720113:C:T	C	T	.	.	AC=0;AC_AFR=3;AC_AMR=0;AC_EAS=0;AC_EUR=0;AC_Het=3;AC_Het_AFR=3;AC_Het_AMR=0;AC_Het_EAS=0;AC_Het_EUR=0;AC_Het_SAS=0;AC_Hom=0;AC_Hom_AFR=0;AC_Hom_AMR=0;AC_Hom_EAS=0;AC_Hom_EUR=0;AC_Hom_SAS=0;AC_SAS=0;AF=0.000468457;AF_AFR=0.00167973;AF_AMR=0;AF_EAS=0;AF_EUR=0;AF_SAS=0;AN=22;AN_AFR=1786;AN_AMR=980;AN_EAS=1170;AN_EUR=1266;AN_SAS=1202;BaseQRankSum=-0.022;ClippingRankSum=-0.998;DP=94334;FS=5.717;HWE=1;HWE_AFR=1;HWE_AMR=1;HWE_EAS=1;HWE_EUR=1;HWE_SAS=1;InbreedingCoeff=-0.0005;MLEAC=3;MLEAF=0.0004685;MQ=60;MQ0=0;MQRankSum=0.952;POSITIVE_TRAIN_SITE;QD=12.7;ReadPosRankSum=0.952;SOR=1.13;VQSLOD=17.86;culprit=MQ;AN_EUR_unrel=1006;AN_EAS_unrel=1008;AN_AMR_unrel=694;AN_SAS_unrel=978;AN_AFR_unrel=1322;AF_EUR_unrel=0;AF_EAS_unrel=0;AF_AMR_unrel=0;AF_SAS_unrel=0;AF_AFR_unrel=0.00151286;AC_EUR_unrel=0;AC_EAS_unrel=0;AC_AMR_unrel=0;AC_SAS_unrel=0;AC_AFR_unrel=2;AC_Het_EUR_unrel=0;AC_Het_EAS_unrel=0;AC_Het_AMR_unrel=0;AC_Het_SAS_unrel=0;AC_Het_AFR_unrel=2;AC_Hom_EUR_unr

In [42]:
print(new_variant)

chr6	49720113	6:49720113:C:T	C	T	.	.	AC=0;AC_AFR=3;AC_AMR=0;AC_EAS=0;AC_EUR=0;AC_Het=3;AC_Het_AFR=3;AC_Het_AMR=0;AC_Het_EAS=0;AC_Het_EUR=0;AC_Het_SAS=0;AC_Hom=0;AC_Hom_AFR=0;AC_Hom_AMR=0;AC_Hom_EAS=0;AC_Hom_EUR=0;AC_Hom_SAS=0;AC_SAS=0;AF=0.000468457;AF_AFR=0.00167973;AF_AMR=0;AF_EAS=0;AF_EUR=0;AF_SAS=0;AN=22;AN_AFR=1786;AN_AMR=980;AN_EAS=1170;AN_EUR=1266;AN_SAS=1202;BaseQRankSum=-0.022;ClippingRankSum=-0.998;DP=94334;FS=5.717;HWE=1;HWE_AFR=1;HWE_AMR=1;HWE_EAS=1;HWE_EUR=1;HWE_SAS=1;InbreedingCoeff=-0.0005;MLEAC=3;MLEAF=0.0004685;MQ=60;MQ0=0;MQRankSum=0.952;POSITIVE_TRAIN_SITE;QD=12.7;ReadPosRankSum=0.952;SOR=1.13;VQSLOD=17.86;culprit=MQ;AN_EUR_unrel=1006;AN_EAS_unrel=1008;AN_AMR_unrel=694;AN_SAS_unrel=978;AN_AFR_unrel=1322;AF_EUR_unrel=0;AF_EAS_unrel=0;AF_AMR_unrel=0;AF_SAS_unrel=0;AF_AFR_unrel=0.00151286;AC_EUR_unrel=0;AC_EAS_unrel=0;AC_AMR_unrel=0;AC_SAS_unrel=0;AC_AFR_unrel=2;AC_Het_EUR_unrel=0;AC_Het_EAS_unrel=0;AC_Het_AMR_unrel=0;AC_Het_SAS_unrel=0;AC_Het_AFR_unrel=2;AC_Hom_EUR_unr

In [46]:
trios

{'child0_HG00107_HG00100': ('HG00107', 'HG00100', 'child0_HG00107_HG00100'),
 'child1_HG00107_HG00100': ('HG00107', 'HG00100', 'child1_HG00107_HG00100')}

In [48]:
sample = 'child1_HG00107_HG00100'
gt1 = variant.samples[trios[sample][0]]['GT']   # 从父母中获取基因型信息
gt2 = variant.samples[trios[sample][1]]['GT']
print(gt1,gt2)

(0, 0) (0, 0)


In [50]:
gt = (0,1)
gt[::-1]

(1, 0)

In [None]:
for sample in out_samples:
    if 'child' not in sample:
        new_variant.samples[sample]['GT'] = variant.samples[sample]['GT']
        new_variant.samples[sample].phased = True
    else:
        gt1 = variant.samples[trios[sample][0]]['GT']   # 从父母中获取基因型信息
        gt2 = variant.samples[trios[sample][1]]['GT']

        if is_variant_in_recombination_intervals(variant.pos, recombination_intervals):
            print("in")
            gt1 = gt1[::-1] # 在重组区间内发生重组时对基因型进行翻转
            gt2 = gt2[::-1]

        child_gt = (gt1[0], gt2[0])
        new_variant.samples[sample]['GT'] = child_gt
        new_variant.samples[sample].phased = True

In [52]:
df_data.head(5)

Unnamed: 0,"(chr6, 19720059)","(chr6, 19720065)","(chr6, 19720068)","(chr6, 19720102)","(chr6, 19720240)","(chr6, 19720298)","(chr6, 19720334)","(chr6, 19720497)","(chr6, 19720505)","(chr6, 19720772)",...,"(chr6, 23129332)","(chr6, 23129346)","(chr6, 23129372)","(chr6, 23129482)","(chr6, 23129493)","(chr6, 23129563)","(chr6, 23129619)","(chr6, 23129705)","(chr6, 23129760)","(chr6, 23129898)"
HG00096_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG00096_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG00097_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG00097_1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
HG00099_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
init_couple = 10
persons = sorted(random.sample(range(0, df_data.shape[0]//2), 2 * init_couple))
persons = np.array(persons) 

In [84]:
person = np.sort(np.concatenate((persons*2,1+2*persons)))
person[:8],person.shape

(array([ 336,  337, 1046, 1047, 1140, 1141, 1296, 1297]), (40,))

In [85]:
index = np.array(df_data.index.tolist())[person]
index[:5]

array(['HG00365_0', 'HG00365_1', 'HG01276_0', 'HG01276_1', 'HG01375_0'],
      dtype='<U9')

In [None]:
def generate_linkage_intervals(num_linkage,num_locus):
    '''生成给定数量的重组间隔，即随机选择一定数量的变异位点作为重组点，形成相邻两个变异位点之间的重组区间。'''
    linkage_init = random.sample(num_locus-num_linkage,1)
    linkage_intervals = np.arange(num_linkage)+linkage_init
    return linkage_intervals

In [86]:
res = pd.DataFrame(columns=df_data.columns)
res.head()

Unnamed: 0,"(chr6, 19720059)","(chr6, 19720065)","(chr6, 19720068)","(chr6, 19720102)","(chr6, 19720240)","(chr6, 19720298)","(chr6, 19720334)","(chr6, 19720497)","(chr6, 19720505)","(chr6, 19720772)",...,"(chr6, 23129332)","(chr6, 23129346)","(chr6, 23129372)","(chr6, 23129482)","(chr6, 23129493)","(chr6, 23129563)","(chr6, 23129619)","(chr6, 23129705)","(chr6, 23129760)","(chr6, 23129898)"


In [88]:
num_couple = 5
num_generaton = 3

In [102]:
person1 = sorted(random.sample(range(0, df_data.shape[0]//2), 2 * num_couple))
person1 = np.array(person1) * 2
# index = np.sort(np.concatenate((person1,person1+1)))
persons1 = np.array(df_data.index.tolist())[index]
persons1

array(['HG00157_0', 'HG00157_1', 'HG01060_0', 'HG01060_1', 'HG01343_0',
       'HG01343_1', 'HG01674_0', 'HG01674_1', 'HG02420_0', 'HG02420_1',
       'HG03250_0', 'HG03250_1', 'HG03578_0', 'HG03578_1', 'NA18630_0',
       'NA18630_1', 'NA18878_0', 'NA18878_1', 'NA20811_0', 'NA20811_1'],
      dtype='<U9')

In [103]:
res0 = df_data.loc[persons1]
res0.head()

Unnamed: 0,"(chr6, 19720059)","(chr6, 19720065)","(chr6, 19720068)","(chr6, 19720102)","(chr6, 19720240)","(chr6, 19720298)","(chr6, 19720334)","(chr6, 19720497)","(chr6, 19720505)","(chr6, 19720772)",...,"(chr6, 23129332)","(chr6, 23129346)","(chr6, 23129372)","(chr6, 23129482)","(chr6, 23129493)","(chr6, 23129563)","(chr6, 23129619)","(chr6, 23129705)","(chr6, 23129760)","(chr6, 23129898)"
HG00157_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG00157_1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
HG01060_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG01060_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
HG01343_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
res0_index = res0.index.tolist()

In [113]:
res01 = res0
for i in range(0,res0.shape[0]-1,2):
    for j in range(res0.shape[1]):
        prob = random.random()
        if prob <= 0.01 :
            res0.iloc[i,j] = res0.iloc[i+1,j]
    res01 = res01.drop(res0_index[i+1],axis=0)
res01.head()

Unnamed: 0,"(chr6, 19720059)","(chr6, 19720065)","(chr6, 19720068)","(chr6, 19720102)","(chr6, 19720240)","(chr6, 19720298)","(chr6, 19720334)","(chr6, 19720497)","(chr6, 19720505)","(chr6, 19720772)",...,"(chr6, 23129332)","(chr6, 23129346)","(chr6, 23129372)","(chr6, 23129482)","(chr6, 23129493)","(chr6, 23129563)","(chr6, 23129619)","(chr6, 23129705)","(chr6, 23129760)","(chr6, 23129898)"
HG00157_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG01060_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG01343_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG01674_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HG02420_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
res = pd.DataFrame(columns=df_data.columns)
family = {}

for g in range(num_generaton):
    res0 = 