## Replace missing genotypes with column means and write genotypes in binary format


In [1]:
using DelimitedFiles, DataFrames, Statistics

In [2]:
"""
    replaceMissing!(M)

Replace, in place, every `missing` entry of the matrix `M` with the mean of the
non-missing entries found in the same row, and return `M`.
"""
function replaceMissing!(M)
    for i in axes(M,1)
        # A view lets the row be read and overwritten without copying it.
        row     = view(M,i,:)
        rowMean = mean(skipmissing(row))
        for k in eachindex(row)
            if ismissing(row[k])
                row[k] = rowMean
            end
        end
    end
    return M
end

replaceMissing! (generic function with 1 method)

In [3]:
"""
    writeMatBin(inFileName, outFileName; header=true, dlm=' ', missingStr="NA")

Convert a delimited text genotype file into a binary file plus a text file of row IDs.

Each data line of `inFileName` is split on `dlm`. The first field is written as the
row ID; the remaining fields are parsed as `Float64`, and a field equal to
`missingStr` is stored as `missing`. Missing values are then filled in by
`replaceMissing!`, which works on the rows of the internal matrix, i.e. on the
value columns of the input file.

Two files are written:
- `outFileName * ".ids.txt"`: one row ID per line, in input order.
- `outFileName * ".gen.bin"`: the number of data rows and the number of value
  columns (two integers), followed by the `Float64` values, one input column
  after another.

# Keywords
- `header`: when `true`, the first line of the input is skipped.
- `dlm`: delimiter passed to `split`.
- `missingStr`: token that marks a missing value.

The running data-row number is printed every 100 rows as a progress indicator.
"""
function writeMatBin(inFileName,outFileName;header=true,dlm=' ',missingStr="NA")
    nLines = countlines(inFileName)
    # The field count of the first line fixes the number of columns for the whole file.
    p      = length(split(readline(inFileName),dlm))
    # Number of data rows: the header line, if any, is not data.
    nRows  = header==true ? nLines-1 : nLines

    # Stored transposed (value columns x data rows) so that every input column is
    # one row of M, which is the layout replaceMissing! operates on.
    M = Array{Union{Float64,Missing}}(undef,p-1,nRows)

    # The do-block closes the ID file even if a field fails to parse.
    open(outFileName*".ids.txt","w") do outID
        j = header==true ? 0 : 1   # j == 0 marks the header line
        for line in eachline(inFileName)
            if j > 0 # skip header
                if j%100 == 0
                    println(j)
                end
                res = split(line,dlm)
                println(outID,res[1])
                for i in 2:p
                    x = res[i]
                    M[i-1,j] = x!=missingStr ? parse(Float64,x) : missing
                end
            end
            j += 1
        end
    end

    replaceMissing!(M)
    M = convert(Array{Float64}, M)
    # The recorded row count must equal the number of columns of M; the count
    # therefore depends on whether a header line was skipped.
    write(outFileName*".gen.bin",nRows,p-1,M')
    return nothing
end

writeMatBin (generic function with 1 method)

In [None]:
# Convert the text genotype file with the default keywords (header=true, dlm=' ',
# missingStr="NA"). The outputs are the second argument with ".ids.txt" and
# ".gen.bin" appended.
writeMatBin("ALGP2_Cycle_1_7_Geno_Complete_JWAS_MHC.txt","ALGP2_Cycle_1_7_Geno_Complete_Impute_JWAS_MHC.bin")

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100


In [3]:
"""
    readMatBin(fileName)

Read a binary matrix file and return it as an `n × p` `Matrix{Float64}`.

The file is expected to start with two `Int64` values, `n` and `p`, followed by
`n * p` `Float64` values that fill the matrix column by column.
"""
function readMatBin(fileName)
    # The do-block closes the file even when a read fails (e.g. truncated file).
    open(fileName) do genStr
        n = read(genStr,Int64)
        p = read(genStr,Int64)
        # read! fills the array in column-major order, the same order in which
        # an explicit loop over columns, then rows, would read the values.
        return read!(genStr, Matrix{Float64}(undef,n,p))
    end
end

readMatBin (generic function with 1 method)

In [10]:
# Read the ".ids.txt" file as strings and flatten the result into a vector.
# NOTE(review): presumably this file was produced by writeMatBin; confirm its origin.
rowID = vec(readdlm("ALGP2_Cycle_1_7_Geno_Complete_Impute_JWAS_SNP.bin.ids.txt",String))

153-element Array{String,1}:
 "0"
 "0"
 "1"
 "1"
 "0"
 "1"
 "0"
 "1"
 "1"
 "0"
 "2"
 "0"
 "0"
 ⋮  
 "2"
 "1"
 "1"
 "1"
 "1"
 "0"
 "1"
 "1"
 "1"
 "0"
 "1"
 "1"

In [11]:
# Load the binary matrix; its dimensions are taken from the two integers stored
# at the start of the file.
M = readMatBin("ALGP2_Cycle_1_7_GenoA_Complete_Impute_JWAS.bin.gen.bin")

153×435171 Array{Float64,2}:
 2.0  0.0  2.0  2.0  0.0  2.0  2.0  0.0  …  0.0  2.0  1.0  2.0  2.0  2.0  2.0
 2.0  0.0  2.0  2.0  0.0  2.0  2.0  0.0     0.0  2.0  0.0  2.0  2.0  2.0  2.0
 2.0  0.0  1.0  2.0  0.0  1.0  2.0  0.0     0.0  2.0  1.0  2.0  2.0  2.0  2.0
 2.0  0.0  1.0  2.0  0.0  1.0  2.0  0.0     0.0  2.0  0.0  2.0  2.0  2.0  2.0
 2.0  0.0  1.0  2.0  0.0  1.0  2.0  0.0     0.0  2.0  0.0  2.0  2.0  2.0  2.0
 2.0  0.0  1.0  2.0  0.0  1.0  2.0  0.0  …  0.0  2.0  0.0  2.0  2.0  2.0  2.0
 2.0  0.0  1.0  2.0  0.0  1.0  2.0  0.0     0.0  2.0  0.0  2.0  2.0  2.0  2.0
 1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0     0.0  2.0  0.0  2.0  2.0  2.0  2.0
 1.0  1.0  0.0  1.0  1.0  0.0  1.0  1.0     0.0  2.0  0.0  2.0  2.0  2.0  2.0
 2.0  0.0  2.0  2.0  0.0  2.0  2.0  0.0     0.0  2.0  0.0  2.0  2.0  2.0  2.0
 0.0  2.0  0.0  0.0  2.0  0.0  0.0  2.0  …  1.0  2.0  0.0  2.0  2.0  2.0  2.0
 2.0  0.0  2.0  2.0  0.0  2.0  2.0  0.0     0.0  2.0  0.0  2.0  2.0  2.0  2.0
 2.0  0.0  1.0  2.0  0.0  1.0  2.0 

In [None]:
;cat smallGenFile.txt

In [1]:
"""
    add_genotypes(mme, M, G; header=false, center=true, rowID=false,
                  G_is_marker_variance=false, df=4)

Attach the genotype matrix `M` to the model `mme`.

`M` is passed to `readgenotypes` together with `rowID`, `header` and `center`, and
the result is stored in `mme.M`.

# Keywords
- `rowID`: used as given only when its length equals the number of rows of `M`;
  otherwise it is replaced by the strings `"1"`, `"2"`, … up to that row count.
- `header`: used as given only when its length equals the number of columns of `M`
  plus one; otherwise it is replaced by `"id"` followed by the column numbers as
  strings.
- `center`: forwarded to `readgenotypes`.
- `G_is_marker_variance`: when `true`, `G` is stored in `mme.M.G`; otherwise it is
  stored in `mme.M.genetic_variance`.
- `df`: converted to `Float64` and stored in `mme.df.marker`.

A one-line summary with the number of markers and individuals is printed.
"""
function add_genotypes(mme::MME,M::Array{Float64,2},G;header=false,center=true,rowID=false,G_is_marker_variance=false,df=4)
    nRows, nCols = size(M)

    # Fall back to generated labels whenever the supplied ones do not fit M.
    length(rowID) == nRows || (rowID = string.(1:nRows))
    length(header) == nCols + 1 || (header = ["id"; string.(1:nCols)])

    mme.M = readgenotypes(M, rowID=rowID, header=header, center=center)

    # G is either the marker variance itself or the genetic variance.
    if G_is_marker_variance == true
        mme.M.G = G
    else
        mme.M.genetic_variance = G
    end
    mme.df.marker = Float64(df)

    nMarkers     = size(mme.M.genotypes, 2)
    nIndividuals = size(mme.M.genotypes, 1)
    println(nMarkers, " markers on ", nIndividuals, " individuals were added.")
end

UndefVarError: UndefVarError: MME not defined