In [1]:
#==============================================================================#
# Header
#==============================================================================#
# ISU - Animal Breeding and Genetics Group
# Author: Mark Jian Cheng (with assistance from Hailin Su)
# Created (date): 07/05/2019
# Version (of the script): 1
# Program (if applicable): numpy pandas scikit-learn
# Program Version (if applicable): 
#==============================================================================#
# Description: Replace missing SNP genotypes (coded as 5) with the per-SNP mean in a large genotype file
#==============================================================================#

#==============================================================================#
# Setup
#==============================================================================#
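#Input files: ALGP2_Cycle_1_7_GenoA_Complete_JWAS.txt (genotypes), ChrInfo_map_1_7_Complete_JWAS.txt (SNP map with a "SNP" column)
#Output files: GenoA_Complete_JWAS_impute.txt - formatted genotype file ready for use in GenSel or JWAS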
#==============================================================================#

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer as Imputer

In [3]:
%time dataset = np.loadtxt('ALGP2_Cycle_1_7_GenoA_Complete_JWAS.txt', skiprows=1)

CPU times: user 1min 4s, sys: 2.08 s, total: 1min 6s
Wall time: 1min 6s


In [4]:
# Keep every row but drop column 0, leaving only the genotype columns.
# NOTE(review): column 0 is presumably the animal/sample ID - confirm against the input file.
X = dataset[:, 1:]  # get genotypes
# Bare expression: shows numpy's abbreviated repr of the matrix as a quick sanity check.
X

array([[0., 2., 0., ..., 2., 2., 2.],
       [1., 2., 0., ..., 2., 2., 2.],
       [0., 2., 0., ..., 2., 2., 2.],
       ...,
       [0., 2., 0., ..., 2., 2., 2.],
       [1., 2., 0., ..., 2., 2., 2.],
       [1., 1., 1., ..., 2., 2., 2.]])

In [5]:
# Configure mean imputation: any cell equal to 5 is treated as a missing
# genotype call and will be replaced by the mean of its column.
imputer = Imputer(strategy="mean", missing_values=5)

In [6]:
# Fit the imputer on the genotype matrix (timed). fit() returns the estimator,
# so the name `imputer` still refers to the same, now fitted, object.
%time imputer = imputer.fit(X)

CPU times: user 1.25 s, sys: 872 ms, total: 2.12 s
Wall time: 2.12 s


In [7]:
# Apply the fitted imputer to X (timed). The result is stored under a new name
# (Ximp) so the un-imputed matrix X stays available for comparison.
%time Ximp = imputer.transform(X)

CPU times: user 1.38 s, sys: 448 ms, total: 1.83 s
Wall time: 1.83 s


In [19]:
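# Disabled: earlier export attempt via ndarray.tofile (the timing recorded for this
# cell comes from that old run). The np.savetxt cell further down writes the output file instead.
# %time Ximp.tofile("out", sep=" ")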

CPU times: user 5min 16s, sys: 4.13 s, total: 5min 21s
Wall time: 5min 25s


In [8]:
# Load the SNP map table; the file is comma-separated, which is read_csv's default.
header = pd.read_csv("ChrInfo_map_1_7_Complete_JWAS.txt")

In [9]:
# Build the single header row for the output file: all SNP names from the map,
# in map order, separated by one space.
header_line = " ".join(header["SNP"].tolist())

In [10]:
# Preview only the first 100 characters of the header row instead of printing the whole string.
header_line[:100]

'AX-116642173 AX-116642174 AX-116642175 AX-116642176 AX-116642178 AX-116642181 AX-116642184 AX-116642'

In [11]:
%time np.savetxt("GenoA_Complete_JWAS_impute.txt", Ximp, header=header_line, fmt='%i', comments="")

CPU times: user 39.4 s, sys: 1.2 s, total: 40.6 s
Wall time: 41.1 s
