In [1]:
import os, logging
import pandas as pd
import numpy as np
from tqdm import tqdm

from helpers import get_ip_address, has_write_permission, save_preprocessed_data, read_preprocessed_data

### Set variables

In [2]:
# Host IP address -> directory holding the raw data on that machine.
data_locations = dict([
    ('223.195.111.48', '/project/datacamp/team11/data'),
    ('147.47.44.229', '/home/jinhyun/data/1kGP'),
])

# Chromosome labels "1" .. "22", kept as strings.
chr_list = list(map(str, range(1, 23)))

# Genotype dict for converting string -> integer code (code = position in the tuple).
gt_dict = {genotype: code for code, genotype in enumerate(("0|0", "0|1", "1|0", "1|1"))}

### Validation checks

In [3]:
# Resolve this machine's data directory from its IP address. Hosts that are not
# listed in data_locations fall back to the '/not_found' placeholder, which makes
# the first check below fail with a message naming the offending IP.
host_ip = get_ip_address()
raw_data_path = data_locations.get(host_ip, '/not_found')
sample_annotation_file = os.path.join(raw_data_path, "igsr-1000 genomes 30x on grch38.tsv")
preprocess_path = os.path.join(raw_data_path, "preprocessed")

# Check the raw data directory and the preprocessed sub-directory separately so
# that a missing "preprocessed" folder is not misreported as a bad data path / IP.
assert os.path.isdir(raw_data_path), f"Data path not exists: {raw_data_path} OR IP setting is incorrect: {host_ip}"
assert os.path.isdir(preprocess_path), f"Preprocessed data path not exists: {preprocess_path}"
assert os.path.isfile(sample_annotation_file), f"File not exists : {sample_annotation_file}"
assert has_write_permission(preprocess_path), f"You do not have write permission for {preprocess_path}"

In [4]:
# Load the tab-separated sample annotation table and report its dimensions.
sample_annotation_df = pd.read_csv(
    sample_annotation_file,
    sep="\t",
)
_annotation_shape = sample_annotation_df.shape
print(f"Read sample annotation info with shape : {_annotation_shape}")

Read sample annotation info with shape : (3202, 9)


### Combine the per-chromosome genotype matrices

In [5]:
# Path prefix for the merged outputs; it is passed to save_preprocessed_data further down.
# The recorded output of that call shows files named <prefix>_matrix.npy and <prefix>_variant.csv.
merged_file_save_prefix = os.path.join(preprocess_path, "merged")

In [6]:
# Load every chromosome's preprocessed genotype matrix and variant table, then
# join them into one matrix (variants concatenated along axis 1) and one
# variant-info DataFrame (rows stacked, index renumbered).
genotype_array_list, variant_info_df_list = [], []
missing_chromosomes = []
for chrom in chr_list:  # named `chrom` so the builtin chr() is not shadowed in later cells
    gt_array, variant_info_df = read_preprocessed_data(os.path.join(preprocess_path, f"chr{chrom}"))

    if gt_array is None:
        # Remember what was skipped instead of dropping it silently.
        missing_chromosomes.append(chrom)
        continue

    genotype_array_list.append(gt_array)
    variant_info_df_list.append(variant_info_df)

if missing_chromosomes:
    logging.warning("No preprocessed data for chromosome(s): %s", ", ".join(missing_chromosomes))

# np.concatenate raises an opaque ValueError on an empty list; fail with a clear message instead.
assert genotype_array_list, f"No preprocessed chromosome data found under {preprocess_path}"

genotype_array_combined = np.concatenate(genotype_array_list, axis=1)
variant_info_df_combined = pd.concat(variant_info_df_list, axis=0, ignore_index=True)

# Each matrix column must correspond to exactly one variant-info row.
assert genotype_array_combined.shape[1] == len(variant_info_df_combined), (
    f"Variant count mismatch: genotype array has {genotype_array_combined.shape[1]} columns "
    f"but variant info has {len(variant_info_df_combined)} rows"
)
print(f"Combine result: genotype array with shape {genotype_array_combined.shape} and variant info with shape {variant_info_df_combined.shape}")

Reading data from files /home/jinhyun/data/1kGP/preprocessed/chr1_matrix.npy and /home/jinhyun/data/1kGP/preprocessed/chr1_variant.csv
Read genotype array of shape (3202, 5013617) and variant info dataframe of shape (5013617, 4)
Reading data from files /home/jinhyun/data/1kGP/preprocessed/chr2_matrix.npy and /home/jinhyun/data/1kGP/preprocessed/chr2_variant.csv
Read genotype array of shape (3202, 5318178) and variant info dataframe of shape (5318178, 4)
Reading data from files /home/jinhyun/data/1kGP/preprocessed/chr3_matrix.npy and /home/jinhyun/data/1kGP/preprocessed/chr3_variant.csv
Read genotype array of shape (3202, 4361093) and variant info dataframe of shape (4361093, 4)
Reading data from files /home/jinhyun/data/1kGP/preprocessed/chr4_matrix.npy and /home/jinhyun/data/1kGP/preprocessed/chr4_variant.csv
Read genotype array of shape (3202, 4264103) and variant info dataframe of shape (4264103, 4)
Reading data from files /home/jinhyun/data/1kGP/preprocessed/chr5_matrix.npy and /ho

In [7]:
# Write the combined genotype matrix and variant-info table to disk under the "merged" prefix.
# NOTE(review): save_preprocessed_data is defined in helpers; the on-disk format is only
# inferred from its printed output (.npy matrix + .csv variant table) — confirm there.
save_preprocessed_data(genotype_array_combined, variant_info_df_combined, merged_file_save_prefix)

genotype matrix shape (#samples, #features) : (3202, 61599150) -> saved to /home/jinhyun/data/1kGP/preprocessed/merged_matrix.npy
variant info dataframe (#features, ): (61599150, 4) -> saved to /home/jinhyun/data/1kGP/preprocessed/merged_variant.csv
