In [None]:
import os, logging
import pandas as pd
import numpy as np
from tqdm import tqdm

from helpers import get_ip_address, has_write_permission, save_preprocessed_data, read_preprocessed_data

### Set variables

In [None]:
# Raw-data root directory for each known host, keyed by that host's IP address.
data_locations = dict([
    ('223.195.111.48', '/project/datacamp/team11/data'),
    ('147.47.44.229', '/home/jinhyun/data/1kGP'),
])

# Chromosome labels "1" through "22", kept as strings for building file names.
chr_list = list(map(str, range(1, 23)))

# Genotype dict for converting string -> integer; codes follow the listing order (0..3).
gt_dict = {genotype: code for code, genotype in enumerate(("0|0", "0|1", "1|0", "1|1"))}

### Validation checks

In [None]:
# Resolve this machine's data directory from its IP address. Hosts that are not
# listed in data_locations fall back to the placeholder '/not_found', which is
# expected not to exist and therefore trips the first check below.
host_ip = get_ip_address()  # looked up once so the path and the error message agree
raw_data_path = data_locations.get(host_ip, '/not_found')
sample_annotation_file = os.path.join(raw_data_path, "igsr-1000 genomes 30x on grch38.tsv")
preprocess_path = os.path.join(raw_data_path, "preprocessed")

# The data root and the preprocessed directory are checked separately so that each
# failure names the path that is actually missing: a wrong host/IP mapping and a
# missing "preprocessed" directory are different problems with different fixes.
assert os.path.exists(raw_data_path), f"Data path does not exist: {raw_data_path} OR IP setting is incorrect: {host_ip}"
assert os.path.exists(preprocess_path), f"Preprocessed data path does not exist: {preprocess_path}"
assert os.path.isfile(sample_annotation_file), f"File not exists : {sample_annotation_file}"
# The merged output is written under preprocess_path later in the notebook, so fail early
# if the helper reports that the directory is not writable.
assert has_write_permission(preprocess_path), f"You do not have write permission for {preprocess_path}"

In [None]:
# Load the sample annotation table; the file is parsed as tab-separated text.
sample_annotation_df = pd.read_csv(sample_annotation_file, sep="\t")
# Print the (rows, columns) shape as a quick sanity check that the load worked.
print(f"Read sample annotation info with shape : {sample_annotation_df.shape}")

### Combine the genotype matrix

In [None]:
# Path prefix (inside the preprocessed directory) that is handed to
# save_preprocessed_data when the merged result is written out.
merged_file_save_prefix = os.path.join(preprocess_path, "merged")

In [None]:
# Load the preprocessed output of every chromosome in chr_list order, then merge
# them: genotype arrays are joined along axis 1, variant info tables are stacked
# row-wise with a fresh index.
# NOTE(review): this presumably relies on each genotype array having one column per
# row of its variant info table — confirm against helpers.read_preprocessed_data.
genotype_array_list, variant_info_df_list = [], []
missing_chr_list = []
# The loop variable is deliberately not called `chr`: that name would shadow the
# builtin chr() and stay bound in the kernel for every later cell.
for chr_name in chr_list:
    gt_array, variant_info_df = read_preprocessed_data(os.path.join(preprocess_path, f"chr{chr_name}"))

    # A None genotype array is treated as "nothing available for this chromosome".
    if gt_array is None:
        missing_chr_list.append(chr_name)
        continue
    genotype_array_list.append(gt_array)
    variant_info_df_list.append(variant_info_df)

# Make skipped chromosomes visible instead of silently producing a partial merge.
if missing_chr_list:
    print(f"WARNING: no preprocessed data loaded for chr {', '.join(missing_chr_list)}; excluded from the merge")

# np.concatenate on an empty list raises an unhelpful ValueError; raise the same
# exception type with a message that points at the actual problem.
if not genotype_array_list:
    raise ValueError(f"No preprocessed chromosome data could be read from {preprocess_path}")

genotype_array_combined = np.concatenate(genotype_array_list, axis=1)
variant_info_df_combined = pd.concat(variant_info_df_list, axis=0, ignore_index=True)
print(f"Combine result: genotype array with shape {genotype_array_combined.shape} and variant info with shape {variant_info_df_combined.shape}")

In [None]:
# Write the combined genotype array and variant info table using the merged prefix.
# NOTE(review): the output file names and formats are decided inside
# helpers.save_preprocessed_data and are not visible from this notebook.
save_preprocessed_data(genotype_array_combined, variant_info_df_combined, merged_file_save_prefix)