In [1]:
import pathlib

import numpy as np
import pandas as pd
import tqdm
import tqdm.auto

In [2]:
# Load the test-set table, keeping only its 'IID' column.
test_iids = pd.read_csv(
    '../data/ukb_populations/test_all.tsv',
    usecols=['IID'],
    sep='\t',
)

In [None]:
# For every test-set individual, accumulate over all pairs that link it to a
# non-test ("train") individual:
#   train_iid -> number of such pairs
#   PI_HAT    -> sum of PI_HAT over those pairs
#   DST       -> sum of DST over those pairs
# The pairwise file is streamed in chunks so it never has to fit in memory.
total_sums = pd.DataFrame()

# Unique test IDs for membership tests. Using `isin` instead of merging keeps
# a pair from being counted more than once if an IID is repeated in the list.
test_iid_values = test_iids['IID'].unique()

# Raw string: '\s' is not a valid Python escape sequence, so the non-raw form
# triggers an invalid-escape warning even though the value is the same.
reader = pd.read_csv('../data/ukb_distances/ibd_combined.txt.gz', chunksize=1_000_000,
                     sep=r'\s+', usecols=['IID1', 'IID2', 'PI_HAT', 'DST'])

# total=750 only sizes the progress bar (presumably the approximate number of
# chunks in the file - TODO confirm); it does not affect the computation.
for chunk_df in tqdm.auto.tqdm(reader, total=750):
    first_is_test = chunk_df['IID1'].isin(test_iid_values)
    second_is_test = chunk_df['IID2'].isin(test_iid_values)

    # Keep only pairs with exactly one member in the test set; test/test and
    # train/train pairs are dropped.
    is_test_train = first_is_test ^ second_is_test
    if not is_test_train.any():
        continue

    pairs_df = chunk_df[is_test_train]
    first_is_test = first_is_test[is_test_train]

    chunk_sums_df = (
        pairs_df
        .assign(
            # Vectorised selection of which side of the pair is the test
            # individual and which is the train individual.
            test_iid=np.where(first_is_test, pairs_df['IID1'], pairs_df['IID2']).astype(int),
            train_iid=np.where(first_is_test, pairs_df['IID2'], pairs_df['IID1']).astype(int),
        )
        .filter(items=['test_iid', 'train_iid', 'PI_HAT', 'DST'])
        .groupby('test_iid')
        .agg({'train_iid': 'count', 'PI_HAT': 'sum', 'DST': 'sum'})
    )
    # Running element-wise sum; fill_value=0 treats a test_iid missing from
    # one side as contributing zero.
    total_sums = total_sums.add(chunk_sums_df, fill_value=0)

# Name the index explicitly so the ID column is always called 'test_iid',
# including when no chunk contributed any rows.
total_sums = total_sums.rename_axis('test_iid').reset_index()

total_sums.to_csv('../data/ukb_distances/ibd_complete.tsv', sep='\t', index=False)

total_sums.head()

In [4]:
# Show the size of the aggregated table as (n_rows, n_columns).
total_sums.shape

(32816, 4)