In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import itertools
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Genotype vocabulary: every unordered pair of the four bases (letters sorted within the pair,
# so "TA" and "AT" collapse to "AT") plus the "--" pair, deduplicated and sorted by np.unique.
all_geno_combs = np.unique(
    ["".join(sorted(allele_pair))
     for allele_pair in [*itertools.product("ATGC", repeat = 2), ("-", "-")]]
)
# Directory that holds the input CSVs and receives the output CSVs of this notebook.
cur_source = "./data/var_autoencoder/aggression_indep_kl/"

In [3]:
def to_one_hot(df, vocab, stack_axis = 1):
    """One-hot encode an array of class labels against a vocabulary.

    Parameters
    ----------
    df : array-like
        Values to encode; converted with ``np.array`` before comparison.
    vocab : iterable
        Classes to test for. One 0/1 indicator array is built per entry,
        in iteration order.
    stack_axis : int, default 1
        Position at which the new class axis is inserted (forwarded to
        ``np.stack``).

    Returns
    -------
    np.ndarray
        float16 array with the shape of the input plus one extra axis of
        length ``len(vocab)`` at ``stack_axis``. An entry is 1 where the
        input equals the corresponding class and 0 elsewhere, so values
        missing from ``vocab`` encode as all zeros.
    """
    values = np.array(df)
    indicators = []
    for cur_class in vocab:
        # Element-wise equality mask for this class, cast to float16.
        indicators.append(np.float16(values == cur_class))
    return np.stack(indicators, axis = stack_axis)

In [4]:
def _read_genos(file_name, first_col, last_col):
    """Read one CSV from ``cur_source`` and one-hot encode columns ``first_col:last_col``.

    Returns the raw DataFrame together with the float16 one-hot array produced by
    ``to_one_hot`` over ``all_geno_combs``.
    """
    frame = pd.read_csv(cur_source + file_name)
    encoded = to_one_hot(frame.to_numpy()[:, first_col:last_col], all_geno_combs)
    return frame, encoded


# Children: the first three and last two columns are excluded from the encoding
# (presumably metadata such as "pop" rather than genotype calls; verify against the CSV schema).
child_genos_test, child_genos_test_one_hot = _read_genos("child_genos_test.csv", 3, -2)
child_genos_train, child_genos_train_one_hot = _read_genos("child_genos_train.csv", 3, -2)

# Parents: only the first and last columns are excluded.
p1_test_genos_df, p1_test_genos_one_hot = _read_genos("p1_test_genos_df.csv", 1, -1)
p2_test_genos_df, p2_test_genos_one_hot = _read_genos("p2_test_genos_df.csv", 1, -1)

p1_train_genos_df, p1_train_genos_one_hot = _read_genos("p1_train_genos_df.csv", 1, -1)
p2_train_genos_df, p2_train_genos_one_hot = _read_genos("p2_train_genos_df.csv", 1, -1)

# Element-wise product of the two parents' one-hot arrays: an entry stays 1 only where
# both parents carry the same genotype class at that position. Assumes row i of the p1
# and p2 files describes the same parent pair.
# NOTE(review): confirm that the product (rather than a sum or concatenation) is intended.
all_p_test_one_hot = np.multiply(p1_test_genos_one_hot, p2_test_genos_one_hot)
all_p_train_one_hot = np.multiply(p1_train_genos_one_hot, p2_train_genos_one_hot)

In [5]:
# Collapse each split to its distinct parent-pair encodings (one flattened row per pair).
#
# np.unique(axis=0) returns the distinct rows in lexicographic order of the encoding.
# Taking np.unique of the "pop" column on its own sorts the labels by name instead, which
# is unrelated to that row order (and the two counts can even differ), so labels and
# vectors would not line up. return_index gives the row of the first occurrence of every
# distinct encoding; reading "pop" from that same row keeps label i attached to vector i.
# The all_p_* arrays are built row-for-row from the p1_* frames, so the indices apply to both.
# Note: the *_pops arrays now hold one label per distinct vector, in vector order; they
# are no longer sorted or deduplicated by name.
train_p_flat = all_p_train_one_hot.reshape((all_p_train_one_hot.shape[0], -1))
unique_train_p, train_first_rows = np.unique(train_p_flat, axis = 0, return_index = True)
unique_train_p_pops = p1_train_genos_df["pop"].to_numpy()[train_first_rows]

test_p_flat = all_p_test_one_hot.reshape((all_p_test_one_hot.shape[0], -1))
unique_test_p, test_first_rows = np.unique(test_p_flat, axis = 0, return_index = True)
unique_test_p_pops = p1_test_genos_df["pop"].to_numpy()[test_first_rows]

In [43]:
# Pairwise Hamming distances between all distinct parent-pair encodings,
# with the test rows placed ahead of the train rows.
all_p_vec = np.concatenate((unique_test_p, unique_train_p), axis = 0)
# One "test"/"train" tag per pop label, in the same test-then-train order.
data_labels = ["test"] * len(unique_test_p_pops) + ["train"] * len(unique_train_p_pops)
pairwise_p_dist = pairwise_distances(all_p_vec, metric="hamming")
# Square distance matrix plus two trailing annotation columns.
pairwise_p_dist_df = pd.DataFrame(pairwise_p_dist).assign(
    pop = np.concatenate((unique_test_p_pops, unique_train_p_pops)),
    data_type = data_labels,
)
pairwise_p_dist_df.to_csv(cur_source + "p_genos_pairwise_dist.csv")

In [44]:
# Flatten each child's one-hot block into a single feature row (one row per child).
child_genos_test_long = child_genos_test_one_hot.reshape(len(child_genos_test_one_hot), -1)
child_genos_train_long = child_genos_train_one_hot.reshape(len(child_genos_train_one_hot), -1)

In [39]:
# "pop" label of every child row, in file order, for the test and train splits.
test_c_pops, train_c_pops = (
    child_frame["pop"].to_numpy()
    for child_frame in (child_genos_test, child_genos_train)
)

In [45]:
# Pairwise Hamming distances between all children's flattened one-hot rows,
# with the test rows placed ahead of the train rows.
all_c_vec = np.concatenate((child_genos_test_long, child_genos_train_long), axis = 0)
# One "test"/"train" tag per child, in the same test-then-train order.
data_labels = ["test"] * len(test_c_pops) + ["train"] * len(train_c_pops)
pairwise_c_dist = pairwise_distances(all_c_vec, metric="hamming")
# Square distance matrix plus two trailing annotation columns.
pairwise_c_dist_df = pd.DataFrame(pairwise_c_dist).assign(
    pop = np.concatenate((test_c_pops, train_c_pops), axis = 0),
    data_type = data_labels,
)
pairwise_c_dist_df.to_csv(cur_source + "c_genos_pairwise_dist.csv")