In [1]:
from scipy.spatial import distance
import numpy as np
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt


In [10]:
# Shell escape: list the contents of ../Data so the available input files are visible in the notebook.
!ls ../Data

DistanceMatrices	popfile_1240K_5000.txt	popfile_HO_3000.txt
popfile_1240K_1000.txt	popfile_HO_1000.txt	popfile_HO_4000.txt
popfile_1240K_2000.txt	popfile_HO_2000a.txt	popfile_HO_5000.txt
popfile_1240K_3000.txt	popfile_HO_2000.txt
popfile_1240K_4000.txt	popfile_HO_3000a.txt


In [5]:
# Per-sample metadata for the Yemen samples, indexed by sample Id.
# The column labels are replaced by a single 'Population' label (this requires the
# file to have exactly one non-index column) so the frame can be stacked with the
# reference-panel labels.
yemenMeta = (
    pd.read_csv('../Metadata/yemenRegions.csv', index_col='Id')
    .set_axis(['Population'], axis='columns')
)

In [7]:
# Which Reich v44.3 annotation file to use; index 1 picks 'HO'.
reichset = ['1240K', 'HO'][1]

# Positional indices of the Id / Date / Group_Label / Country fields.
# They differ between the two .anno layouts, hence one entry per panel.
columns2keep = {
    '1240K': [1, 9, 12, 14],
    'HO': [1, 5, 7, 9],
}[reichset]
columnNames = ['Id', 'Date', 'Group_Label', 'Country']

# Read the tab-separated annotation table and keep only the four fields above.
reich = pd.read_csv(f"../Reich/v44.3_{reichset}_public.anno", sep='\t').iloc[:, columns2keep]
reich.columns = columnNames

## QC: drop samples whose group label is flagged as ignored or as an outlier
reich = reich.loc[~reich['Group_Label'].str.startswith("Ignore_")]
reich = reich.loc[~reich['Group_Label'].str.endswith("_outlier")]

## Ancient subset
# NOTE(review): presumably Date is an age in years, so "> 10" separates ancient
# from present-day samples -- confirm against the .anno column definition.
reichAncient = reich.loc[reich['Date'] > 10]
print(reichAncient.shape)

# Reduce the reference panel to an Id -> Population lookup and stack it under
# the Yemen metadata so every sample has one 'Population' label.
reich = reich.set_index('Id')[['Group_Label']].rename(columns={'Group_Label': 'Population'})
meta = pd.concat([yemenMeta, reich])

(5392, 4)


### Loading PCA data

In [8]:
# Principal-component table (tab separated).
pca = pd.read_csv('../FlashPCAResults/YemenReichHO/pcs.txt', delimiter='\t')

# The join key is the part of IID after its last underscore.
ids = pca['IID'].map(lambda iid: iid.split('_')[-1]).tolist()
pca = pca.assign(FID1=ids).set_index('FID1')

# Attach the population label to each sample; samples without a label are dropped.
ddf = pca.join(meta).dropna(subset=['Population'])

### Population distances and Hierarchical clustering of populations
Goals:
* Combine highly similar populations (e.g. GujaratiA, GujaratiB). This brings more statistical power to subsequent analyses such as F3 and F4.
* See where our Yemen populations fall.
Method: for two populations A and B, calculate all pairwise Euclidean distances between the samples of A and the samples of B (|A| × |B| distances, e.g. 48), then average them.  
A sample is represented by its first 10 PCs (PC1-PC10).
https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist

In [9]:
def popdistance(pop1, pop2, data=None, pc_columns=slice(2, 12)):
    """Mean pairwise Euclidean distance between the samples of two populations.

    Every sample of ``pop1`` is compared with every sample of ``pop2`` using
    ``scipy.spatial.distance.cdist`` and the resulting distance matrix is
    averaged into a single number.

    Parameters
    ----------
    pop1, pop2 : label
        Values matched against the ``Population`` column of ``data``.
    data : pandas.DataFrame, optional
        Table holding a ``Population`` column plus the coordinate columns.
        When omitted, the notebook-level ``ddf`` is used, so existing
        two-argument calls behave as before.
    pc_columns : slice or list of int, optional
        Positional column selection handed to ``DataFrame.iloc``. The default
        ``slice(2, 12)`` takes ten columns starting at position 2
        (assumed to be PC1-PC10 -- confirm against the layout of ``ddf``).

    Returns
    -------
    float
        Mean of the ``len(pop1 samples) x len(pop2 samples)`` distance matrix.
    """
    if data is None:
        # Fall back to the notebook-level table.
        data = ddf
    labels = data['Population']
    XA = data.loc[labels == pop1].iloc[:, pc_columns].to_numpy()
    XB = data.loc[labels == pop2].iloc[:, pc_columns].to_numpy()
    return distance.cdist(XA, XB, 'euclidean').mean()

In [None]:
# Alphabetically ordered list of the distinct population labels present in ddf.
pops = sorted(set(ddf['Population']))

In [12]:
precalc = True ## for the time being, note, D-matrix currently only for 678 populations
if precalc:
    # Reuse a previously saved population-by-population distance matrix.
    D1 = pd.read_csv('../Data/DistanceMatrices/distanceMatrix_thinnedContext.csv', index_col=0)
else:
    # Recompute from scratch. Labels live in the 'Population' column of ddf.
    pops = sorted(Counter(ddf.Population))
    D = np.zeros((len(pops), len(pops)))
    # Only the strict upper triangle is computed; the distance is symmetric.
    for i1 in range(len(pops)):
        for i2 in range(i1+1, len(pops)):
            D[i1,i2] = popdistance(pops[i1], pops[i2])
    # Mirror the upper triangle into the lower one. The diagonal stays 0
    # (within-population distances are not computed).
    D += D.T
    D1 = pd.DataFrame(D, index=pops, columns=pops)
    # Save next to the precomputed matrices (same directory as read above).
    D1.to_csv(f'../Data/DistanceMatrices/distanceMatrix_{len(pops)}_{reichset}.csv')
       