# Yemen + UAE context: HGDP

Overlap:329K variants
Just including 169 Yemeni samples

Requires a recent version of bokeh, to use legend labels.
Use conda environment impute on btc server.


In [1]:
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt

from bokeh.plotting import figure, show
from myutils import colors, colors2, markers

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

%matplotlib notebook
from bokeh.io import output_notebook
output_notebook()

In [13]:
markers

{'Sad': 'circle',
 'Amr': 'dot',
 'San': 'star',
 'Dhm': 'x',
 'Mhw': 'circle',
 'Haj': 'x',
 'Hdr': 'circle',
 'Shb': 'x',
 'Ibb': 'circle',
 'Tiz': 'x',
 'Jwf': 'circle',
 'Mrb': 'x',
 'Byd': 'star',
 'Dal': 'circle',
 'Lahj': 'x',
 'Abyn': 'dot',
 'Rsa': 'circle'}

### Loading PCA results

PCA was done for the merged dataset HGDP+ Yemen169.
Performed with FlashPCA, which gives up to 10 PCs.
Annoyingly, the Ids were of different format to the ones in the meta data, so had to fix that:

urn:wtsi:402769_H09_3577STDY6068568 -> 3577STDY6068568


In [2]:
pca = pd.read_csv(f'SWAsia_UAE/pcs.txt', delimiter='\t')
ids = [iid.split('_')[-1] for iid in pca['IID']]
pca['Id'] = ids
pca.set_index('Id', inplace=True)
pca

Unnamed: 0_level_0,FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12551,1,12551,-0.004323,0.082199,-0.000630,0.014921,-0.004459,-0.011395,-0.008377,0.037505,0.010839,0.005526
120001,1,120001,0.106806,0.123794,0.000588,0.021190,0.003744,0.000607,-0.017972,0.048858,0.020582,0.001086
HGDP00448,1,HGDP00448,-0.628873,-0.100136,0.013303,-0.011147,-0.049182,-0.061049,-0.334741,-0.069103,-0.021791,-0.005564
12572,2,12572,0.095784,0.147162,0.001922,0.003637,0.000735,0.003780,-0.000063,-0.032211,-0.012658,-0.001887
120002,2,120002,0.023246,0.060763,-0.007883,0.014814,-0.052158,-0.073848,0.005883,0.033054,0.008976,-0.001530
...,...,...,...,...,...,...,...,...,...,...,...,...
yemcha6089895,urn:wtsi:402772_H08_yemcha6089895,urn:wtsi:402772_H08_yemcha6089895,-0.394578,-0.011719,0.003811,-0.005275,0.040081,0.048341,0.169565,-0.042449,-0.016419,-0.003733
yemcha6089912,urn:wtsi:402772_H09_yemcha6089912,urn:wtsi:402772_H09_yemcha6089912,-0.425762,-0.024574,0.005122,-0.005644,0.038036,0.043751,0.158637,-0.022805,-0.002174,-0.003339
yemcha6103362,urn:wtsi:402772_H10_yemcha6103362,urn:wtsi:402772_H10_yemcha6103362,0.153160,0.177829,0.034390,-0.940099,0.000279,0.028669,-0.014442,0.041425,0.012257,0.001813
yemcha6103370,urn:wtsi:402772_H11_yemcha6103370,urn:wtsi:402772_H11_yemcha6103370,0.151877,0.174662,0.033086,-0.909215,0.001573,0.026077,-0.012832,0.036314,0.011791,0.000981


### Linking IDs to regions

Reading out the combined metadata 

In [3]:
meta = pd.read_csv("Metadata/allRegions.csv", index_col='Id')
hgdpMeta = pd.read_csv("HGDP/HGDPid_populations.csv.gz")
hgdpMeta = hgdpMeta[['Id', 'Region']].set_index('Id')

meta = pd.concat([meta, hgdpMeta])
'dubai' in set(meta.Region)

True

In [4]:
pca = pca.join(meta)
pca = pca.dropna(subset=['Region'])
pca

Unnamed: 0_level_0,FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,Region
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10031,489,10031,0.010982,0.111545,0.001678,0.013983,0.027266,0.048323,-0.000773,0.006160,0.003526,-0.006137,yemen
10049,492,10049,0.122976,0.127711,0.006431,0.010905,-0.028616,-0.056261,0.009054,0.004675,0.001162,-0.001559,abu_dhabi
10151,493,10151,0.077945,0.168194,0.000309,0.021677,0.058009,0.088829,-0.028231,0.015863,0.005191,-0.004856,yemen
10154,366,10154,0.111176,0.161050,-0.002004,0.017205,0.004974,0.003088,-0.015281,0.041456,0.009588,-0.000707,al_fujairah
10160,494,10160,0.106176,0.187748,0.003240,0.028427,0.085292,0.112373,-0.037923,0.028364,0.012048,0.000880,al_ain
...,...,...,...,...,...,...,...,...,...,...,...,...,...
yemcha6089908,urn:wtsi:402772_F09_yemcha6089908,urn:wtsi:402772_F09_yemcha6089908,-0.423641,-0.029730,0.007402,-0.001167,0.039073,0.047522,0.163804,-0.024693,-0.008964,-0.005269,Chad
yemcha6089910,urn:wtsi:402772_G09_yemcha6089910,urn:wtsi:402772_G09_yemcha6089910,-0.396792,-0.016044,0.004719,0.000192,0.042920,0.052515,0.183906,-0.047062,-0.010887,-0.006897,Chad
yemcha6089912,urn:wtsi:402772_H09_yemcha6089912,urn:wtsi:402772_H09_yemcha6089912,-0.425762,-0.024574,0.005122,-0.005644,0.038036,0.043751,0.158637,-0.022805,-0.002174,-0.003339,Chad
yemcha6089913,urn:wtsi:402772_A10_yemcha6089913,urn:wtsi:402772_A10_yemcha6089913,-0.396370,-0.009822,0.009087,0.000997,0.044040,0.052274,0.163494,-0.045535,-0.013838,-0.009691,Chad


In [5]:
## only use if you want to collapse all HGDP
collapseHGDP = False
if collapseHGDP:
    for i in ddf.index:
        if i.startswith('HGDP'):
            pca.at[i, 'region'] = 'HGDP'
pca

Unnamed: 0_level_0,FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,Region
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10031,489,10031,0.010982,0.111545,0.001678,0.013983,0.027266,0.048323,-0.000773,0.006160,0.003526,-0.006137,yemen
10049,492,10049,0.122976,0.127711,0.006431,0.010905,-0.028616,-0.056261,0.009054,0.004675,0.001162,-0.001559,abu_dhabi
10151,493,10151,0.077945,0.168194,0.000309,0.021677,0.058009,0.088829,-0.028231,0.015863,0.005191,-0.004856,yemen
10154,366,10154,0.111176,0.161050,-0.002004,0.017205,0.004974,0.003088,-0.015281,0.041456,0.009588,-0.000707,al_fujairah
10160,494,10160,0.106176,0.187748,0.003240,0.028427,0.085292,0.112373,-0.037923,0.028364,0.012048,0.000880,al_ain
...,...,...,...,...,...,...,...,...,...,...,...,...,...
yemcha6089908,urn:wtsi:402772_F09_yemcha6089908,urn:wtsi:402772_F09_yemcha6089908,-0.423641,-0.029730,0.007402,-0.001167,0.039073,0.047522,0.163804,-0.024693,-0.008964,-0.005269,Chad
yemcha6089910,urn:wtsi:402772_G09_yemcha6089910,urn:wtsi:402772_G09_yemcha6089910,-0.396792,-0.016044,0.004719,0.000192,0.042920,0.052515,0.183906,-0.047062,-0.010887,-0.006897,Chad
yemcha6089912,urn:wtsi:402772_H09_yemcha6089912,urn:wtsi:402772_H09_yemcha6089912,-0.425762,-0.024574,0.005122,-0.005644,0.038036,0.043751,0.158637,-0.022805,-0.002174,-0.003339,Chad
yemcha6089913,urn:wtsi:402772_A10_yemcha6089913,urn:wtsi:402772_A10_yemcha6089913,-0.396370,-0.009822,0.009087,0.000997,0.044040,0.052274,0.163494,-0.045535,-0.013838,-0.009691,Chad


In [32]:

selection = ['Asia', 'abu_dhabi', 'Chad', 'Europe', 'Middle_Est', 'Subsaharian_Africa', 'dubai', 'rak', 'al_ain', 'sharjah', 'yemen', 'Druze', 'Palestinian', 'Rsa', 'al_fujairah', 'North_Africa', 'BedouinA', 'Brahui', 'Mozabite', 'india', 'Hdr', 'Balochi', 'BedouinB', 'Ibb', 'Kalash', 'Adygei', 'uae', 'Tiz', 'Hazara', 'Sindhi_Pakistan', 'oman', 'Jwf', 'San', 'um_alquwain', 'Shb', 'ajman', 'Dal', 'Haj', 'qatar', 'kuwait', 'egypt', 'Abyn', 'Dhm', 'Amr', 'bahrain', 'palestine', 'saudi_arabia', 'Sad', 'Lahj', 'africa', 'Mrb', 'iran', 'sudan', 'Mhw', 'Byd', 'syria']
selectionLocal = ['abu_dhabi',  'dubai', 'rak', 'al_ain', 'sharjah', 'yemen', 'Rsa', 'al_fujairah', 'Hdr', 'Ibb','uae', 'Tiz',  'Jwf', 'San', 'um_alquwain', 'Shb', 'ajman', 'Dal', 'Haj', 'Abyn', 'Dhm', 'Amr',  'Mhw', 'Byd', ]
yemenRegions = ['Rsa',  'Hdr', 'Ibb','uae', 'Tiz',  'Jwf', 'San', 'Shb', 'ajman', 'Dal', 'Haj', 'Abyn', 'Dhm', 'Amr',  'Mhw', 'Byd', 'Lahj', 'Mrb']
uaeEmirates = ['abu_dhabi',  'dubai', 'rak', 'al_ain', 'sharjah', 'al_fujairah', 'um_alquwain',  'ajman']

In [29]:
'Chad' in colors

False

In [33]:

p = figure(width=1400, height=1200)

x = pca[['PC1', 'PC2']].values
y = pca['Region'].values
colors['Chad'] = 'red'
for e in selection:
    size = 7 if e in selectionLocal else 4
    marker = 'square'
    if e in yemenRegions: marker = 'triangle'
    if e in uaeEmirates: marker = 'circle'
    if e in colors:  
        p.scatter(x[y==e].T[0], x[y==e].T[1], 
            color=colors.get(e, 'whitesmoke'), 
            size=size,
            marker=marker,
            alpha=1, muted_color='grey', muted_alpha = 0.1, legend_label=e
        )
p.legend.location = "top_left"
p.legend.click_policy = 'mute'
show(p)

In [11]:
f = 'region'
x = ddf[['PC1', 'PC3']].values
y = ddf[f].values

In [12]:

fig, ax = plt.subplots(nrows=12, ncols=3, figsize=(16,18))
i,j = 0,0
for pc1 in range(1,10):
    for pc2 in range(pc1+1,10):

        x = ddf[['PC%s'%pc1, 'PC%s'%pc2]].values
        #plt.figure(figsize=(9.5, 8))
        for e in order[f]:
            n = sum(y == e)
            ax[i][j].set_title('PC %s vs PC %s' %(pc1, pc2))
            ax[i][j].scatter(
                x[y==e].T[0], x[y==e].T[1], label=e,
                c=colors[f][e],
                marker=markers[f][e]
            )
        j+=1
        if j==3: 
            j=0; i+=1
#plt.legend(bbox_to_anchor=(0.9667, -.1), ncol=3)

ax.tight_layout()

<IPython.core.display.Javascript object>

NameError: name 'order' is not defined

In [None]:
dff[['PC1', 'PC2', 'region', 'SUPPLIER SAMPLE NAME']].sort_values(by='PC2')

In [None]:

x = ddf[['PC1', 'PC2', 'PC3']].values
y = ddf[f].values

fig = plt.figure(figsize=(9.5, 8))
ax = fig.add_subplot(projection='3d')

for e in order[f]:
    n = sum(y == e)
    ax.scatter(
        x[y==e].T[0], x[y==e].T[1], x[y==e].T[2], label=e,
        color=[colors[f][e]] * n,
        marker=markers[f][e]
    )

ax.view_init(azim=46, elev=-43)
plt.legend(bbox_to_anchor=(0.9667, 0.1667), ncol=3)

In [None]:
from numpy import random
100*random.rand(2)

In [None]:
from math import pi

import pandas as pd

from bokeh.io import output_file, show
from bokeh.palettes import Category20c
from bokeh.plotting import figure
from bokeh.transform import cumsum

x = { 'United States': 157, 'United Kingdom': 93, 'Japan': 89, 'China': 63,
    'Germany': 44, 'India': 42, 'Italy': 40, 'Australia': 35,
    'Brazil': 32, 'France': 31, 'Taiwan': 31, 'Spain': 29 }

data = pd.Series(x).reset_index(name='value').rename(columns={'index':'country'})
data['angle'] = data['value']/data['value'].sum() * 2*pi
data['color'] = Category20c[len(x)]

p = figure(plot_height=200, plot_width=600, title="PCA + Admixture", x_range=(0, 100), y_range=(0, 100),
    tooltips="@country: @value")

for i in range(100):
    x, y = 100* random.rand(2)
    p.wedge(x=x, y=y, radius=2,
            start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color="white", fill_color='color',  source=data)

p.wedge(x=1, y=1.3, radius=0.3,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', source=data)

show(p)

In [None]:
cumsum?

In [None]:
data

In [None]:
!pwd

In [None]:
hgdpSampleInfo = pd.read_csv("HGDP/HGDPid_populations.csv", sep=',', index_col='Id')

In [None]:
admixData = "Admixture/Yemen1240K/yemen_clean_reich1240K4.16.Q"
q = pd.read_csv(admixData, header=None, sep=' ')

q