# PCA visualization
Using output files from SmartPCA or FlashPCA
SmartPCA has region info included, no need to merge with meta data

In [69]:
from bokeh.palettes import Dark2_5 as palette
import itertools
from bokeh.plotting import output_file, save

In [70]:
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from bokeh.palettes import Spectral4
from bokeh.core.enums import MarkerType
from bokeh.plotting import figure, show
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from matplotlib import rcParams
rcParams['figure.figsize'] = 11.7,8.27
%matplotlib notebook
from bokeh.io import output_notebook
output_notebook()

  from IPython.core.display import display, HTML


In [71]:
from myutils import colors, colors2, markers, markers2
import numpy as np

### Loading PCA results

PCA was done for the merged dataset HGDP+ Yemen169.
Performed with FlashPCA, which gives up to 10 PCs.
Annoyingly, the Ids were of different format to the ones in the meta data, so had to fix that:

urn:wtsi:402769_H09_3577STDY6068568 -> 3577STDY6068568


In [72]:
## global variables
# Region labels of the Yemen samples as they appear in the `Region` column of
# the PCA table. plotPCA draws these regions fully opaque (alpha=1), and
# getRegions puts them first whenever it does not read a popfile.
yregions = ('Sad', 'Amr', 'San', 'Dhm', 'Mhw', 'Haj', 'Hdr', 'Shb', 'Ibb', 'Tiz', 'Jwf', 'Mrb', 'Byd', 'Dal', 'Lahj', 'Abyn', 'Rsa')

def format(line):
    """Parse one whitespace-separated row of the PCA table.

    The first field is the sample ID, the last field is the region label and
    every field in between is converted to ``float``.

    IDs of the form ``urn:wtsi:402769_H09_3577STDY6068568`` are shortened to
    their last 15 characters (``3577STDY6068568``) so that they match the IDs
    used in the meta data. All other IDs are returned unchanged.

    NOTE: the name shadows the builtin ``format``; it is kept because
    ``getPCAdata`` calls the parser by this name.

    Returns
    -------
    list
        ``[sample_id, value_1, ..., value_n, region]``.
    """
    fields = line.split()
    iid = fields[0]
    # The previous test was `if iid.startswith:` -- a bound method, which is
    # always truthy -- so *every* ID was cut to 15 characters, including
    # longer IDs that are not in the urn format.
    if iid.startswith("urn:wtsi:"):
        iid = iid[-15:]
    return [iid, *map(float, fields[1:-1]), *fields[-1:]]

def getPCAdata(geoPCA=3000, dataset='HO'):
    """Load a PCA result table into a DataFrame indexed by sample ID.

    Parameters
    ----------
    geoPCA : int or None
        Distance cutoff of the regional PCA; ``geoPCA // 1000`` is appended
        to the file name. A falsy value selects the file without a suffix.
    dataset : str
        Dataset tag used to build the directory and the file name.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``FID1`` with columns ``PC1`` .. ``PCn`` and ``Region``.

    Raises
    ------
    OSError
        If the PCA file cannot be opened.
    """
    addon = geoPCA // 1000 if geoPCA else ""
    base = f"{dataset}_mind0.3_geno_0.2"
    wdir = f"../AdmixTools_{base}"
    pcafile = f'{wdir}/yemen_reich{base}.LD.QC{addon}.pcs.txt' ## careful, slight inconsist. naming convention HGDP/Reich

    # Context manager so the handle is closed even when a row fails to parse.
    with open(pcafile) as handle:
        rows = [
            format(line)
            for line in handle
            # skip blank lines (they would crash the row parser) and the
            # '#eigvals' header line
            if line.strip() and not line.strip().startswith('#eigvals')
        ]
    df = pd.DataFrame(rows)
    # Name the PC columns after the number of columns actually present
    # instead of assuming exactly ten components.
    nPCs = df.shape[1] - 2
    df.columns = ['FID1'] + [f'PC{i}' for i in range(1, nPCs + 1)] + ['Region']
    return df.set_index('FID1')

In [73]:
from bokeh.core.enums import MarkerType
#from bokeh import 
def plotPCA(ddf, regions, unmuted, width=1400, height=1600, saveFig=None, title="", nonYemenMarkers=None):
    """Draw an interactive Bokeh scatter plot of PC1 vs PC2, one glyph per region.

    Parameters
    ----------
    ddf : pandas.DataFrame
        PCA table with at least the columns ``PC1``, ``PC2`` and ``Region``.
    regions : iterable of str
        All regions to draw.
    unmuted : iterable of str
        Regions that start active; they are drawn (and listed in the legend)
        first. Every other region starts muted (grey) and can be toggled by
        clicking its legend entry.
    width, height : int
        Figure size passed to ``bokeh.plotting.figure``.
    saveFig : str or None
        Output path. ``*.html`` is written with ``save``; ``*.svg`` is
        written with ``export_svg``, for which Bokeh needs a webdriver on
        PATH (the recorded run raised ``RuntimeError`` without one).
    title : str
        Title of the saved HTML document.
    nonYemenMarkers : list of str or None
        Markers cycled through for regions without an entry in ``markers2``;
        defaults to ``['triangle']``.

    Returns
    -------
    The Bokeh figure.
    """
    if nonYemenMarkers is None:
        nonYemenMarkers = ['triangle']

    # Unmuted regions first, then the remaining ones in their given order.
    # dict.fromkeys keeps first-seen order and drops duplicates; the previous
    # set difference made the order (and with it the fallback colors, markers
    # and legend layout) change from run to run and drew duplicates twice.
    unmutedRegions = set(unmuted)
    regions = list(dict.fromkeys([*unmuted, *regions]))

    if saveFig and saveFig.endswith(".html"):
        output_file(filename=saveFig, title=title)
    p = figure(width=width, height=height)

    pcs = ddf[['PC1', 'PC2']].values
    labels = ddf['Region'].values

    bokehColors = itertools.cycle(palette)
    bokehMarkers = itertools.cycle(nonYemenMarkers)
    for region in regions:
        # Both cycles advance once per region, even when the region has an
        # explicit entry in colors/markers2 (same as before this rewrite).
        fallbackColor = next(bokehColors)
        fallbackMarker = next(bokehMarkers)
        selected = pcs[labels == region]
        glyph = p.scatter(
            selected[:, 0], selected[:, 1],
            color=colors.get(region, fallbackColor),
            marker=markers2.get(region, fallbackMarker),
            size=7,
            # Yemen regions are fully opaque, everything else half transparent
            alpha=1 if region in yregions else 0.5,
            muted_color='grey', muted_alpha=0.1, legend_label=region,
        )
        if region not in unmutedRegions:
            glyph.muted = True

    p.legend.location = "top_left"
    p.legend.click_policy = 'mute'
    show(p)
    if saveFig:
        if saveFig.endswith('.html'):
            result = save(p)
            print(result)
        elif saveFig.endswith('.svg'):
            # Imported here so the function does not depend on a later
            # notebook cell having been executed first.
            from bokeh.io import export_svg
            export_svg(p, filename=saveFig)
    return p
#        else:
#            
#            p.savefig(saveFig)


In [74]:
def getRegions(pcaDF, dataset, geoPCA=None, ancient=False, contemp=True):
    """Return the list of region labels to plot.

    Parameters
    ----------
    pcaDF : pandas.DataFrame
        PCA table; only its ``Region`` column is used (contemporary branch).
    dataset : str
        Dataset tag; only used to locate the popfile when ``geoPCA`` is set.
    geoPCA : int or None
        If truthy, the regions are read from
        ``../Data/popfile_{dataset}_{geoPCA}.txt`` (one label per line) and
        the remaining flags are ignored.
    ancient : bool
        Return ``yregions`` plus a fixed list of ancient Asian populations.
        Takes precedence over ``contemp``, as it did before.
    contemp : bool
        Return ``yregions`` plus the 40 most frequent regions of ``pcaDF``.

    Returns
    -------
    list of str
        Without ``geoPCA`` the list starts with ``yregions`` and contains no
        duplicates. With both flags false it is just ``yregions`` (this case
        used to fail with an unbound variable).
    """
    if geoPCA:
        # contemporary
        with open(f"../Data/popfile_{dataset}_{geoPCA}.txt") as popfile:
            return [line.strip() for line in popfile]

    if ancient:
        extra = ['Turkmenistan_Gonur_BA_1', 'Iran_C_TepeHissar', 'Israel_MLBA', 'Turkmenistan_Gonur_BA_1_lc', 'Uzbekistan_SappaliTepe_BA', 'Israel_C', 'Turkey_N', 'India_RoopkundB', 'India_RoopkundA', 'Taiwan_Hanben_IA', 'Pakistan_Katelai_IA', 'Pakistan_Loebanr_IA', 'Mongolia_EIA_Sagly_4', 'Pakistan_Udegram_IA', 'Jordan_LBA', 'Mongolia_EIA_SlabGrave_1', 'Turkmenistan_C_Geoksyur', 'Kyrgyzstan_TianShan_Hun.SG', 'Mongolia_LBA_Khovsgol_6', 'Turkey_Alalakh_MLBA', 'Turkey_Arslantepe_LateC', 'Syria_Ebla_EMBA', 'Turkey_Ikiztepe_LateC', 'China_SEastAsia_Coastal_LN']
    elif contemp:
        # most_common() on an empty table yields nothing instead of raising
        extra = [region for region, _ in Counter(pcaDF.Region).most_common(40)]
    else:
        extra = []

    # A Yemen region that is also among the most frequent ones must not be
    # listed (and therefore plotted) twice; dict.fromkeys keeps the order.
    return list(dict.fromkeys([*yregions, *extra]))


In [75]:
# Markers already assigned to the Yemen regions ...
ymarkers = {markers2[code] for code in yregions}
# ... and every other Bokeh marker type, minus 'star' and 'dot'.
nonYemenMarkers = [
    shape
    for shape in MarkerType
    if not (shape in ymarkers or shape in ['star', 'dot'])
]


In [76]:
from bokeh.io import export_svg

## Ancient - 1240K

In [None]:
# Load the 1240K PCA table without a distance suffix (geoPCA=None) and select
# the Yemen regions plus the fixed list of ancient populations.
dataset = "1240K"
pcaDF = getPCAdata(geoPCA=None , dataset = dataset)
regions = getRegions(pcaDF, dataset, ancient=True)


In [77]:
# All regions start unmuted (same list passed twice). The SVG variant is the
# active one; in the recorded run it raised RuntimeError because no webdriver
# (firefox + geckodriver or chromium + chromedriver) was found on PATH.
#plotPCA(pcaDF, regions, regions, width=1400, height=1200, saveFig=f"../Results/full_{dataset}_ancient_PCA.html", nonYemenMarkers=nonYemenMarkers)
plot = plotPCA(pcaDF, regions, regions, width=1400, height=1200, saveFig=f"../Results/full_{dataset}_ancient_PCA.svg", nonYemenMarkers=nonYemenMarkers)

RuntimeError: Neither firefox and geckodriver nor a variant of chromium browser and chromedriver are available on system PATH. You can install the former with 'conda install -c conda-forge firefox geckodriver'.

In [78]:
# Second attempt at the SVG export; the recorded run failed with the same
# missing-webdriver RuntimeError.
# NOTE(review): relies on `plot` from an earlier successful plotPCA call.
export_svg(plot, filename="bokehPlot_a1240K.svg")

RuntimeError: Neither firefox and geckodriver nor a variant of chromium browser and chromedriver are available on system PATH. You can install the former with 'conda install -c conda-forge firefox geckodriver'.

## Ancient - Human Origins

In [79]:
# Same as the 1240K cell, but for the Human Origins ("HO") table and saved as
# an interactive HTML file; all regions start unmuted.
dataset = "HO"
pcaDF = getPCAdata(geoPCA=None , dataset = dataset)
regions = getRegions(pcaDF, dataset, ancient=True)
plotPCA(pcaDF, regions, regions, width=1400, height=1200, saveFig=f"../Results/full_{dataset}_ancient_PCA.html", nonYemenMarkers=nonYemenMarkers)

/mnt/Drive1/ahenschel/SambaShare/YemenGenomeAnalysis/Results/full_HO_ancient_PCA.html


## Comprehensive World PCA
PCA was run on the complete Reich Human Origins dataset using smartPCA; ancient populations were excluded when spanning the principal components. The plot is interactive.

In [23]:
# World PCA: Yemen regions plus the 40 most frequent regions of the table
# (getRegions defaults), all unmuted, saved as HTML.
dataset = "HO"
pcaDF = getPCAdata(geoPCA=None , dataset = dataset)
regions = getRegions(pcaDF, dataset)
plotPCA(pcaDF, regions, regions, width=1400, height=1600, saveFig=f"../Results/full_{dataset}_PCA.html", nonYemenMarkers=nonYemenMarkers)

/mnt/Drive1/ahenschel/SambaShare/YemenGenomeAnalysis/Results/full_HO_PCA.html


## Regional PCA - 4000k (Human Origins)
The PCA plots below were calculated independently on selected populations (others might appear, but the principal components are derived only from locations selected by geographic (Haversine) distance).
Here, populations within 4000 km are included. Populations within 2000 km are active; the others are greyed out initially. Note that some populations that are >2000 km away are closer than some that are <2000 km.

In [24]:
# Regional PCA on the 4000 popfile; only the populations listed in the 2000
# popfile start unmuted.
geoPCA = 4000
dataset ="HO"
pcaDF = getPCAdata(geoPCA , dataset = dataset)
regions = getRegions(pcaDF, dataset, geoPCA)
# Context manager so the popfile handle is closed after reading.
with open(f"../Data/popfile_{dataset}_{2000}.txt") as popfile:
    regionsUnmuted = [line.strip() for line in popfile]
plotPCA(pcaDF, regions, regionsUnmuted, width=1400, height=4000, saveFig=f"../Results/region{geoPCA}_PCA.html", nonYemenMarkers=nonYemenMarkers)

/mnt/Drive1/ahenschel/SambaShare/YemenGenomeAnalysis/Results/region4000_PCA.html


## Regional PCA - 3000k (Human Origins)
including populations within 3000 km. Populations within 2000km are active, others are greyed out initially. Again note that some populations that are >2000k away are closer than some that are <2000k

In [25]:
# Regional PCA on the 3000 popfile; only the populations listed in the 2000
# popfile start unmuted.
geoPCA = 3000
# Set explicitly: the cell used to read `dataset` left over from whichever
# cell ran last, while the PCA table itself was hard-coded to "HO".
dataset = "HO"
pcaDF = getPCAdata(geoPCA , dataset = dataset)
regions = getRegions(pcaDF, dataset, geoPCA)
# Context manager so the popfile handle is closed after reading.
with open(f"../Data/popfile_{dataset}_{2000}.txt") as popfile:
    regionsUnmuted = [line.strip() for line in popfile]
plotPCA(pcaDF, regions, regionsUnmuted, width=3000, height=1800, saveFig=f"../Results/region{geoPCA}_PCA.html", nonYemenMarkers=nonYemenMarkers)

/mnt/Drive1/ahenschel/SambaShare/YemenGenomeAnalysis/Results/region3000_PCA.html


## Regional PCA - 2000k (Human Origins)
including populations within 2000 km. 

In [15]:
# Regional PCA on the 2000 popfile; every listed population starts unmuted.
geoPCA = 2000
# Set explicitly: the cell used to read `dataset` left over from whichever
# cell ran last, while the PCA table itself was hard-coded to "HO".
dataset = "HO"
pcaDF = getPCAdata(geoPCA, dataset = dataset)
regions = getRegions(pcaDF, dataset, geoPCA)
#regionsUnmuted = [line.strip() for line in open(f"../Data/popfile_{dataset}_{2000}.txt")]
plotPCA(pcaDF, regions, regions, width=3000, height=1400, saveFig=f"../Results/region{geoPCA}_PCA.html", nonYemenMarkers=nonYemenMarkers)

/mnt/Drive1/ahenschel/SambaShare/YemenGenomeAnalysis/Results/region2000_PCA.html


## Regional PCA - 1000k (Human Origins)
including populations within 1000 km. 

In [16]:
# Regional PCA on the 1000 popfile; every listed population starts unmuted.
geoPCA = 1000
# Set explicitly: the cell used to read `dataset` left over from whichever
# cell ran last, while the PCA table itself was hard-coded to "HO".
dataset = "HO"
pcaDF = getPCAdata(geoPCA, dataset = dataset)
regions = getRegions(pcaDF, dataset, geoPCA)
#regionsUnmuted = [line.strip() for line in open(f"../Data/popfile_{dataset}_{2000}.txt")]
plotPCA(pcaDF, regions, regions, width=3000, height=1400, saveFig=f"../Results/region{geoPCA}_PCA.html", nonYemenMarkers=nonYemenMarkers)

/mnt/Drive1/ahenschel/SambaShare/YemenGenomeAnalysis/Results/region1000_PCA.html


In [None]:
## population selection
## Eg. only ancient from Asia, at least 10 samples
## readReich like in preparePopsF3Adm.py (incl. regions, ie. continents)

#reich = readReich('1240K', allRegions=True)
#lu = lambda p: list(r2[r2.Group_Label==p]['Region'])[0] ## lookup Group_Label -> continent
#ancient10 = [(pop,lu(pop)) for pop, count in Counter(reich.Group_Label).items() if count>10]
#[pop for pop, cont in ancient10 if cont=='AS']


# PCA with Kernel density maps

In addition to normal PCA scatter plots, we draw Kernel density estimation maps, using seaborn.

## KDE plots for four Yemen populations 
Hadramaut, Ibb, Taizz and Rsa clearly have distinctive PCs. The PCA was performed on samples within 1000 km. Each of these four regions has at least 12 samples. The plot also clearly shows different levels of diversity: while Taizz is very compact, Hadramaut (known for active seafarers and the spice trade) shows stronger differences.

In [155]:
import matplotlib.colors as mcolors


In [122]:

import seaborn as sns
import matplotlib

# Style cycles for the matplotlib KDE figures. They are module-level
# iterators, so every later cell that calls next() on them continues where
# the previous cell stopped.
# NOTE(review): `mplcolors` is not used by any visible cell.
mplcolors = itertools.cycle(mcolors.CSS4_COLORS.values())
# NOTE: this rebinds `markers`, which was imported from myutils earlier in
# the notebook; from here on it is a cycle of matplotlib marker codes.
markers = itertools.cycle("ov^<>12348spP*hH+xXDd|")

# Reuse whichever PCA table was loaded last.
ddf = pcaDF
fig, ax = plt.subplots(figsize=(16,12))
# Colormap for the KDE contours and the scatter color drawn with it; both
# cycles are advanced together, once per plotted population.
gradients = itertools.cycle([ 'Reds', 'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'PuRd',    'RdPu',     'BuPu',       'GnBu', 'PuBu',    'YlGnBu', 'PuBuGn',  'BuGn', 'YlGn'])
gradcols  = itertools.cycle([ 'Red', 'Grey',   'Purple',  'Blue',  'Green',  'Orange',  'fuchsia', 'deeppink', 'royalblue',  'lime', 'orchid',  'olive',  'darkgreen','cyan', 'darkseagreen'])

for pop in yregions:
    Y = ddf[ddf.Region==pop][['PC1', 'PC2']]
    # Only regions with more than 8 samples get a density map.
    if Y.shape[0] > 8:
        cmap = next(gradients)        
        sns.kdeplot(Y, cmap = cmap, ax=ax, levels=7)
        color=next(gradcols)
        marker=next(markers)
        # Legend label is "<region>/<number of samples>".
        ax.scatter(Y.PC1, Y.PC2, c=color, label=f'{pop}/{Y.shape[0]}', marker=marker)

ax.legend()
plt.show()

<IPython.core.display.Javascript object>

The above figure shows strong stratification between four selected Yemeni regions: Hadramaut, Ibb, Taizz (Tiz) and Zabid+Al-Hadida (RSA). Although overlaps exist, a clear clustering is apparent.

## Meta analysis with Yemen data from Vyas et al.

In [115]:
# Saves pyplot's current figure, i.e. whichever figure was created last.
plt.savefig("yemen4majorPops.png")

In [123]:
# KDE + scatter for every population of the 1000 popfile with more than 8
# samples, using the regional 1000 PCA table.
dataset='HO'
# Context manager so the popfile handle is closed after reading.
with open(f"../Data/popfile_{dataset}_{1000}.txt") as popfile:
    regions = [line.strip() for line in popfile]
ddf = getPCAdata(dataset = "HO", geoPCA = 1000)

fig2, ax2 = plt.subplots(figsize=(24,12))

for pop in regions:
    #if pop=='Hdr':continue
    Y = ddf[ddf.Region==pop][['PC1', 'PC2']]
    if Y.shape[0] > 8:
        cmap = next(gradients)
        sns.kdeplot(Y, cmap = cmap, ax=ax2, levels=3)
        color=next(gradcols)
        marker=next(markers)
        ax2.scatter(Y.PC1, Y.PC2, c=color, label=f'{pop}/{Y.shape[0]}', marker=marker)

ax2.legend()
plt.show()
# Save through the figure object: plt.savefig() writes pyplot's *current*
# figure, which after show() is not guaranteed to still be this one.
fig2.savefig("yemen4majorPops_Vyas1.svg")
fig2.savefig("yemen4majorPops_Vyas1.png")

<IPython.core.display.Javascript object>

In [124]:
# Saves pyplot's current figure a second time under different file names.
plt.savefig("yemen4majorPops_Vyas.svg")
plt.savefig("yemen4majorPops_Vyas.png")

## Increasing the context - Shaigi
PCA plotting for regions within 2000 km of Yemen. Remarkably, Sudanese Shaigi appear close to Yemen Desert samples, much more so than more expected populations like Somali.
Wikipedia: They trace their origin to a Hejazi Arab named Shaig who came from the Arabian Peninsula in the 7th century following the Arabian conquest of Egypt.[12] Shaig was a descendant of Abbas (an uncle of prophet Muhammad).

In [132]:
# KDE + scatter (PC1 vs PC2) for the populations of the 2000 popfile.
dataset='HO'
# Context manager so the popfile handle is closed after reading.
with open(f"../Data/popfile_{dataset}_{2000}.txt") as popfile:
    regions = [line.strip() for line in popfile]
ddf = getPCAdata(dataset = "HO", geoPCA = 2000)

fig2, ax3 = plt.subplots(figsize=(24,12))

for pop in regions:
    if pop=='Hdr':continue ## makes it messy
    Y = ddf[ddf.Region==pop][['PC1', 'PC2']]
    print(pop, Y.shape[0])
    # At least 7 samples, except for the two populations forced in by name.
    if Y.shape[0] >= 7 or pop in ['Shaigi.WGA', 'Eritrea']:

        cmap = next(gradients)
        sns.kdeplot(Y, cmap = cmap, ax=ax3, levels=5)
        color=next(gradcols)
        marker=next(markers)
        ax3.scatter(Y.PC1, Y.PC2, c=color, label=f'{pop}/{Y.shape[0]}', marker=marker)

ax3.legend()
plt.show()

<IPython.core.display.Javascript object>

Somali 7
Afar.WGA 5
Shaigi.WGA 3
Saudi 8
Yemeni 6
Luhya 0
Jew_Ethiopian 7
Jew_Yemenite 6
Jew_Iraqi 4
Iranian_Bandari 8
Eritrea 3
Yemeni_Highlands 29
Yemeni_Highlands_Raymah 1
Yemeni_Northwest 28
Yemeni_Desert 16
Yemeni_Desert2 13
Jew_Iraqi.DG 0
Luhya.DG 0
Jew_Yemenite.DG 0
Somali.DG 0
Dal 8
Byd 3
Jwf 6
Abyn 6
Amr 6
Tiz 14
Sad 5
San 8
Haj 8
Dhm 6
Mhw 3
Shb 8
Mrb 4
Lahj 5
Ibb 15
Rsa 24


In [163]:
# KDE + scatter (PC1 vs PC2) for the populations of the 1240K 2000 popfile.
# NOTE(review): the population list comes from the 1240K popfile while the
# PCA table is loaded with dataset="HO" -- confirm this mix is intended.
dataset='1240K'
# Context manager so the popfile handle is closed after reading.
with open(f"../Data/popfile_{dataset}_{2000}.txt") as popfile:
    regions = [line.strip() for line in popfile]
ddf = getPCAdata(dataset = "HO", geoPCA = 2000)

fig2, ax3 = plt.subplots(figsize=(24,12))

for pop in regions:
    if pop=='Hdr':continue ## makes it messy
    Y = ddf[ddf.Region==pop][['PC1', 'PC2']]
    print(pop, Y.shape[0])
    # At least 7 samples; LWK.SG is excluded by name.
    if Y.shape[0] >= 7 and pop!="LWK.SG":

        cmap = next(gradients)
        sns.kdeplot(Y, cmap = cmap, ax=ax3, levels=5)
        color=next(gradcols)
        marker=next(markers)
        ax3.scatter(Y.PC1, Y.PC2, c=color, label=f'{pop}/{Y.shape[0]}', marker=marker)

ax3.legend()
plt.show()

<IPython.core.display.Javascript object>

Dinka.DG 1
BedouinB.DG 2
Jew_Iraqi.DG 0
Luhya.DG 0
Jew_Yemenite.DG 0
Somali.DG 0
LWK.SG 101
BedouinA.SDG 19
BedouinB.SDG 25
Dal 8
Byd 3
Jwf 6
Abyn 6
Amr 6
Tiz 14
Sad 5
San 8
Haj 8
Dhm 6
Mhw 3
Shb 8
Mrb 4
Lahj 5
Ibb 15
Rsa 24


## Extended Population definitions
Defining populations into larger groups.


In [183]:
# KDE + scatter for populations merged into larger groups.
# NOTE(review): `dataset` is whatever an earlier cell left behind, and
# `regions` is not used again in this cell -- confirm both are intended.
# Context manager so the popfile handle is closed after reading.
with open(f"../Data/popfile_{dataset}_{3000}.txt") as popfile:
    regions = [line.strip() for line in popfile]
# group label -> region labels that are pooled into that group
regionDict = {'Levant': ['Syrian', 'Jordanian', 'Lebanese', 'Druze_oAfrican.SDG'],
              'Palestinian': ['Palestinian_o2.SDG', 'Druze.SDG', 'Palestinian_o1.SDG'],
              'Yemen_Zalloua_NoHdr': ['Dal', 'Byd', 'Jwf', 'Abyn', 'Amr', 'Tiz', 'Sad', 'San', 'Haj', 'Dhm', 'Mhw', 'Shb', 'Mrb', 'Lahj', 'Ibb', 'Rsa'],
              'Bedouin': ['BedouinA.SDG', 'BedouinB.SDG'],
              'Yemen_Vyas': ['Yemeni_Highlands', 'Yemeni_Highlands_Raymah', 'Yemeni_Northwest', 'Yemeni_Desert', 'Yemeni_Desert2'],
              'Gujarati': ['GujaratiD', 'GujaratiB', 'GujaratiA', 'GujaratiC'],
              'Iranian': ['Iranian', 'Jew_Iranian'],
              #'Iranian2': ['Iranian_Bandari', 'Iranian.DG'],
              #'Somali': ['Somali', 'Eritrea'],
              'Saudi': ['Saudi'],
              # the [:1] slice keeps only 'Shaigi.WGA'
              'Shaigi': ['Shaigi.WGA', 'Dinka.DG'][:1]}

ddf = getPCAdata(dataset = "HO", geoPCA = 2000)

fig2, ax3 = plt.subplots(figsize=(24,12))

for pop, poplist in regionDict.items():
    Y = ddf[ddf.Region.isin(poplist)][['PC1', 'PC2']]
    print(pop, Y.shape[0])

    cmap = next(gradients)
    sns.kdeplot(Y, cmap = cmap, ax=ax3, levels=5)
    color=next(gradcols)
    marker=next(markers)
    ax3.scatter(Y.PC1, Y.PC2, c=color, label=f'{pop}/{Y.shape[0]}', marker=marker)
ax3.legend()
           


<IPython.core.display.Javascript object>

Levant 27
Palestinian 39
Yemen_Zalloua_NoHdr 129
Bedouin 44
Yemen_Vyas 87
Gujarati 19
Iranian 47
Saudi 8
Shaigi 3


<matplotlib.legend.Legend object at 0x7f49b45a0eb8>

In [173]:
# KDE + scatter for a hand-picked subset of the merged groups.
ddf = getPCAdata(dataset = "HO", geoPCA = 2000)
# group label -> region labels that are pooled into that group
regionDict = {'Levant': ['Syrian', 'Jordanian', 'Lebanese', 'Druze_oAfrican.SDG', 'Palestinian_o2.SDG', 'Druze.SDG', 'Palestinian_o1.SDG', 'Lebanese_Christian', 'Lebanese_Muslim', 'Assyrian'], 
              'Yemen_Zalloua': ['Dal', 'Byd', 'Jwf', 'Abyn', 'Hdr', 'Amr', 'Tiz', 'Sad', 'San', 'Haj', 'Dhm', 'Mhw', 'Shb', 'Mrb', 'Lahj', 'Ibb', 'Rsa'],
               'Bedouin': ['BedouinA.SDG', 'BedouinB.SDG'],
              'Saudi': ['Saudi'],
               'Yemen_Vyas': ['Yemeni_Highlands', 'Yemeni_Highlands_Raymah', 'Yemeni_Northwest', 'Yemeni_Desert', 'Yemeni_Desert2'],
               'Iranian': ['Iranian', 'Jew_Iranian', 'Iranian_Bandari', 'Iranian.DG'],
               'Sudan': ['Shaigi.WGA']}
fig2, ax3 = plt.subplots(figsize=(24,12))
             
# Only these groups are drawn, in this order; the other dictionary entries
# are not plotted by this cell.
for pop in 'Bedouin Yemen_Zalloua Levant Sudan Saudi'.split():
    poplist = regionDict[pop]
    Y = ddf[ddf.Region.isin(poplist)][['PC1', 'PC2']]
    print(pop, Y.shape[0])

    cmap = next(gradients)        
    sns.kdeplot(Y, cmap = cmap, ax=ax3, levels=5)
    color=next(gradcols)
    marker=next(markers)
    ax3.scatter(Y.PC1, Y.PC2, c=color, label=f'{pop}/{Y.shape[0]}', marker=marker)
ax3.legend()

<IPython.core.display.Javascript object>

Bedouin 44
Yemen_Zalloua 144
Levant 97
Sudan 3
Saudi 8


<matplotlib.legend.Legend object at 0x7f49b7f9e908>

In [180]:
# Sample count per population of the 3000 popfile in the regional 2000 table.
dataset='HO'
# Context manager so the popfile handle is closed after reading.
with open(f"../Data/popfile_{dataset}_{3000}.txt") as popfile:
    regions = [line.strip() for line in popfile]
ddf = getPCAdata(dataset = "HO", geoPCA = 2000)
# Count once instead of rebuilding the Counter for every region; a region
# absent from the table still yields 0.
regionCounts = Counter(ddf.Region)
cs = [(region, regionCounts[region]) for region in regions]

    

## PC1 vs PC3
Shaigi still very close to Yemeni Desert2 and Rsa!

In [134]:
# KDE + scatter (PC1 vs PC3) for the populations of the 2000 popfile.
dataset='HO'
# Context manager so the popfile handle is closed after reading.
with open(f"../Data/popfile_{dataset}_{2000}.txt") as popfile:
    regions = [line.strip() for line in popfile]
ddf = getPCAdata(dataset = "HO", geoPCA = 2000)

fig2, ax3 = plt.subplots(figsize=(24,12))

for pop in regions:
    if pop=='Hdr':continue ## makes it messy
    Y = ddf[ddf.Region==pop][['PC1', 'PC3']]
    print(pop, Y.shape[0])
    # At least 7 samples, except for the two populations forced in by name.
    if Y.shape[0] >= 7 or pop in ['Shaigi.WGA', 'Eritrea']:

        cmap = next(gradients)
        sns.kdeplot(Y, cmap = cmap, ax=ax3, levels=5)
        color=next(gradcols)
        marker=next(markers)
        ax3.scatter(Y.PC1, Y.PC3, c=color, label=f'{pop}/{Y.shape[0]}', marker=marker)

ax3.legend()
plt.show()

<IPython.core.display.Javascript object>

Somali 7
Afar.WGA 5
Shaigi.WGA 3
Saudi 8
Yemeni 6
Luhya 0
Jew_Ethiopian 7
Jew_Yemenite 6
Jew_Iraqi 4
Iranian_Bandari 8
Eritrea 3
Yemeni_Highlands 29
Yemeni_Highlands_Raymah 1
Yemeni_Northwest 28
Yemeni_Desert 16
Yemeni_Desert2 13
Jew_Iraqi.DG 0
Luhya.DG 0
Jew_Yemenite.DG 0
Somali.DG 0
Dal 8
Byd 3
Jwf 6
Abyn 6
Amr 6
Tiz 14
Sad 5
San 8
Haj 8
Dhm 6
Mhw 3
Shb 8
Mrb 4
Lahj 5
Ibb 15
Rsa 24


In [182]:
# KDE + scatter (PC1 vs PC2) using the `regions` and `ddf` left behind by an
# earlier cell.
# NOTE(review): the recorded output below lists per-population counts, but
# this cell contains no print() -- the output looks stale.
fig2, ax3 = plt.subplots(figsize=(24,12))
             
for pop in regions:
    if pop=='Hdr':continue ## makes it messy
    Y = ddf[ddf.Region==pop][['PC1', 'PC2']]
    # At least 7 samples, except for the two populations forced in by name.
    if Y.shape[0] >= 7 or pop in ['Shaigi.WGA', 'Eritrea']:

        cmap = next(gradients)        
        sns.kdeplot(Y, cmap = cmap, ax=ax3, levels=5)
        color=next(gradcols)
        marker=next(markers)
        ax3.scatter(Y.PC1, Y.PC2, c=color, label=f'{pop}/{Y.shape[0]}', marker=marker)

ax3.legend()
plt.show()

<IPython.core.display.Javascript object>

Armenian_Hemsheni 8
Azeri 17
Tabasaran 10
Darginian 8
Kubachinian 6
Lak 10
Avar_outlier1 0
Avar 8
Avar_outlier2 0
Kaitag 8
Ezid 8
Georgian 23
Brahui 21
Balochi 20
Makrani 20
Sindhi_Pakistan 14
Mbuti 10
Druze 39
BedouinB 19
BedouinA 25
Palestinian 38
BantuKenya 6
Somali 7
Kikuyu 4
Masai 12
Hadza1 4
Afar.WGA 5
Armenian.WGA 3
Azeri.WGA 3
Shaigi.WGA 3
Kurd.WGA 2
Georgian.WGA 2
Assyrian.WGA 4
Iranian 38
Syrian 8
Lebanese 8
Jordanian 9
Saudi 8
Yemeni 6
Egyptian 18
Armenian 10
Lezgin 9
Luhya 0
GujaratiD 5
GujaratiB 5
GujaratiA 4
GujaratiC 5
Luo 8
Datog 3
Jew_Ethiopian 7
Jew_Yemenite 6
Turkish 50
Jew_Iranian 9
Jew_Iraqi 4
Cypriot 8
Jew_Georgian 7
Lebanese_Christian 9
Lebanese_Muslim 11
Iranian_Bandari 8
Assyrian 11
Eritrea 3
Yemeni_Highlands 29
Yemeni_Highlands_Raymah 1
Yemeni_Northwest 28
Yemeni_Desert 16
Yemeni_Desert2 13
Dinka.DG 1
Jordanian.DG 3
Luo.DG 2
Mbuti.DG 4
BedouinB.DG 2
Jew_Iraqi.DG 0
Druze.DG 2
Sindhi_Pakistan.DG 2
Iranian.DG 2
Lezgin.DG 2
Armenian.DG 2
Luhya.DG 0
Brahui.DG 2
Mak

In [80]:
# Inspection only: shows the bound method, it does not call it.
ax.scatter

<bound method Axes.scatter of <matplotlib.axes._subplots.AxesSubplot object at 0x7f49b7e38908>>

In [51]:
# Display any pending matplotlib figures.
plt.show()

In [None]:
# Scatter grid of every PC pair among PC1..PC9 (36 panels), Yemen regions only.
regions = ['Sad', 'Amr', 'San', 'Dhm', 'Mhw', 'Haj', 'Hdr', 'Shb', 'Ibb', 'Tiz', 'Jwf', 'Mrb', 'Byd', 'Dal', 'Lahj', 'Abyn', 'Rsa']
fig, ax = plt.subplots(nrows=12, ncols=3, figsize=(16,18))
# Region labels taken from the same table as the coordinates, so the boolean
# masks below always line up with `x`. The cell used to rely on a global `y`
# that no visible cell defines.
y = ddf['Region'].values
# combinations() yields (1,2), (1,3), ..., (8,9) -- the order of the former
# nested loops -- and ax.flat walks the 12x3 grid row by row.
for panel, (pc1, pc2) in zip(ax.flat, itertools.combinations(range(1, 10), 2)):
    x = ddf[['PC%s'%pc1, 'PC%s'%pc2]].values
    #plt.figure(figsize=(9.5, 8))
    # Title set once per panel rather than once per region.
    panel.set_title('PC %s vs PC %s' %(pc1, pc2))
    for e in regions:
        panel.scatter(
            x[y==e].T[0], x[y==e].T[1], label=e,
            c=colors[e],
            #marker=markers[e]
        )
#plt.legend(bbox_to_anchor=(0.9667, -.1), ncol=3)

#ax.tight_layout()

In [None]:
# Inspection only: distinct values of `y`.
# NOTE(review): no visible cell assigns `y` at top level (plotPCA only has a
# local of that name) -- presumably left over from a removed cell.
set(y)

In [None]:
# Inspection only. NOTE: once the KDE cell has run, `markers` is the
# itertools.cycle created there, not the object imported from myutils.
markers

In [None]:
# Inspection only: first rows of the PCA table loaded last.
ddf.head()

In [None]:

# 3D scatter of PC1, PC2 and PC3, one scatter call per entry of order[f].
# NOTE(review): `f` and `order` are not defined in any visible cell, and
# `colors` / `markers` are indexed here as nested mappings (colors[f][e]),
# whereas other cells use colors[e] and rebind `markers` to an
# itertools.cycle. This cell looks copied from another notebook -- confirm
# the missing definitions before running it.
x = ddf[['PC1', 'PC2', 'PC3']].values
y = ddf[f].values

fig = plt.figure(figsize=(9.5, 8))
ax = fig.add_subplot(projection='3d')

for e in order[f]:
    # number of rows labelled `e`; used to repeat the color once per point
    n = sum(y == e)
    ax.scatter(
        x[y==e].T[0], x[y==e].T[1], x[y==e].T[2], label=e,
        color=[colors[f][e]] * n,
        marker=markers[f][e]
    )

ax.view_init(azim=46, elev=-43)
plt.legend(bbox_to_anchor=(0.9667, 0.1667), ncol=3)