# Population Info for Plink's fam files

* filling the first columns (family id) with population ids.
* helps to run Admixtools2 (in R) conveniently (without the need for EIGENSTRAT files, thus no need fixing pop-info into the ind files, fixIndiv)

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Metadata Tables for NGS data
Read and prepare NGS metadata tables

* Arabian Peninsula (APPG, Al Marri et al)
* HGDP NGS (hgdp, bergstrom, Science 2020)
* Yemen 47 set


In [2]:
def popfix(pop):
    if pop.startswith('Bantu'): return 'Bantu'
    return pop

yemen = pd.read_excel("../Metadata/Yemen Codes.xlsx", sheet_name="WGS samples").set_index('sid')
hgdpMeta = "../Data/HGDP/HGDPid_populations.csv.gz"
hgdp = pd.read_csv(hgdpMeta).set_index('Id')
hgdp.population = [popfix(pop) for pop in hgdp.population]
hgdp.population.value_counts()
appg = pd.read_csv("../Metadata/appg.meta.csv", sep=' ').set_index('long_ID')

a1 = appg[['population']]
a1.columns = ['Population']
a1.index.name = 'SampleID'

h1 = hgdp[['population']]
h1.columns = ['Population']
h1.index.name = 'SampleID'

y1 = yemen[['region']]
y1.columns = ['Population']
y1.index.name = 'SampleID'

In [13]:
districts = {'#a0ff00': ["Saada", 'Amran', "Sanaa", 'Dhamar'], 
             '#F39C12': ['Mahwit', 'Hajjah', 'Raymah'], 
             '#229954':['Hadramout', 'Shabwah', 'Al Maharah'],  ## East
             '#A4817A': ['Ibb', 'Taizz'], 
             '#2222FF': ['Jawf', 'Maarib', 'Bayda'], ## North/Desert
             '#E74C3C': ["Al Dhale'e", 'Lahij', 'Abyan', 'Aden'],  ## South/Coast
             '#A0A0A0': ['Hudayda']}  ## West/Coast
districtsR = {}
wgsSamples = {'Lahij': 2, 'Bayda': 2, 'Hudayda': 5}
for k,v in districts.items():
    for gov in v:
        districtsR[gov] = k.lower()
for pop in y1.Population.unique():
    for count in range(wgsSamples.get(pop, 4)):
        print(f'{pop}_{count+1},label,node,#000000,1,bold-italic,{districtsR[pop]}')

Taizz_1,label,node,#000000,1,bold-italic,#a4817a
Taizz_2,label,node,#000000,1,bold-italic,#a4817a
Taizz_3,label,node,#000000,1,bold-italic,#a4817a
Taizz_4,label,node,#000000,1,bold-italic,#a4817a
Bayda_1,label,node,#000000,1,bold-italic,#2222ff
Bayda_2,label,node,#000000,1,bold-italic,#2222ff
Saada_1,label,node,#000000,1,bold-italic,#a0ff00
Saada_2,label,node,#000000,1,bold-italic,#a0ff00
Saada_3,label,node,#000000,1,bold-italic,#a0ff00
Saada_4,label,node,#000000,1,bold-italic,#a0ff00
Mahwit_1,label,node,#000000,1,bold-italic,#f39c12
Mahwit_2,label,node,#000000,1,bold-italic,#f39c12
Mahwit_3,label,node,#000000,1,bold-italic,#f39c12
Mahwit_4,label,node,#000000,1,bold-italic,#f39c12
Amran_1,label,node,#000000,1,bold-italic,#a0ff00
Amran_2,label,node,#000000,1,bold-italic,#a0ff00
Amran_3,label,node,#000000,1,bold-italic,#a0ff00
Amran_4,label,node,#000000,1,bold-italic,#a0ff00
Dhamar_1,label,node,#000000,1,bold-italic,#a0ff00
Dhamar_2,label,node,#000000,1,bold-italic,#a0ff00
Dhamar_3,label

In [9]:
y1.Population.value_counts()

Hudayda      5
Taizz        4
Saada        4
Mahwit       4
Amran        4
Dhamar       4
Maarib       4
Hadramout    4
Abyan        4
Ibb          4
Bayda        3
Lahij        2
Name: Population, dtype: int64

## Metadata Tables for Genotype data
Read and prepare genotype metadata tables

* Reich AARD (1240K/HO)
* Our Yemen


In [28]:
reichset = ['1240K', 'HO'][1]
version=['44.3','50.0'][0]
reich=pd.read_csv(f"../Reich/v{version}_{reichset}_public.anno", sep='\t', encoding='unicode_escape')
reich

Unnamed: 0,Index,Version ID,Master ID,Publication (or OK to use in a paper),Representative contact,"Date mean in BP [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]","Full Date: One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990Â±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE",Group Label,Locality,Country,Lat.,Long.,Data source,Coverage on autosomal targets,SNPs hit on autosomal targets,Sex,"Library type (minus=no.damage.correction, half=damage.retained.at.last.position, plus=damage.fully.corrected, ds=double.stranded.library.preparation, ss=single.stranded.library.preparation)","ASSESSMENT (Xcontam interval is listed if lower bound is >0.005, ""QUESTIONABLE"" if lower bound is 0.01-0.02, ""QUESTIONABLE_CRITICAL"" or ""FAIL"" if lower bound is >0.02) (mtcontam confidence interval is listed if coverage >2 and upper bound is <0.98: 0.9-0.95 is ""QUESTIONABLE""; <0.9 is ""QUESTIONABLE_CRITICAL"", questionable status gets overriden by ANGSD with PASS if upper bound of contamination is <0.01 and QUESTIONABLE if upper bound is 0.01-0.05) (damage for ds.half is ""QUESTIONABLE_CRITICAL/FAIL"" if <0.01, ""QUESTIONABLE"" for 0.01-0.03, and recorded but passed if 0.03-0.05; libraries with fully-treated last base are ""QUESTIONABLE_CRITICAL"" or ""FAIL"" if <0.03, ""QUESTIONABLE"" if 0.03-0.06, and recorded but passed if 0.06-0.1) (sexratio is QUESTIONABLE if [0.03,0.10] or [0.30,0.35); QUESTIONABLE_CRITICAL/FAIL if (0.10,0.30))"
0,1798,MAL-005,MAL-005,SkoglundCell2017,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,0,..,Malawi_Yao,Dedza // Yao,Malawi,-14.166667,34.33333,Fall2015,..,585645,M,..,PASS (genotyping)
1,1799,MAL-009,MAL-009,SkoglundCell2017,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,0,..,Malawi_Yao,Machinga // Yao,Malawi,-14.862605,35.574122,Fall2015,..,582189,M,..,PASS (genotyping)
2,1800,MAL-011,MAL-011,SkoglundCell2017,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,0,..,Malawi_Chewa,Mchinga // Chichewa,Malawi,-14.862605,35.574122,Fall2015,..,579844,M,..,PASS (genotyping)


In [42]:
#print(dict(reich[reich.iloc[:,5]>0]["Group Label"].value_counts()))
ancient='Pakistan_Loebanr_IA','Pakistan_Loebanr_IA','Turkmenistan_Gonur_BA_1','Israel_C','Turkey_N','Turkey_Arslantepe_LateC', \
'Israel_MLBA','Jordan_LBA','Iran_C_TepeHissar','Turkey_CamlibelTarlasi_LateC','Israel_MLBA_lc','Syria_Ebla_EMBA','Iran_GanjDareh_N','Lebanon_IA3.SG','Morocco_Iberomaurusian','Kenya_PastoralN_published', 'Kazakhstan_Central_Saka.SG', 'Jordan_PPNB_published', 'Kenya_PastoralN', 'Iran_HajjiFiruz_C'
for pop in ancient:
    print(f'\\b{pop}\\b')


\bPakistan_Loebanr_IA\b
\bPakistan_Loebanr_IA\b
\bTurkmenistan_Gonur_BA_1\b
\bIsrael_C\b
\bTurkey_N\b
\bTurkey_Arslantepe_LateC\b
\bIsrael_MLBA\b
\bJordan_LBA\b
\bIran_C_TepeHissar\b
\bTurkey_CamlibelTarlasi_LateC\b
\bIsrael_MLBA_lc\b
\bSyria_Ebla_EMBA\b
\bIran_GanjDareh_N\b
\bLebanon_IA3.SG\b
\bMorocco_Iberomaurusian\b
\bKenya_PastoralN_published\b
\bKazakhstan_Central_Saka.SG\b
\bJordan_PPNB_published\b
\bKenya_PastoralN\b
\bIran_HajjiFiruz_C\b


In [43]:
reich.Country.value_counts()

Russia             1705
China              1083
Spain               680
Pakistan            633
Italy               538
                   ... 
North Macedonia       1
Crimea                1
Turkey                1
Isle of Man           1
Solomon Islands       1
Name: Country, Length: 144, dtype: int64

In [23]:
dict(reich.Country.value_counts())
#    print(kv)
    #print("%-40s: %s"%(k,v))

{'Russia': 642,
 'China': 567,
 'Spain': 489,
 'Italy': 472,
 'Pakistan': 438,
 'USA': 434,
 'Great Britain': 385,
 'Germany': 356,
 'Israel': 236,
 'Nigeria': 236,
 'India': 229,
 'France': 198,
 'Hungary': 195,
 'Sweden': 193,
 'Peru': 165,
 'Kenya': 162,
 'Turkey': 155,
 'Kazakhstan': 154,
 'Japan': 147,
 'Puerto Rico': 130,
 'Vietnam': 130,
 'Dominican Republic': 127,
 'Gambia': 115,
 'Finland': 113,
 'Mongolia': 105,
 'Sri Lanka': 103,
 'Colombia': 103,
 'Poland': 101,
 'Barbados': 96,
 'Denmark': 95,
 'Turkmenistan': 92,
 'Bangladesh': 88,
 'Sierra Leone': 87,
 'Czech Republic': 87,
 'Estonia': 86,
 'Ukraine': 83,
 'Switzerland': 80,
 'Iran': 68,
 'Ireland': 63,
 'Cuba': 58,
 'Mexico': 58,
 'China ': 55,
 'Brazil': 50,
 'Uzbekistan': 50,
 'Taiwan': 49,
 'Bulgaria': 48,
 'Kyrgyzstan': 47,
 'Iceland': 46,
 'Papua New Guinea': 46,
 'Norway': 45,
 'Serbia': 45,
 'Portugal': 40,
 'Jordan': 39,
 'Latvia': 38,
 'Lebanon': 37,
 'Vanuatu': 34,
 'Bahamas': 29,
 'Algeria': 29,
 'Greece': 29

In [25]:
reich[reich.Country=='Egypt'].iloc[:,10:20]

Unnamed: 0,"Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform disribution between the two bounds for a contextual date]","Full Date: One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990Â±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE",GroupID,Locality,Country,Lat.,Long.,Data source,No. Libraries,Coverage on autosomal targets
4568,72,"787-546 calBCE (2522Â±24 BP, MAMS-23047)",Egypt_ThirdIntermediatePeriod,Abusir-el Meleq,Egypt,29.9,31.2,1240K,..,0.117729
4569,43,"151 calBCE - 23 calCE (2050Â±24 BP, MAMS-23575)",Egypt_Ptolemaic_contam,Abusir-el Meleq,Egypt,29.9,31.2,1240K,..,0.201752
4570,70,"779-544 calBCE (2513Â±24 BP, MAMS-23617)",Egypt_ThirdIntermediatePeriod,Abusir-el Meleq,Egypt,29.9,31.2,1240K,..,0.880832


In [10]:
populationCol = 'GroupID' #'Group Label'
r1 = reich.set_index('Version ID')
r1 = r1[[populationCol]]
r1.columns = ['Population']
r1.index.name = "SampleID"

r1

Unnamed: 0_level_0,Population
SampleID,Unnamed: 1_level_1
I0626_all,Vietnam_N_all
I0627_all,Vietnam_N_all
I1137_all_published,Vietnam_N_all_published
I1859_all,Vietnam_N_all
I2497_all,Vietnam_BA_all
...,...
VK94.SG,Denmark_Viking.SG
VK95.SG,Iceland_Viking.SG
VK98.SG,Iceland_Viking.SG
VK99.SG,Iceland_Viking.SG


In [11]:
import re

coarse = False
df = pd.read_csv("../Metadata/3577stdy_manifest_3450_190315.csv.gz", skiprows=8, sep=',', skip_blank_lines=True)
df1 = df.dropna(subset=['SUPPLIER SAMPLE NAME'])
regions = [re.findall('\D+', sup)[0] for sup in df1['SUPPLIER SAMPLE NAME']]
df1.loc[:,"Population"] = regions
y2 = df1.set_index('SANGER SAMPLE ID')
y2 = y2[['Population']]
y2 = y2[y2.Population!="Empty"]
y2.index.name = 'SampleID'
if coarse: y2.Population = 'Yemen'
y2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.loc[:,"Population"] = regions


Unnamed: 0_level_0,Population
SampleID,Unnamed: 1_level_1
3577STDY6068360,Dal
3577STDY6068361,Ibb
3577STDY6068362,Rsa
3577STDY6068363,Rsa
3577STDY6068364,Rsa
...,...
3577STDY6068635,Tiz
3577STDY6068640,Rsa
3577STDY6068641,Ibb
3577STDY6068642,Tiz


In [13]:
#a1 = appg[['short_ID', 'population']]
#a1.columns = ['SampleID', 'Population']
#a1 = a1.set_index('SampleID')

#allmeta = pd.concat([y1,h1,a1], axis=0) ## for genotype
allmeta = pd.concat([y2,r1], axis=0)     ## for NGS
allmeta

Unnamed: 0_level_0,Population
SampleID,Unnamed: 1_level_1
3577STDY6068360,Dal
3577STDY6068361,Ibb
3577STDY6068362,Rsa
3577STDY6068363,Rsa
3577STDY6068364,Rsa
...,...
VK94.SG,Denmark_Viking.SG
VK95.SG,Iceland_Viking.SG
VK98.SG,Iceland_Viking.SG
VK99.SG,Iceland_Viking.SG


In [14]:
for k,v in allmeta.Population.value_counts().items():
    print ("%-50s: %s" % (k,v))

Sweden_Viking.SG                                  : 132
GWD.SG                                            : 112
YRI.SG                                            : 108
TSI.SG                                            : 108
CHS.SG                                            : 106
GIH.SG                                            : 106
JPT.SG                                            : 104
ITU.SG                                            : 103
IBS.SG                                            : 103
CHB.SG                                            : 103
LWK.SG                                            : 101
PUR.SG                                            : 100
ESN.SG                                            : 99
CEU.SG                                            : 99
CDX.SG                                            : 99
STU.SG                                            : 99
KHV.SG                                            : 97
FIN.SG                                            : 9

In [15]:
dallmeta = dict(allmeta.Population)

In [16]:
import os
#famfile = '~/YemenGenomeAnalysis_Unused/WGS/Yemeni.HGDP.APPG.mask.autos.AF_fix_QC.fam'
#famfile = '/home/ahenschel/Dropbox/Yemen/YemenGenomeAnalysis/AdmixTools_HO_mind0.7_geno_0.5_pop10/yemen_reichHO_mind0.7_geno_0.5.LD.QC.fam'

famfile = '/home/ahenschel/Dropbox/Yemen/YemenGenomeAnalysis/AdmixTools_1240K_mind0.5_geno_0.5/yemen_reich1240K_mind0.5_geno_0.5.LD.QC.fam'
fambak = famfile + '_bak'
os.system(f'mv {famfile} {fambak}')

0

In [17]:
fam = pd.read_csv(fambak, sep=' ',header=None)
popIds = []
for iid in fam[1]:
    if iid.startswith('urn:wtsi:'): iid = iid.split('_')[-1]

    if iid=='EGAN00001456590': popIds.append('YEMEN')
    elif iid=='longranger222_wgs_27426_APPG7555931_GRCh38_gatk': popIds.append('APPGMISC')
    else: popIds.append(dallmeta[iid])
#    if not iid in dallmeta:
#        print("Not found:", iid)
fam[0] = popIds
fam.to_csv(famfile, header=False, index=False, sep=' ')

In [46]:
l="""Afghanistan.PZ
Algeria.PZ
Armenia.PZ
Bahrein.PZ
Cyprus.PZ
Iran.PZ
Iraq.PZ
Jordan.PZ
KSA.PZ
Kuwait.PZ
Libya.PZ
Palestine.PZ
Turkey.PZ
Yemen.PZ""".split()
l

['Afghanistan.PZ', 'Algeria.PZ', 'Armenia.PZ', 'Bahrein.PZ', 'Cyprus.PZ', 'Iran.PZ', 'Iraq.PZ', 'Jordan.PZ', 'KSA.PZ', 'Kuwait.PZ', 'Libya.PZ', 'Palestine.PZ', 'Turkey.PZ', 'Yemen.PZ']

In [45]:
%pprint

Pretty printing has been turned OFF
