In [1]:
import pandas as pd

# Merge SRA info with sample info from Kraemer et al.

In [2]:
# Load the SRA info table (tab-separated, first row is the header,
# "#" marks a comment) and preview the first rows.
sra_info = pd.read_table(
    "../info/example_SRA_info.tsv",
    comment="#",
    header=0,
)
sra_info.head()

Unnamed: 0,date,Experiment,MBases,MBytes,organism,Run,SRA,Accession
0,2017-06-30,ERX2079340,11,5,human metagenome,ERR2020093,ERS1803201,SAMEA104144183
1,2017-07-09,ERX2079341,29,13,human metagenome,ERR2020094,ERS1803202,SAMEA104144184
2,2017-06-30,ERX2079342,10,5,human metagenome,ERR2020095,ERS1803203,SAMEA104144185
3,2017-07-09,ERX2079343,14,6,human metagenome,ERR2020096,ERS1803204,SAMEA104144186
4,2017-07-03,ERX2079344,16,7,pig metagenome,ERR2020097,ERS1803205,SAMEA104144187


In [3]:
# Load the sample info table (tab-separated, first row is the header,
# "#" marks a comment) and preview the first rows.
sample_info = pd.read_table(
    "../info/example_sample_info.tsv",
    comment="#",
    header=0,
)
sample_info.head()

Unnamed: 0,sample_alias,tax_id,scientific_name,sample_title,sample_description,location number
0,10,1176744,pig metagenome,pig_1_1__,nasal swab from pig,1
1,103,646099,human metagenome,ne_40_0_left_ant,nasal swab from individual without contact to ...,40
2,105,646099,human metagenome,ne_40_0_left_post,nasal swab from individual without contact to ...,40
3,107,646099,human metagenome,ne_41_0_left_ant,nasal swab from individual without contact to ...,41
4,109,646099,human metagenome,ne_41_0_left_post,nasal swab from individual without contact to ...,41


In [4]:
# Parse the biosample text file into a {sample_alias: SRA} mapping.
# A line containing "." supplies the current sample alias (the first token
# after the first ".", with any trailing ";" removed, converted to int).
# A line containing "Identifiers" supplies the SRA id (the text after the
# last ":"), which is stored under the most recently seen alias.
alias_to_sra = {}
with open("../info/example_biosample_info.txt", 'r') as handle:
    for raw_line in handle:
        record = raw_line.rstrip()
        if record:
            if "." in record:
                after_dot = record.split(".")[1].lstrip()
                current_alias = int(after_dot.split(" ")[0].rstrip(";"))
            if "Identifiers" in record:
                alias_to_sra[current_alias] = record.split(":")[-1].lstrip()

# One row per sample alias with two columns: "sample_alias" and "SRA".
biosamples = (
    pd.DataFrame(alias_to_sra, index=[0])
    .T
    .rename(columns={0: "SRA"})
    .rename_axis("sample_alias")
    .reset_index()
)
biosamples.head()

Unnamed: 0,sample_alias,SRA
0,2,ERS1803236
1,4,ERS1803290
2,6,ERS1803355
3,8,ERS1803369
4,10,ERS1803200


In [5]:
# Attach the sample alias to every SRA record via the shared "SRA" column,
# then pull in the sample metadata via "sample_alias" (default inner joins).
df = (
    sra_info
    .merge(biosamples, on="SRA")
    .merge(sample_info, on="sample_alias")
)

In [6]:
def parse_sample_title(title):
    """Split a sample title into (sample type, sample id, sample position).

    Titles are "_"-separated. For titles containing "air sample" the type is
    "air", the id is "<second field>_air" and the position is "NA". For every
    other title, all fields except the last two must be exactly three values
    (type, location, individual); the id is "<location>_<individual>". The
    last two fields joined by "_" form the position when the title contains
    "pigf", "cowf" or "ne"; otherwise the position is "NA".
    """
    fields = title.split("_")
    if "air sample" in title:
        return "air", "{}_{}".format(fields[1], "air"), "NA"
    has_position = "pigf" in title or "cowf" in title or "ne" in title
    kind, location, individual_no = fields[0:-2]
    position = "_".join(fields[-2:]) if has_position else "NA"
    return kind, "{}_{}".format(location, individual_no), position


# Parse every title once, then fan the triples out into three parallel lists.
parsed_titles = [parse_sample_title(title) for title in df.sample_title]
sample_type = [parsed[0] for parsed in parsed_titles]
sample_id = [parsed[1] for parsed in parsed_titles]
sample_position = [parsed[2] for parsed in parsed_titles]

In [7]:
# Add the three parsed columns in one step; the lists are in row order of df.
df = df.assign(
    sample_type=sample_type,
    sample_id=sample_id,
    sample_position=sample_position,
)

In [8]:
# Write the merged table as TSV; the row index is written as the first column.
df.to_csv(
    "../info/example_info.tsv",
    sep="\t",
    index=True,
)

Select a sample subset from pig farms, cow farms and negative controls

In [9]:
# Negative controls: sample type is exactly "ne".
ne = df[df["sample_type"] == "ne"]
# Pig-farm set: rows whose sample type contains "pig", followed by rows whose
# sample type contains "air".
pigf = pd.concat(
    [df[df["sample_type"].str.contains(keyword)] for keyword in ("pig", "air")]
)
# Cow-farm set: rows whose sample type contains "cow".
cowf = df[df["sample_type"].str.contains("cow")]

For the negative controls there is only one person sampled per location, so choose the three individuals with the largest mean library size.

In [10]:
# Mean library size (MBases) per negative-control individual; keep the three
# individuals with the largest mean.
# "MBases" is selected before aggregating because averaging the whole frame
# also tries to average its text columns, which raises TypeError on
# pandas >= 2.0.
ne_ids = list(
    ne.groupby("sample_id")["MBases"]
    .mean()
    .sort_values(ascending=False)
    .head(3)
    .index
)

For the cow farms, compute the mean library size per individual and location, and take the three locations with the highest per-individual mean library size. Then select the individual with the largest mean from each of those locations.

In [11]:
# Mean library size per (individual, location), largest first.
# "MBases" is selected before aggregating because averaging the whole frame
# also tries to average its text columns, which raises TypeError on
# pandas >= 2.0.
cowf_mean = (
    cowf.groupby(["sample_id", "location number"])["MBases"]
    .mean()
    .reset_index()
    .sort_values("MBases", ascending=False)
)

# The first three distinct locations in that ranking, i.e. the locations
# holding the highest per-individual means.
locations = cowf_mean["location number"].drop_duplicates().head(3).tolist()

# For each chosen location keep the individual with the largest mean library size.
cow_ids = []
for loc in locations:
    r = cowf.loc[cowf["location number"] == loc]
    per_individual = r.groupby("sample_id")["MBases"].mean().sort_values(ascending=False)
    cow_ids.append(per_individual.index[0])

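From the pig farms, consider only locations that have an air metagenome and select the three with the largest mean library size. For each of these locations keep the air metagenome, the pig metagenome with the largest library, and all samples from the human individual with the largest mean library size.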

In [12]:
# First select locations that have air metagenomes
pigf_locs = pigf.loc[pigf.sample_type == "air", "location number"].values
# Among those, get the three locations with the highest mean library size.
# "MBases" is selected before aggregating because averaging the whole frame
# also tries to average its text columns, which raises TypeError on
# pandas >= 2.0.
pigf_locs_top = (
    pigf.loc[pigf["location number"].isin(pigf_locs)]
    .groupby("location number")["MBases"]
    .mean()
    .sort_values(ascending=False)
    .head(3)
    .index
)
# For each location collect the row labels of: the air metagenome, the pig
# metagenome with the largest library, and every sample of the human
# individual with the largest mean library size.
pigf_index = []
for loc in pigf_locs_top:
    r = pigf.loc[pigf["location number"] == loc]
    # Select air metagenome (first one listed for this location)
    pigf_index.append(r.loc[r.sample_type == "air"].index[0])
    # Select pig metagenome with the largest library
    pigf_index.append(
        r.loc[r.organism == "pig metagenome"]
        .sort_values("MBases", ascending=False)
        .head(1)
        .index[0]
    )
    # Select human individual with highest mean, then keep all of its samples
    human_means = (
        r.loc[r.organism == "human metagenome"]
        .groupby("sample_id")["MBases"]
        .mean()
        .sort_values(ascending=False)
    )
    human_sample_id = human_means.index[0]
    pigf_index += list(r.loc[r.sample_id == human_sample_id].index)

In [13]:
# Stack the selected pig-farm rows, cow-farm rows and negative-control rows
# (in that order) and write the result as TSV without the row index.
selected_frames = [
    pigf.loc[pigf_index],
    cowf.loc[cowf.sample_id.isin(cow_ids)],
    ne.loc[ne.sample_id.isin(ne_ids)],
]
subset = pd.concat(selected_frames)
subset.to_csv("../info/example_subset_info.tsv", sep="\t", index=False)