In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import janitor
import numpy as np
import os
import glob

In [2]:
# read in mgx data

path = '/Users/danielle/Documents/thesis/paper-abundance-tables/profiles/*'                     
all_files = [name for name in glob.glob(path)] 

In [3]:
# mgx family

df_from_each_file = []
unclassified_list = []

for f in all_files:
    df = pd.read_csv(f, sep = '\t', skiprows=[0,1,2], usecols = ['#clade_name','relative_abundance']) # read in dataframe 
    
    id_1 = f.split('profiles/')[1] # add sample id from filename 
    id_2 = id_1.split('_S')[0]
    df["sampleid"] = id_2
    
    df.rename(columns = {'#clade_name':'taxa', 'relative_abundance':'abundance'}, inplace = True) 
    
    df = df[df['taxa'].str.contains("\|f__")] # keep families genera
    df = df[~df['taxa'].str.contains("\|g__")] # remove genera

    df["taxa"] = df['taxa'].str.split("\|f__").str[-1]
#     df["taxa"] = df['taxa'].str.split("\|s__").str[0]
    
    # percent unclassified organisms profiled with mgx
    unclassified = sum(df.taxa.str.contains("_unclassified| noname | Candidatus"))/df.shape[0]
    unclassified_list.append(unclassified)
    
    # remove unclassified
    df = df[~df.taxa.str.contains("_unclassified")]
    df = df[~df.taxa.str.contains("_noname")]
    df = df[~df.taxa.str.contains("Candidatus")]
    
    # combine together taxa of the same genera
    df = df.groupby(['taxa', 'sampleid'])['abundance'].sum().reset_index()    
    
    # convert to relative abundance
    df["abundance"] = df["abundance"]/100.0
    
    df_from_each_file.append(df)

In [4]:
mgx_family = pd.concat(df_from_each_file, ignore_index=True) # concat all dataframes together

In [5]:
mgx_family.head()

Unnamed: 0,taxa,sampleid,abundance
0,Acidaminococcaceae,C0047-7E-1A,0.000368
1,Bacteroidaceae,C0047-7E-1A,0.65244
2,Barnesiellaceae,C0047-7E-1A,0.024996
3,Bifidobacteriaceae,C0047-7E-1A,0.004231
4,Eggerthellaceae,C0047-7E-1A,1.2e-05


In [6]:
# read in 16S data
amplicon = pd.read_csv("/Users/danielle/Documents/thesis/paper-abundance-tables/dada2_output_exported/feature-table.txt",
                      skiprows=[0], sep='\t')

In [7]:
amplicon.drop(['#OTU ID'], axis = 1, inplace = True)
amplicon.rename(columns = {"taxonomy": "taxa"}, inplace = True)

In [8]:
cols = list(amplicon.columns)
cols = [cols[-1]] + cols[:-1]
amplicon = amplicon[cols]

In [9]:
# clean taxonomic names
# keep only families
amplicon_family = amplicon.copy()


In [10]:
amplicon_family["taxa"] = amplicon_family['taxa'].str.split("D_4__", expand=True)[1].str.split(";", expand=True)[0]

In [11]:
amplicon_family = amplicon.dropna()

In [12]:
# remove if genus is less than 5 characters
remove = list(set([name for name in amplicon_family["taxa"] if len(name)<5]))

In [13]:
# remove if numbers in genus
remove2 = list(set([name for name in amplicon_family["taxa"] if not name.isalpha()]))
# remove if Candidatus in genus
remove3 = list(set([name for name in amplicon_family["taxa"] if "Candidatus" in name]))
remove_list = remove + remove2 + remove3 + ["uncultured", "Family", "None", "Clade", "Subgroup", "Ruminococcaceae"]
amplicon_family = amplicon_family[~amplicon_family.taxa.isin(remove_list)]

In [14]:
amplicon_family.head()

Unnamed: 0,taxa,C0047-7E-1A,C0052-7E-1A,C0053-8E-1A,C0058-4E-1A,C0059-4E-1A,C0106-6E-1A,C0107-4E-1A,C0123-3E-1A,C0126-9E-1A,...,C1102-1F-1A,C1109-4F-1A,C1115-4F-1A,C1135-4F-1A,C1160-3F-1A,C1177-4F-1A,C1219-3F-1A,C2002-3E-1A,C2014-2E-1A,C2018-3E-1A


In [15]:
exclude = ["sampleid", "dev_stage", "AgeMonths"]

In [16]:
family_amplicon_list = set(amplicon_family["taxa"])
family_mgx_list = set(mgx_family["taxa"])

In [17]:
family_intersection = [taxa for taxa in family_amplicon_list.intersection(family_mgx_list) if taxa not in exclude]
family_union = [taxa for taxa in family_amplicon_list.union(family_mgx_list) if taxa not in exclude]

In [18]:
family_amp_only = [taxa for taxa in family_amplicon_list.difference(family_intersection) if taxa not in exclude]
family_mgx_only = [taxa for taxa in family_mgx_list.difference(family_intersection) if taxa not in exclude]

In [19]:
len(family_intersection) # genera found in both
len(family_amp_only) # genera found in amplicon only
len(family_mgx_only) # genera found in mgx only

0

0

54

In [20]:
# mgx species

df_from_each_file = []
unclassified_list = []

for f in all_files:
    df = pd.read_csv(f, sep = '\t', skiprows=[0,1,2], usecols = ['#clade_name','relative_abundance']) # read in dataframe 
    
    id_1 = f.split('profiles/')[1] # add sample id from filename 
    id_2 = id_1.split('_S')[0]
    df["sampleid"] = id_2
    
    df.rename(columns = {'#clade_name':'taxa', 'relative_abundance':'abundance'}, inplace = True) 
    
    df = df[df['taxa'].str.contains("\|s__")] # keep families genera
    df = df[~df['taxa'].str.contains("\|t__")] # remove genera

    df["taxa"] = df['taxa'].str.split("\|s__").str[-1]
#     df["taxa"] = df['taxa'].str.split("\|s__").str[0]
    
    # percent unclassified organisms profiled with mgx
    unclassified = sum(df.taxa.str.contains("_unclassified| noname | Candidatus"))/df.shape[0]
    unclassified_list.append(unclassified)
    
    # remove unclassified
    df = df[~df.taxa.str.contains("_unclassified")]
    df = df[~df.taxa.str.contains("_noname")]
    df = df[~df.taxa.str.contains("Candidatus")]
    
    # combine together taxa of the same genera
    df = df.groupby(['taxa', 'sampleid'])['abundance'].sum().reset_index()    
    
    # convert to relative abundance
    df["abundance"] = df["abundance"]/100.0
    
    df_from_each_file.append(df)


In [21]:
mgx_species = pd.concat(df_from_each_file, ignore_index=True) # concat all dataframes together

In [22]:
mgx_species.head()

Unnamed: 0,taxa,sampleid,abundance
0,Agathobaculum_butyriciproducens,C0047-7E-1A,0.001349
1,Alistipes_finegoldii,C0047-7E-1A,0.000892
2,Alistipes_putredinis,C0047-7E-1A,0.033009
3,Alistipes_shahii,C0047-7E-1A,0.002187
4,Anaerostipes_hadrus,C0047-7E-1A,0.010047


In [23]:
amplicon.head()

Unnamed: 0,taxa,C0047-7E-1A,C0052-7E-1A,C0053-8E-1A,C0058-4E-1A,C0059-4E-1A,C0106-6E-1A,C0107-4E-1A,C0123-3E-1A,C0126-9E-1A,...,C1102-1F-1A,C1109-4F-1A,C1115-4F-1A,C1135-4F-1A,C1160-3F-1A,C1177-4F-1A,C1219-3F-1A,C2002-3E-1A,C2014-2E-1A,C2018-3E-1A
0,D_0__Bacteria; D_1__Bacteroidetes; D_2__Bacter...,0.0,1163.0,0.0,0.0,0.0,0.0,0.0,214.0,1476.0,...,369.0,0.0,0.0,0.0,9.0,336.0,0.0,577.0,0.0,376.0
1,D_0__Bacteria; D_1__Proteobacteria; D_2__Gamma...,0.0,5.0,42.0,0.0,0.0,7.0,0.0,0.0,0.0,...,4.0,0.0,637.0,593.0,1559.0,0.0,14633.0,18.0,1574.0,13.0
2,D_0__Bacteria; D_1__Bacteroidetes; D_2__Bacter...,0.0,1595.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,...,826.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8823.0
3,D_0__Bacteria; D_1__Bacteroidetes; D_2__Bacter...,0.0,114.0,0.0,0.0,0.0,0.0,0.0,79.0,533.0,...,420.0,19.0,0.0,32.0,0.0,0.0,1801.0,889.0,0.0,0.0
4,D_0__Bacteria; D_1__Bacteroidetes; D_2__Bacter...,1289.0,0.0,0.0,2493.0,5670.0,4056.0,1143.0,0.0,0.0,...,831.0,0.0,0.0,0.0,0.0,13995.0,0.0,198.0,0.0,0.0


In [24]:
# clean taxonomic names
# keep only species
amplicon_species = amplicon
amplicon_species["taxa"] = amplicon_species['taxa'].str.split("D_6__", expand=True)[1]

In [25]:
amplicon_species = amplicon.dropna()

In [26]:
# remove if genus is less than 5 characters
remove = list(set([name for name in amplicon_species["taxa"] if len(name)<5]))

In [27]:
# remove if numbers in genus
remove2 = list(set([name for name in amplicon_species["taxa"] if not name.isalpha()]))
# remove if Candidatus in genus
remove3 = list(set([name for name in amplicon_species["taxa"] if "Candidatus" in name]))
remove_list = remove + remove2 + remove3 + ["uncultured", "Family", "None", "Clade", 
                                            "Subgroup", "Ruminococcaceae"]
amplicon_species = amplicon_species[~amplicon_species.taxa.isin(remove_list)]

In [28]:
amplicon_species.head()

Unnamed: 0,taxa,C0047-7E-1A,C0052-7E-1A,C0053-8E-1A,C0058-4E-1A,C0059-4E-1A,C0106-6E-1A,C0107-4E-1A,C0123-3E-1A,C0126-9E-1A,...,C1102-1F-1A,C1109-4F-1A,C1115-4F-1A,C1135-4F-1A,C1160-3F-1A,C1177-4F-1A,C1219-3F-1A,C2002-3E-1A,C2014-2E-1A,C2018-3E-1A
121,metagenome,50.0,47.0,90.0,247.0,154.0,0.0,0.0,33.0,156.0,...,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,836.0
152,metagenome,0.0,0.0,0.0,941.0,72.0,108.0,0.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2281.0,0.0,0.0
270,unidentified,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447,metagenome,91.0,0.0,10.0,0.0,0.0,95.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
470,metagenome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,71.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
species_amplicon_list = set(amplicon_species["taxa"])
species_mgx_list = set(mgx_species["taxa"])

In [37]:
species_intersection = [taxa for taxa in species_amplicon_list.intersection(species_mgx_list) if taxa not in exclude]
species_union = [taxa for taxa in species_amplicon_list.union(species_mgx_list) if taxa not in exclude]

In [38]:
species_amp_only = [taxa for taxa in species_amplicon_list.difference(species_intersection) if taxa not in exclude]
species_mgx_only = [taxa for taxa in species_mgx_list.difference(species_intersection) if taxa not in exclude]

In [39]:
len(species_intersection) # genera found in both
len(species_amp_only) # genera found in amplicon only
len(species_mgx_only) # genera found in mgx only

0

2

385