In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import janitor
import numpy as np
import rpy2.robjects as robjects

In [None]:
# Load the sample metadata table. The cells that follow transpose it and
# promote the first row of the transposed table to column headers.
metadata = pd.read_csv("diabimmune_metadata.csv")

In [None]:
# Flip rows and columns of the metadata table.
metadata = metadata.T

In [None]:
# Use the first row of the transposed table as the header, then remove that
# row (it is labelled "Unnamed: 0" in the index).
header_row = metadata.iloc[0]
metadata.columns = header_row
metadata = metadata.drop(index=["Unnamed: 0"])

In [None]:
metadata.head()

In [None]:
# Lookup table from 16S sample ids (gid_16s, the keys) to metagenomic
# sample ids (gid_wgs, the values).
wgs_ids = metadata.gid_wgs.values
id_mapping = pd.Series(wgs_ids, index=metadata.gid_16s).to_dict()

## Importing metagenomics data

### Genus 

In [None]:
# Read the genus-level metagenomic table and call its taxon column "taxa".
mgx_genus = pd.read_csv("diabimmune_mgx_genus.csv").rename(columns={'taxname': 'taxa'})

In [None]:
# mgx_genus = mgx[mgx['taxa'].str.contains("\|g__")] # keep rows resolved to genus level or below
# mgx_genus = mgx_genus[~mgx_genus['taxa'].str.contains("\|s__")] # drop species-level rows
# mgx_genus["taxa"] = mgx_genus['taxa'].str.split("\|g__").str[-1]
# mgx_genus["taxa"] = mgx_genus['taxa'].str.split("\|s__").str[0]

In [None]:
# Drop genus rows without a usable name: "_unclassified" / "_noname"
# placeholders, viruses, and candidate taxa. Each pattern is applied in turn.
for unwanted in ("_unclassified", "_noname", "virus", "Candidatus", "candidate"):
    keep_rows = ~mgx_genus.taxa.str.contains(unwanted)
    mgx_genus = mgx_genus[keep_rows]

In [None]:
# Collapse rows that share a genus label by summing them, then turn the
# genus label back into an ordinary column.
genus_totals = mgx_genus.groupby(['taxa']).sum()
mgx_genus = genus_totals.reset_index()

In [None]:
mgx_genus.head()

### Families

In [None]:
# Load the family-level metagenomic table. Its taxon column is left under the
# name "taxname", which is what the family comparison further down reads.
mgx_family = pd.read_csv("diabimmune_mgx_family.csv")

In [None]:
mgx_family.head()

### Looking at species in metagenomic data

In [None]:
# mgx_species = mgx[mgx['taxa'].str.contains("\|s__")] # keep rows resolved to species level or below
# mgx_species = mgx_species[~mgx_species['taxa'].str.contains("\|t__")] # drop rows carrying a t__ level

# mgx_species["taxa"] = mgx_species['taxa'].str.split("\|s__").str[-1]

In [None]:
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("_unclassified")]
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("_noname")]
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("virus")]
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("Candidatus")]
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("candidate")]

In [None]:
# mgx_species = mgx_species.groupby(['taxa']).sum().reset_index() 

In [None]:
# Load the species-level metagenomic table. The taxon column keeps the name
# "taxname" (the rename below is disabled), which is what the species
# comparison further down reads.
mgx_species = pd.read_csv("diabimmune_mgx_species.csv")
# mgx_genus.rename(columns={'taxname': 'taxa'}, inplace=True)

In [None]:
mgx_species.head()

In [None]:
len(mgx_species)

## Importing amplicon data

### Genus

In [None]:
# Read the genus-level 16S (amplicon) table and call its taxon column "taxa".
amp_genus = pd.read_csv("karalia_dada2_genera.csv").rename(columns={'genus': 'taxa'})

In [None]:
amp_genus.head()

In [None]:
# Drop genus rows whose label is missing or uninformative.
#
# Missing / empty labels are removed FIRST: a NaN label makes `str.contains`
# return NaN, and negating a mask that contains NaN with `~` fails. `na=False`
# additionally treats any remaining non-string label as "no match".
has_label = amp_genus["taxa"].notna() & (amp_genus["taxa"] != "")
amp_genus = amp_genus[has_label]

# "_unclassified" / "_noname" placeholders, viruses, candidate taxa,
# labels that contain a digit, and "... group" bins.
unwanted_patterns = ["_unclassified", "_noname", "virus", "Candidatus", r'[0-9]', "group"]
for pattern in unwanted_patterns:
    amp_genus = amp_genus[~amp_genus["taxa"].str.contains(pattern, na=False)]

In [None]:
# Strip leading/trailing square brackets from genus labels.
# `assign` returns a new frame, so nothing is written into the filtered slice
# produced by the previous cell (the original also repeated the assignment
# target twice).
amp_genus = amp_genus.assign(taxa=amp_genus["taxa"].str.strip('[]'))

In [None]:
# Drop rows whose genus label contains a digit.
# The filtered frame must replace `amp_genus` itself; assigning it to the
# `taxa` column tries to store a whole multi-column DataFrame in one column
# and removes no rows.
amp_genus = amp_genus[~amp_genus["taxa"].str.contains(r'\d', na=False)]

In [None]:
# Sum rows that share a genus label, then restore "taxa" as a regular column.
amp_genus_totals = amp_genus.groupby(['taxa']).sum()
amp_genus = amp_genus_totals.reset_index()

In [None]:
# Relabel sample columns from 16S ids to metagenomic ids via id_mapping;
# columns that are not keys of the mapping keep their current label.
amp_genus = amp_genus.rename(columns=id_mapping)

In [None]:
# Keep only columns whose label is not null (a 16S id that maps to a missing
# metagenomic id ends up with a null label after the rename).
labelled_columns = amp_genus.columns.notna()
amp_genus = amp_genus.loc[:, labelled_columns]

In [None]:
amp_genus.head()

In [None]:
amp_genus.shape

In [None]:
mgx_genus.shape

### amplicon family

In [None]:
# amp_family = amp[amp['taxa'].str.contains("\|f__")] # keep families genera
# amp_family = amp_family[~amp_family['taxa'].str.contains("\|g__")] # remove genera

# amp_family["taxa"] = amp_family['taxa'].str.split("\|f__").str[-1]

In [None]:
# amp_family = amp_family[~amp_family.taxa.str.contains("_unclassified")]
# amp_family = amp_family[~amp_family.taxa.str.contains("_noname")]
# amp_family = amp_family[~amp_family.taxa.str.contains("virus")]
# amp_family = amp_family[~amp_family.taxa.str.contains("Candidatus")]
# amp_family = amp_family[~amp_family.taxa.str.contains(r'[0-9]')]
# amp_family = amp_family[~amp_family.taxa.str.contains("group")]
# amp_family['taxa'].replace('', np.nan, inplace=True)
# amp_family.dropna(subset=['taxa'], inplace=True)

In [None]:
# amp_family = amp_family.groupby(['taxa']).sum().reset_index() 

In [None]:
# amp_family.sort_values(by = "taxa", ascending=True, inplace=True)

In [None]:
# Read the family-level 16S table and call its taxon column "taxa".
amp_family = pd.read_csv("karalia_dada2_family.csv").rename(columns={'family': 'taxa'})

In [None]:
amp_family.head()

In [None]:
# ratio of unclassified genera for 16S profiling
# sum(amplicon.taxa.str.contains("_unclassified| unidentified| uncultured| group"))/amplicon.shape[0]

In [None]:
# amplicon.to_csv('16S_abundance.csv')

### amplicon species

In [None]:
# Read the species-level 16S table and call its taxon column "taxa".
amp_species = pd.read_csv("karalia_dada2_species.csv").rename(columns={'species': 'taxa'})

In [None]:
# Remove the "s__" rank marker from the species labels.
species_labels = amp_species['taxa']
amp_species["taxa"] = species_labels.str.replace('s__', '')

In [None]:
amp_species.head()

### looking for the intersection of taxonomy at genus level
#### calculate number of taxa in both vs. either one

In [None]:
# calculate number of taxa in both vs. either one

In [None]:
# Distinct genus labels reported by each method.
amplicon_genera_list = {*amp_genus["taxa"]}
mgx_genera_list = {*mgx_genus["taxa"]}

In [None]:
# Genera found by both methods, and genera found by at least one of them.
intersection_genera = amplicon_genera_list & mgx_genera_list
union_genera = amplicon_genera_list | mgx_genera_list

In [None]:
len(intersection_genera)
len(amplicon_genera_list)-len(intersection_genera)
len(mgx_genera_list)-len(intersection_genera)

In [None]:
# Genera reported by exactly one of the two methods.
amplicon_genera = amplicon_genera_list - intersection_genera
mgx_genera = mgx_genera_list - intersection_genera

In [None]:
len(amplicon_genera)
len(mgx_genera)

### looking for the intersection of taxonomy at family level

In [None]:
# Distinct family labels reported by each method.
amplicon_family_list = {*amp_family["taxa"]}
mgx_family_list = {*mgx_family["taxname"]}

# Families found by both methods / by at least one method.
intersection_family = amplicon_family_list & mgx_family_list
union_family = amplicon_family_list | mgx_family_list

# Counts: shared, amplicon-only, metagenomics-only.
len(intersection_family)
len(amplicon_family_list - mgx_family_list)
len(mgx_family_list - amplicon_family_list)

In [None]:
# Families reported by exactly one of the two methods.
# The metagenomics-only set gets its own name: storing it in `mgx_family`
# would replace the loaded DataFrame with a set, so the cell above (which
# reads mgx_family["taxname"]) could not be re-run.
amplicon_family = amplicon_family_list.difference(intersection_family)
mgx_only_family = mgx_family_list.difference(intersection_family)

len(amplicon_family)
len(mgx_only_family)

### looking for the intersection of taxonomy at species level

In [None]:
mgx_species.head()

In [None]:
amp_species.head()

In [None]:
# Distinct species labels reported by each method.
amplicon_species_list = {*amp_species["taxa"]}
mgx_species_list = {*mgx_species["taxname"]}

# Species found by both methods / by at least one method.
intersection_species = amplicon_species_list & mgx_species_list
union_species = amplicon_species_list | mgx_species_list

# Counts: shared, amplicon-only, metagenomics-only.
len(intersection_species)
len(amplicon_species_list - mgx_species_list)
len(mgx_species_list - amplicon_species_list)

In [None]:
# Species reported by exactly one of the two methods.
# The metagenomics-only set gets its own name: storing it in `mgx_species`
# would replace the loaded DataFrame with a set, so the cell above (which
# reads mgx_species["taxname"]) could not be re-run.
amplicon_species = amplicon_species_list.difference(intersection_species)
mgx_only_species = mgx_species_list.difference(intersection_species)

len(amplicon_species)
len(mgx_only_species)

### creating long-form dataframe with data from both sequencing methods

In [None]:
mgx_genus.head()

In [None]:
amp_genus.head()

In [None]:
# Column labels present in both genus tables, i.e. the samples profiled by
# both methods (the shared "taxa" column is part of this set as well).
mgx_columns = set(mgx_genus.columns.values)
amp_columns = set(amp_genus.columns.values)
samples_intersect = mgx_columns & amp_columns

In [None]:
# remove mothers (labels starting with "M").
# The result is sorted because iterating a set of strings yields an order that
# can change between kernel sessions, which would otherwise make the column
# order of every table built from this list irreproducible.
children_intersect = sorted(
    label for label in samples_intersect if not label.startswith("M")
)

In [None]:
len(amp_genus.columns.values)

In [None]:
mgx_genus.columns.values

In [None]:
samples_intersect

In [None]:
len(samples_intersect)

In [None]:
metadata.head()

In [None]:
# Restrict both genus tables to the shared, non-mother columns.
mgx = mgx_genus.loc[:, children_intersect]
amplicon = amp_genus.loc[:, children_intersect]

In [None]:
# confirm that we now have the same samples: compare the two FILTERED tables
# (`amplicon` and `mgx`), not the unfiltered inputs they were cut from.
# Evaluates to a single True/False.
amplicon.columns.equals(mgx.columns)

In [None]:
# Long format: one row per (taxon, sample) with the 16S abundance.
amplicon_melt = amplicon.melt(
    id_vars=["taxa"],
    var_name="sampleid",
    value_name="amplicon_abund",
)

In [None]:
# Long format: one row per (taxon, sample) with the metagenomic abundance.
mgx_melt = mgx.melt(
    id_vars=["taxa"],
    var_name="sampleid",
    value_name="mgx_abund",
)

In [None]:
# Outer join keeps (sample, taxon) pairs seen by only one method; the other
# method's abundance is NaN for those rows.
merged_taxa = amplicon_melt.merge(mgx_melt, on=["sampleid", "taxa"], how="outer")

In [None]:
# Write the merged long-format table to disk. This runs before the difference
# columns are added in the next cell, so the file holds only the taxa,
# sampleid and two abundance columns (plus the row index).
merged_taxa.to_csv('taxa_abundance_comparison.csv')

In [None]:
# Signed (amplicon minus metagenomic) and absolute abundance differences
# for every (sample, taxon) row.
abund_gap = merged_taxa["amplicon_abund"] - merged_taxa["mgx_abund"]
merged_taxa["abs_diff"] = abund_gap.abs()
merged_taxa["tot_diff"] = abund_gap

In [None]:
merged_taxa.sample(10)

In [None]:
# A taxon reported by only one method has NaN for the other method after the
# outer merge; count it as zero abundance. `fillna` returns a new object, so
# the result has to be stored (a bare `merged_taxa.fillna(0)` changes nothing).
abund_cols = ["amplicon_abund", "mgx_abund"]
merged_taxa[abund_cols] = merged_taxa[abund_cols].fillna(0)

# Recompute the differences from the filled abundances so that rows which
# were NaN now carry the real gap instead of NaN.
merged_taxa["abs_diff"] = (merged_taxa["amplicon_abund"] - merged_taxa["mgx_abund"]).abs()
merged_taxa["tot_diff"] = merged_taxa["amplicon_abund"] - merged_taxa["mgx_abund"]

merged_taxa.head()

In [None]:
# Mean abundance of each taxon across samples, per method, plus the sorted
# list of distinct taxon labels.
by_taxon = merged_taxa.groupby("taxa")
amplicon_avg_abund = by_taxon["amplicon_abund"].mean()
mgx_avg_abund = by_taxon["mgx_abund"].mean()
taxa_list = sorted({*merged_taxa["taxa"]})

In [None]:
# One row per taxon with the mean abundance from each method.
# The two per-taxon means are joined on their shared "taxa" index rather than
# zipped positionally with a separately sorted label list, so a label can
# never be paired with another taxon's values.
mean_taxa_abund = (
    pd.concat(
        [
            amplicon_avg_abund.rename('amp_avg_abund'),
            mgx_avg_abund.rename('mgx_avg_abund'),
        ],
        axis=1,
    )
    .rename_axis('taxa')
    .reset_index()
)

In [None]:
# Signed (amplicon minus metagenomic) and absolute gap between the per-taxon means.
avg_gap = mean_taxa_abund["amp_avg_abund"] - mean_taxa_abund["mgx_avg_abund"]
mean_taxa_abund["abs_diff"] = avg_gap.abs()
mean_taxa_abund["total_diff"] = avg_gap

In [None]:
# Order taxa from smallest to largest absolute gap; rows with a missing gap go last.
mean_taxa_abund = mean_taxa_abund.sort_values(
    "abs_diff",
    axis=0,
    ascending=True,
    na_position='last',
)

In [None]:
mean_taxa_abund.sample(10)

In [None]:
# Save the per-taxon mean abundances and their differences, in the sorted
# order produced above, to the current working directory.
mean_taxa_abund.to_csv('taxa_difference.csv')

## Building one combined wide-format abundance table (one row per sample and method)

In [None]:
# Samples as rows, genera as columns.
amp_trans = amplicon.set_index("taxa").T

In [None]:
# Move the sample labels out of the index into a regular column.
amp_trans = amp_trans.reset_index(level=0)

In [None]:
# The column created by reset_index is called "index"; name it "sampleid".
amp_trans = amp_trans.rename(columns={'index': 'sampleid'})

In [None]:
# Unique row id: the sample id tagged with the sequencing method.
amp_sample_labels = amp_trans["sampleid"].astype(str)
amp_trans["uid"] = amp_sample_labels + '-amp'

In [None]:
# Mark every row of this table as coming from amplicon (16S) sequencing.
amp_trans = amp_trans.assign(method="amp")

In [None]:
amp_trans.head()

In [None]:
# Samples as rows, genera as columns.
mgx_trans = mgx.set_index("taxa").T

In [None]:
# Move the sample labels out of the index into a regular column.
mgx_trans = mgx_trans.reset_index(level=0)

In [None]:
# The column created by reset_index is called "index"; name it "sampleid".
mgx_trans = mgx_trans.rename(columns={'index': 'sampleid'})

In [None]:
# Unique row id: the sample id tagged with the sequencing method.
mgx_sample_labels = mgx_trans["sampleid"].astype(str)
mgx_trans["uid"] = mgx_sample_labels + '-mgx'

In [None]:
# Mark every row of this table as coming from metagenomic sequencing.
mgx_trans = mgx_trans.assign(method="mgx")

In [None]:
mgx_trans.head()

In [None]:
# Stack the metagenomic rows on top of the amplicon rows (columns sorted and
# aligned by name), then renumber the rows from zero.
stacked = pd.concat([mgx_trans, amp_trans], sort=True)
concat_df = stacked.reset_index(drop=True)

In [None]:
len(concat_df)

In [None]:
# Load the wide metadata table; only its `sample` and `childAgeMonths`
# columns are used by the cells below.
# NOTE(review): this path points into one user's home directory, unlike the
# relative paths used for every other input, so the notebook only runs where
# that directory layout exists; consider a configurable data directory.
age = pd.read_csv("~/Documents/thesis/analysis/metadatawide.csv")

In [None]:
# Keep only the sample id and the child's age in months. The explicit copy
# makes this an independent frame, so the column edits in the next cell do not
# write into a slice of the loaded table (SettingWithCopyWarning).
age = age[['sample','childAgeMonths']].copy()

In [None]:
# Make the sample ids use "-" instead of "_" and call the column "sampleid".
# Built as a new frame (no in-place write into a column subset), and
# regex=False pins the replacement to a literal "_" regardless of the pandas
# version's default.
age = (
    age.assign(sample=age["sample"].str.replace("_", '-', regex=False))
       .rename(columns={'sample': 'sampleid'})
)

In [None]:
# make age dictionary: sampleid -> childAgeMonths.
# Built in one pass from the two columns. The earlier version pre-filled every
# key with an empty dict and then overwrote it row by row, which left a stray
# `{}` value for any id whose str() form differed from the id itself.
# For a repeated sampleid the last row wins, as before.
agedict = dict(zip(age["sampleid"], age["childAgeMonths"]))

In [None]:
# Look up each row's age by sample id; ids absent from agedict become NaN.
sample_ages = concat_df["sampleid"].map(agedict)
concat_df["AgeMonths"] = sample_ages

In [None]:
# Put the identifying columns first, followed by every other column in its
# existing order.
cols_to_order = ['uid', 'sampleid', "method", "AgeMonths"]
trailing_columns = concat_df.columns.drop(cols_to_order).tolist()
new_columns = [*cols_to_order, *trailing_columns]
concat_df = concat_df[new_columns]

In [None]:
concat_df.sample(15)

In [None]:
# Save the combined table (one row per sample and method, identifying
# columns first) to the current working directory.
concat_df.to_csv('transposed_mgxamp_df.csv')

In [None]:
# Show the current working directory, which is where the relative-path CSV
# outputs above are written.
# NOTE(review): `pwd` is an IPython magic rather than a Python function;
# confirm this call form works in the target kernel.
pwd()