In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import janitor
import numpy as np
import matplotlib.pyplot as plt

import os
import glob

In [2]:
# read in original data

path = '/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/*'                     
all_files = [name for name in glob.glob(path)]   

In [3]:
df_from_each_file = []
unclassified_list = []

for f in all_files:
    print(f)
    df = pd.read_csv(f, sep = '\t', skiprows=[0,1,2], usecols = ['#clade_name','relative_abundance']) # read in dataframe 
    
    id_1 = f.split('profiles/')[1] # add sample id from filename 
    id_2 = id_1.split('_S')[0]
    df["sampleid"] = id_2
    
    df.rename(columns = {'#clade_name':'taxa', 'relative_abundance':'abundance'}, inplace = True) 
    
    df = df[df['taxa'].str.contains("\|g__")] # keep genera
    df = df[~df['taxa'].str.contains("\|s__")] # keep species
    df["taxa"] = df['taxa'].str.split("\|g__").str[-1]
    df["taxa"] = df['taxa'].str.split("\|s__").str[0]
    
    # percent unclassified organisms profiled with mgx
    unclassified = sum(df.taxa.str.contains("_unclassified| noname | Candidatus"))/df.shape[0]
    unclassified_list.append(unclassified)
    
    # remove unclassified
    df = df[~df.taxa.str.contains("_unclassified")]
    df = df[~df.taxa.str.contains("_noname")]
    df = df[~df.taxa.str.contains("Candidatus")]
    
    # combine together taxa of the same genera
    df = df.groupby(['taxa', 'sampleid'])['abundance'].sum().reset_index()    
    
    # convert to relative abundance
    df["abundance"] = df["abundance"]/100.0
    
    df_from_each_file.append(df)

/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_1000k_1_profile.tsv
/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_1000k_2_profile.tsv
/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_1000k_3_profile.tsv
/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_1000k_4_profile.tsv
/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_100k_1_profile.tsv
/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_100k_2_profile.tsv
/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_100k_3_profile.tsv
/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_100k_4_profile.tsv
/Users/danielle/Documents/thesis/subsampled_analysis/subsample_2_profiles/C0005_3F_1A_10k_2_profile.tsv


ZeroDivisionError: division by zero

In [None]:
# averaging mean unclassified across all dataframes
np.mean(unclassified_list)*100

In [None]:
original = pd.concat(df_from_each_file, ignore_index=True) # concat all dataframes together

In [None]:
original_reshaped = original.pivot_table(index = "sampleid", values="abundance", columns = "taxa") # pivot

In [None]:
original_reshaped.head()

In [None]:
original_reshaped.columns.name = None

In [None]:
original_reshaped = original_reshaped.rename_axis(None, axis=1).reset_index()

In [None]:
original_reshaped = original_reshaped.fillna(0)  # fill in missing values with 0

In [None]:
original_reshaped["uid"] = original_reshaped["sampleid"].astype(str)+'-original'# add unique identifier

In [None]:
original_reshaped.head(15)

In [None]:
subsample = pd.read_csv("~/Documents/thesis/subsampled_analysis/merged_abundance_table.tsv", sep='\t')

In [None]:
subsample["ID"][8]

In [None]:
# only keep genera
subsample = subsample[subsample['ID'].str.contains("\|g__")]
subsample = subsample[~subsample['ID'].str.contains("\|s__")]
subsample = subsample[~subsample['ID'].str.contains("\|t__")]

In [None]:
# cleaning genera name
subsample["ID"] = subsample['ID'].str.split("\|g__").str[-1]
subsample["ID"] = subsample['ID'].str.split("\|s__").str[0]

In [None]:
subsample.head()

In [None]:
subsample_transposed = subsample.set_index('ID').transpose()

In [None]:
subsample_transposed.reset_index(level=0, inplace=True)

In [None]:
subsample_transposed.rename(columns = {'index':'sampleid'}, inplace = True) 

In [None]:
subsample_transposed.head()

In [None]:
# melt dataframes
subsample_melt = pd.melt(subsample, id_vars=["ID"], var_name = "sampleid", value_name = "abund")

In [None]:
subsample_melt.sample(5)

In [None]:
subsample_melt["replicate"] = subsample_melt['sampleid'].str.split("k_").str[-1]
subsample_melt["replicate"] = subsample_melt['replicate'].str.split("_profile").str[0]

In [None]:
subsample_melt["read_depth"] = subsample_melt['sampleid'].str.split("1A_").str[-1]
subsample_melt["read_depth"] = subsample_melt['read_depth'].str.split("k_").str[0]

In [None]:
subsample_melt.to_csv('subsample_cleaned.csv')

In [None]:
# add metadata to transposed dataframe
subsample_transposed["replicate"] = subsample_transposed['sampleid'].str.split("k_").str[-1]
subsample_transposed["replicate"] = subsample_transposed['replicate'].str.split("_profile").str[0]
subsample_transposed["read_depth"] = subsample_transposed['sampleid'].str.split("1A_").str[-1]
subsample_transposed["read_depth"] =subsample_transposed['read_depth'].str.split("k_").str[0]


In [None]:
subsample_transposed['read_depth'] = (subsample_transposed['read_depth'].astype(str).astype(int))*1000.0

In [None]:
subsample_transposed.head()

In [None]:
# fix '_' in sampleid, sample names
subsample_transposed["sampleid"] = subsample_transposed["sampleid"].str.replace("_",'-')

In [None]:
subsample_transposed["uid"] = subsample_transposed["sampleid"]


In [None]:
subsample_transposed["sampleid"] = subsample_transposed["sampleid"].str.split("-10").str[0]

In [None]:
subsample_transposed.drop(columns=['replicate'], inplace= True)

In [None]:
subsample_transposed.head()

In [None]:
# mgx data
original = pd.read_csv("~/Documents/thesis/analysis/mgx_abundance.csv")

In [None]:
original_transposed = original.set_index("taxa").transpose()
original_transposed.reset_index(level=0, inplace=True)
original_transposed.rename(columns = {'index':'sampleid'}, inplace = True) 
original_transposed["uid"] = original_transposed["sampleid"].astype(str)+'-original'

In [None]:
# adding age metadata
age = pd.read_csv("~/Documents/thesis/theoretical/sorted_babies.csv", index_col = 0)

In [None]:
age.head()

In [None]:
# remove shannon column, change characters in sample names 
age.drop(columns=['shannon'], inplace= True)
age["sample"] = age["sample"].str.replace("_",'-')
age.rename(columns = {'sample':'sampleid', "reads":"read_depth"}, inplace = True) 

In [None]:
age.head(15)

In [None]:
# make age dictionary
agedict = {str(s): {} for s in age["sampleid"]}
for index, row in age.iterrows():
    age_months = row["AgeMonths"]
    agedict[row["sampleid"]]= age_months

In [None]:
# read_depth dictionary
readdict = {str(s): {} for s in age["sampleid"]}
for index, row in age.iterrows():
    reads = row["read_depth"]
    readdict[row["sampleid"]]= reads

In [None]:
# dev_stage dictionary
dev_stage_dict = {str(s): {} for s in age["dev_stage"]}
for index, row in age.iterrows():
    stage = row["dev_stage"]
    dev_stage_dict[row["sampleid"]] = stage

In [None]:
original_reshaped["AgeMonths"]= original_reshaped["sampleid"].map(agedict)
original_reshaped["read_depth"]= original_reshaped["sampleid"].map(readdict)
original_reshaped["dev_stage"]= original_reshaped["sampleid"].map(dev_stage_dict)
original_reshaped["sampling_cat"] = "original depth"

In [None]:
original_reshaped.head(15)

In [None]:
subsample_transposed["AgeMonths"]= subsample_transposed["sampleid"].map(agedict)
subsample_transposed["dev_stage"]= subsample_transposed["sampleid"].map(dev_stage_dict)
subsample_transposed["sampling_cat"] = subsample_transposed["read_depth"]

In [None]:
subsample_transposed.head(15)

In [None]:
concat_df = pd.concat([original_reshaped,subsample_transposed], sort=True).reset_index(drop = True)

In [None]:
cols_to_order = ['uid', 'sampleid', 'read_depth', 'AgeMonths', 'dev_stage', 'sampling_cat']
new_columns = cols_to_order + (concat_df.columns.drop(cols_to_order).tolist())
concat_df = concat_df[new_columns]

In [None]:
# remove mothers
# concat_df = concat_df[~concat_df.sampleid.str.contains("M")]

In [None]:
concat_df.sample(15)

In [None]:
concat_df.to_csv('subsampled_df_2.csv')