# ANCOM

In [1]:
import sys
import os
import shutil

import pandas as pd
from qiime2 import Visualization

In [2]:
# ANCOM visualization to unpack and the scratch directory that receives its files
qzv_in = "ancom_treatment_full_3.qzv"
outdir = "tmp"

# Load the visualization and export its contents into outdir;
# the following cell reads the exported TSV files from there
qzv = Visualization.load(qzv_in)
qzv.export_data(outdir)

In [3]:
# Build a single table from the three TSV files exported into outdir.

# ANCOM results without the "W" column (a "W" column is still selected from the
# merged table later on, so it presumably comes from data.tsv -- TODO confirm).
# A "Group" ROW filled with "-" is appended to avoid NaNs after the merge;
# presumably it lines up with a "Group" row in percent-abundances.tsv.
# The scalar is broadcast to however many columns the file has.
df_ancom = pd.read_csv(f"{outdir}/ancom.tsv", sep="\t", index_col=0)
df_ancom = df_ancom.drop(columns=["W"])
df_ancom.loc["Group"] = "-"

# Data table, with the same "-" placeholder row
df_data = pd.read_csv(f"{outdir}/data.tsv", sep="\t", index_col=0)
df_data.loc["Group"] = "-"

# Percent abundances
df_percent_abundances = pd.read_csv(f"{outdir}/percent-abundances.tsv", sep="\t", index_col=0)

# Join the three tables side by side, aligned on their row index
df_ancom_data = pd.concat([df_ancom, df_data], axis=1)

df = pd.concat([df_ancom_data, df_percent_abundances], axis=1)

In [4]:
# Delete the scratch directory once the tables are in memory.
# The literal "tmp" is the value given to outdir in the export cell above.
shutil.rmtree("tmp")

In [5]:
# Get differentially expressed taxa
significative_df = df[df["Reject null hypothesis"] == True].loc[:,["Reject null hypothesis", "clr", "W"]]
significative_taxa = list(significative_df.index)

In [6]:
significative_df

Unnamed: 0,Reject null hypothesis,clr,W


In [7]:
type(significative_taxa)

list

# Prevalence

In [None]:
import sys
import os

import pandas as pd
from qiime2 import Artifact

In [None]:
def normalize_dataframe(dataframe, criteria=0):
    """
    Build an absence-presence matrix from a dataframe.

    Every cell whose value is greater than or equal to ``criteria``
    becomes 1; every other cell (missing values included) becomes 0.

    dataframe: table of numeric values. It is not modified.
    criteria: minimum value for a cell to count as present (by now, a number)

    Returns a new dataframe with the same index, columns and column
    dtypes as the input, holding only 0 and 1.
    """
    present = dataframe >= criteria

    # where() zeroes every cell below the threshold (NaN compares False, so it
    # is zeroed too) and mask() then turns the remaining cells into 1.
    # Both return new frames, so the caller's data stays intact and the
    # original column dtypes are kept.
    return dataframe.where(present, 0).mask(present, 1)

def create_category_dict(metadata):
    """
    Create, from the metadata dataframe, a dict with
    key: category (column name); val: list of the distinct values in that column.

    Missing values are ignored, and a column with fewer than two distinct
    values is left out of the dict. Values are listed in order of first
    appearance, so the result is the same on every run.

    Returns a tuple (valid_categories, category_names_list), where
    category_names_list holds every column name of the metadata,
    including the columns left out of the dict.
    """
    category_names_list = list(metadata.columns)
    valid_categories = {}

    for category_name in category_names_list:
        # dropna() first: otherwise each blank cell would count as a value of its own
        groups = metadata[category_name].dropna().unique().tolist()

        # a category is only useful if it splits the samples in 2+ groups
        if len(groups) > 1:
            valid_categories[category_name] = groups

    return valid_categories, category_names_list

In [None]:
# Inputs of the prevalence section:
#   qza_in        - QIIME 2 artifact that is loaded and viewed as a DataFrame
#   metadata_file - tab-separated metadata table
#   lvl           - number written into the output file names
#                   (presumably the taxonomic level of the collapsed table)
qza_in = "collapsed_full_table_raw_lvl_6.qza"
metadata_file = "metadata.tsv"
lvl = 6

In [None]:
# Make sure the output folder exists. An already existing folder is reused;
# any other failure (e.g. missing permissions) is raised instead of hidden.
os.makedirs("prevalence", exist_ok=True)

# Load the artifact and view it as a DataFrame
qza = Artifact.load(qza_in)
counts = qza.view(pd.DataFrame)

# Metadata table: first line is the header, first column is the row index
metadata = pd.read_csv(
    metadata_file,
    sep='\t',
    header=0,
    index_col=0
    )

# Metadata columns followed by the count columns, aligned on the row index
full_df = pd.concat([metadata, counts], axis=1)

In [None]:
counts

In [None]:
metadata

In [None]:
full_df

In [None]:
valid_categories, category_names_list = create_category_dict(metadata)

In [None]:
valid_categories

In [None]:
category_names_list

In [None]:
# For every category, compute the prevalence of each count column within each
# group of samples and write one table per category.
for category, values in valid_categories.items():

    # One single-row frame per value of this category
    prevalence_per_value = []

    for value in values:
        # Samples that belong to this value, without the metadata columns
        sub_df = full_df[full_df[category] == value].drop(category_names_list, axis=1)
        # Normalize (0: absence, 1: presence)
        norm_df = normalize_dataframe(sub_df, criteria=1)

        # Prevalence: percentage of the samples of this group in which
        # each column is present
        prevalence = (norm_df.sum(axis=0) * 100 / len(norm_df)).rename(value)
        prevalence_per_value.append(prevalence.to_frame().transpose())

        # Keep the prevalence row in norm_df as well, so that the frame can
        # still be inspected after the loop
        norm_df.loc[value] = prevalence

    prevalence_df = pd.concat(prevalence_per_value)

    # Each table holds every value of the category, so the file name carries
    # the category only (it used to include whichever value was processed last).
    # NOTE(review): a "prevalence" folder is created in an earlier cell, but the
    # files are written to the working directory -- confirm the intended location.
    prevalence_df.to_csv(f"prevalence_lvl_{lvl}_{category}_wide.tsv", sep="\t")
    prevalence_df.transpose().to_csv(f"prevalence_lvl_{lvl}_{category}_long.tsv", sep="\t")
    
    
        

In [None]:
norm_df

In [None]:
prevalence_df

# RELATIVE COUNTS

In [None]:
import shutil
import os
import sys

import pandas as pd
from qiime2 import Artifact

In [None]:
# Artifact loaded in the next cell, and the directory that the later cells
# clean up and read metadata.tsv from
filename = "collapsed_full_table_clean_lvl6.qza"
outdir = "lvl6"

In [None]:
# Open visualization
qza_artifact = Artifact.load(filename)
df = qza_artifact.view(pd.DataFrame)

In [None]:
df

In [None]:
# Add a "Total" column with the sum of each row.
# Running this cell twice would add the previous "Total" into the new sum.
df["Total"] = df.sum(axis=1)

In [None]:
df

In [None]:
# Turn every count into a percentage of its row total.
# The "Total" column is expected to be the last column; it is used as the
# divisor for every row and is itself left unchanged.
count_columns = df.columns[:-1]
df[count_columns] = df[count_columns].div(df.iloc[:, -1], axis=0) * 100

In [None]:
df

In [None]:
# Refresh "Total" as the row-wise sum of all the other columns.
# (Summing without an axis works column-wise, which cannot be stored as a
# per-row column and would only produce NaN.)
df["Total"] = df.drop(columns="Total", errors="ignore").sum(axis=1)

# Delete unwanted dirs & files
# Hardcoded but its always the same so
# NOTE(review): the step that creates these entries inside outdir is not part
# of the visible cells -- confirm they exist before running this.
dirs_to_del = ["css", "js", "q2templateassets"]

for folder in dirs_to_del:
    shutil.rmtree(f"{outdir}/{folder}")

files_to_del = ["index.html"]
for file in files_to_del:
    os.remove(f"{outdir}/{file}")

In [None]:
# Read metadata.tsv from outdir (first line as header, first column as index)
df = pd.read_csv(f"{outdir}/metadata.tsv", sep="\t", header=0, index_col=0)

# Remove the row labelled "#q2:types", then write the table as is and transposed
# NOTE(review): table_name is not assigned in any visible cell -- it has to be
# defined before this cell runs, otherwise a NameError is raised.
df = df.drop("#q2:types")
df.to_csv(f"{table_name}.tsv", sep="\t")
df.transpose().to_csv(f"{table_name}_long.tsv", sep="\t")