Due to the high frequency of "unknown" in the taxonomic annotation (at the genus level), we will try to better understand the source of this problem and the alternatives available to us.
First, we will compare the frequency of "unknown" in relative abundance when using GTDB vs. Greengenes.
Second, we will try to assign taxonomy using MGBC and see how much improvement we get.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from data.taxonomy_normalizer import NaiveTaxonomyNormalizer, preprocess_filter_rare_taxa


# Observed taxonomic abundance table and sample metadata; both are
# re-indexed by their '#SampleID' column after loading.
taxa = pd.read_csv('taxonomic_observed_abundance_HaddadOSA.csv')
taxa = taxa.set_index('#SampleID')
metadata = pd.read_csv('relevant_metadata_haddad_osa.csv')
metadata = metadata.set_index('#SampleID')

# Tab-separated OTU table that carries a 'taxonomy' column
# (presumably Greengenes lineages, judging by the variable name).
sotu_greengenes = pd.read_csv(
    "original/haddad_6weeks_deblur_otus_rare2k_matched_wtaxa.txt",
    sep='\t',
)


In [2]:
# Sample IDs whose metadata 'control' flag is truthy; used as a boolean row mask.
control_samples = metadata[metadata['control']].index
# Restrict the abundance table to those control samples (all columns kept).
taxa = taxa.loc[control_samples]

In [3]:
# Settings forwarded to the project's NaiveTaxonomyNormalizer.
# NOTE(review): exact semantics of the threshold live in
# data.taxonomy_normalizer -- confirm there before changing these values.
DROP_UNKNOWN_TAXA = False
UNKNOWN_TH = 1.0
naive_normalizer = NaiveTaxonomyNormalizer(
    unknown_taxa_sample_threshold=UNKNOWN_TH,
    drop_unknown_taxa=DROP_UNKNOWN_TAXA,
)

# Filter rare taxa first (percentage=10 is passed to the project helper),
# then normalize the filtered table.
taxa = preprocess_filter_rare_taxa(
    taxa,
    percentage=10,
    verbose=False,
)
relative_abundance = naive_normalizer.normalize(taxa)

In [14]:
# Build a (sample x genus) table from the OTU table that carries a
# 'taxonomy' lineage column.
sotu_greengenes = pd.read_table("original/haddad_6weeks_deblur_otus_rare2k_matched_wtaxa.txt", sep='\t')
sotu_greengenes = sotu_greengenes.drop(columns=['#OTU ID'])

# Reduce each lineage string to its genus field ('g__<name>').
# The pattern is anchored to a field boundary (start of string or ';') and
# stops at the next ';', so that:
#   * a lineage that ENDS at the genus (no trailing ';') is still recognised
#     instead of being counted as 'Unknown';
#   * nothing after the genus field (e.g. the species field) can leak into
#     the label;
#   * an empty genus field ('g__;' or a bare 'g__') does not match and is
#     therefore labelled 'Unknown'.
sotu_greengenes['taxonomy'] = (
    sotu_greengenes['taxonomy']
    .str.extract(r'(?:^|;)\s*(g__[^;\s][^;]*)', expand=False)
    .str.strip()  # drop whitespace left before the next ';' or at end of line
    .fillna(value='Unknown')
)

# Sum all OTUs that share a genus label, then transpose so rows are samples.
sotu_greengenes = sotu_greengenes.groupby('taxonomy').sum().T
# Keep only the control samples selected earlier in the notebook.
sotu_greengenes = sotu_greengenes.loc[control_samples, :]


In [15]:
# Apply the same rare-taxa filter (percentage=10) and the same normalizer
# instance to the Greengenes-derived table, so both tables go through an
# identical pipeline.
taxa_greengenes = preprocess_filter_rare_taxa(
    sotu_greengenes,
    percentage=10,
    verbose=True,
)
relative_abundance_greengenes = naive_normalizer.normalize(taxa_greengenes)

In [16]:
# Summary statistics (.describe()) of the 'Unknown' entry of the normalized
# table built from `taxa`; presumably the per-sample relative abundance of
# unassigned taxa -- confirm against NaiveTaxonomyNormalizer.normalize.
relative_abundance['Unknown'].describe()


In [17]:
# Same summary for the Greengenes-derived table, for side-by-side comparison
# with the 'Unknown' summary of `relative_abundance`.
relative_abundance_greengenes['Unknown'].describe()