In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime


In [None]:
# Render floats with three decimals whenever a DataFrame/Series is displayed.
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Results are grouped in one sub-directory per run date (YYYY-MM-DD).
today = f'{datetime.today():%Y-%m-%d}'

# makedirs creates the missing parent '../results_diversity' as well, so a
# single call sets up both levels; exist_ok makes re-running the cell harmless.
os.makedirs(f'../results_diversity/{today}', exist_ok=True)

In [None]:
# MAJOR PROCESSING FUNCTIONS
def merge_tables():
    """Placeholder with no implementation yet.

    The body is a bare ``...`` (Ellipsis), so calling this function does
    nothing and returns None. No visible cell calls it.

    NOTE(review): presumably meant to merge the per-pool tables - confirm the
    intended behaviour and implement it, or remove the stub.
    """
    ...
    

In [None]:
# GENERAL VARIABLES
# Names of the twelve pools, 'POOL1' ... 'POOL12'; each one has its own
# '<POOL>.diversity_cutoff.tsv' input table.
POOL_list = ['POOL' + str(pool_number) for pool_number in range(1, 13)]


In [None]:
# NOTE: the stray `list_dfs_per[0]` preview that lived in this cell was removed.
# `list_dfs_per` is only created by the pooling cell further down, so on a fresh
# kernel (Restart & Run All) this cell raised NameError. To inspect the first
# per-pool percentage Series, evaluate `list_dfs_per[0]` after that cell has run.

In [None]:
# Attributes
# Fraction of the pools used as the missing-value limit when filtering taxa:
# the pooling cell keeps a row only when its NA count is strictly below
# int(cutoff_NA_ratio * len(POOL_list)).
cutoff_NA_ratio = 0.35

In [None]:
list_dfs_means, list_dfs_per = [], []


def _order_rows_by_median(frame):
    """Return ``frame`` with its rows sorted by decreasing per-row median."""
    # The median is the sort key because the mean was skewed by species that
    # are very abundant in only a few samples.
    keyed = frame.assign(m=frame.median(axis=1))
    return keyed.sort_values('m', ascending=False).drop(columns='m')


# Pool the per-sample tables. The raw mean and the percentage are collected
# separately (one Series per pool each) so both variables can be represented.
for POOL in POOL_list:
    pool_table = pd.read_csv(
        f'../results_diversity/{today}/{POOL}.diversity_cutoff.tsv',
        sep='\t',
        index_col='Unnamed: 0',
    ).reset_index()

    # Row label "<index> - <name>", built the same way for every pool so the
    # per-pool columns line up when concatenated.
    row_label = pool_table['index'].astype(str) + ' - ' + pool_table['name']
    pool_table = (
        pool_table[['mean (%)', 'mean']]
        .assign(**{'taxon - genus': row_label})
        .set_index('taxon - genus')
    )

    list_dfs_means.append(pool_table['mean'].rename(f'mean {POOL}'))
    list_dfs_per.append(pool_table['mean (%)'].rename(f'mean (%) {POOL}'))


# One column per pool, one row per taxon; taxa absent from a pool become NaN.
df_mean = pd.concat(list_dfs_means, axis=1)
df_per = pd.concat(list_dfs_per, axis=1)


# NA cut: keep only the taxa whose number of missing pools is strictly below
# int(cutoff_NA_ratio * number of pools). The selection is computed on the mean
# table and reused for the percentage table.
max_missing = int(cutoff_NA_ratio * len(POOL_list))
missing_per_taxon = df_mean.isna().sum(axis=1)
nonNA_index = df_mean[missing_per_taxon < max_missing].index

df_mean_nonNA = _order_rows_by_median(df_mean.loc[nonNA_index])
df_mean_nonNA.to_csv(f'../results_diversity/{today}/mean_nonNA.tsv', sep='\t')


df_per_nonNA = _order_rows_by_median(df_per.loc[nonNA_index])
df_per_nonNA.to_csv(f'../results_diversity/{today}/per_nonNA.tsv', sep='\t')



In [None]:
# Fungal taxa: gather each taxon's lineage from the per-pool tables, keep the
# taxa whose lineage mentions 'Fungi', and display their mean counts ordered by
# decreasing per-row median.
list_dfs = []

for POOL in POOL_list:
    df_POOL = pd.read_csv(f'../results_diversity/{today}/{POOL}.diversity_cutoff.tsv', sep='\t', index_col='Unnamed: 0')
    df_POOL = df_POOL.reset_index()

    # Same "<index> - <name>" row label as the pooled mean table, so the labels
    # can be used to select rows of df_mean. `assign` builds a new frame, which
    # avoids the SettingWithCopyWarning of writing a column into a slice.
    df_POOL = (
        df_POOL[['index', 'name', 'lineage']]
        .assign(**{'taxon - genus': lambda t: t['index'].astype(str) + ' - ' + t['name']})
        .set_index('taxon - genus')
    )

    list_dfs.append(df_POOL[['lineage']])


# Keep one row per taxon. De-duplicating on the index (the taxon label) rather
# than on the lineage text keeps distinct taxa that happen to share a lineage
# string, and guarantees unique labels for the .loc lookup below.
df = pd.concat(list_dfs)
df = df[~df.index.duplicated(keep='first')]

# Plain substring match; astype(str) makes a missing lineage a non-match
# instead of raising TypeError.
is_fungi = df['lineage'].astype(str).str.contains('Fungi', regex=False)
fungi_idx = df.loc[is_fungi].index


df_mean_fungi = df_mean.loc[fungi_idx]

# NOTE(review): this overwrites the global `cutoff_NA_ratio` set in the
# attributes cell; re-running the pooling cell after this one will silently use
# 0.6. The assignment is kept so any later code relying on it is unaffected;
# consider a dedicated name for the fungi threshold.
cutoff_NA_ratio = 0.6
max_missing_fungi = int(cutoff_NA_ratio * len(POOL_list))
df_mean_fungi_kept = df_mean_fungi[df_mean_fungi.isna().sum(axis=1) < max_missing_fungi]

# Sort by the median of the fungi rows themselves. The previous key,
# df_per_nonNA.median(axis=1), came from the percentage table filtered with a
# stricter NA cut, so it ranked by the wrong variable and gave NaN keys to
# every fungal taxon missing from that table.
df_mean_fungi_kept.assign(m=df_mean_fungi_kept.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)


In [None]:
# Display the NA-filtered table of mean values (one column per pool, rows
# sorted by decreasing per-row median).
df_mean_nonNA

In [None]:
# Display the NA-filtered table of percentage values (one column per pool, rows
# sorted by decreasing per-row median).
df_per_nonNA


In [None]:
# log10 heatmap of the mean table, saved twice: with and without the cell
# values annotated. The first row (highest median) is skipped by `.iloc[1:]`.
# NOTE(review): presumably a dominant entry that would swamp the colour scale - confirm.
log10_mean = np.log10(df_mean_nonNA.iloc[1:, :])

for annotate, png_path in (
    (True, f'../results_diversity/{today}/heatmap_mean_nonNA_annot.png'),
    (False, f'../results_diversity/{today}/heatmap_mean_nonNA.png'),
):
    fig, ax = plt.subplots(1, 1, figsize=(9, 22))
    sns.heatmap(log10_mean, yticklabels=True, annot=annotate, ax=ax)
    ax.set_title('log10 mean counts')
    fig.tight_layout()
    fig.savefig(png_path, dpi=300)


In [None]:
# log10 heatmap of the percentage table, saved twice: with and without the
# cell values annotated. The first row (highest median) is skipped by `.iloc[1:]`.
# NOTE(review): presumably a dominant entry that would swamp the colour scale - confirm.
log10_per = np.log10(df_per_nonNA.iloc[1:, :])

for annotate, png_path in (
    (True, f'../results_diversity/{today}/heatmap_per_nonNA_annot.png'),
    (False, f'../results_diversity/{today}/heatmap_per_nonNA.png'),
):
    fig, ax = plt.subplots(1, 1, figsize=(9, 22))
    sns.heatmap(log10_per, yticklabels=True, annot=annotate, ax=ax)
    ax.set_title('log10 percentage counts')
    fig.tight_layout()
    fig.savefig(png_path, dpi=300)

In [None]:
# Quick default-size preview of the log10 percentage table (first row skipped).
quick_log10_per = np.log10(df_per_nonNA.iloc[1:, :])
sns.heatmap(quick_log10_per)


In [None]:
# Quick default-size preview of the log10 mean table (first row skipped).
quick_log10_mean = np.log10(df_mean_nonNA.iloc[1:, :])
sns.heatmap(quick_log10_mean)