In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime


In [None]:
# Render floats with three decimals whenever pandas displays them.
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# VARS -- paths and file names, all relative to the notebook's working directory.

# Text file read line by line to build POOL_list.
pools_file = '../data/EM_EVPools/samples_profiling.txt'

# Base directory of the diversity tables: per-pool tables are read from, and the combined
# tables and figures are written to, a dated subfolder of this directory.
dir_diversity_output = '../results_diversity'

# Not referenced by the cells visible in this part of the notebook.
dir_reads_fastq = '../data/control_sample/'
fastq_basename = 'POOL.fastq.gz'
dir_results_profiling = '../results_profiling'

In [None]:
# ISO date (YYYY-MM-DD) naming the dated subfolder of dir_diversity_output.
# NOTE: later cells also READ the per-pool tables from this dated folder, so those
# tables must already exist under today's date.
today = f'{datetime.today():%Y-%m-%d}'
os.makedirs(f'{dir_diversity_output}/{today}', exist_ok=True)

In [None]:
# GENERAL VARIABLES
# Pool names: one per non-blank line of `pools_file`, followed by two extra entries.
# The file is read with plain Python rather than `!cat`: the shell escape never raises,
# so a missing file would go unnoticed until a later cell failed on a bogus pool name.
# open() raises FileNotFoundError right here instead, and blank lines are skipped.
with open(pools_file) as pools_handle:
    POOL_list = [line.strip() for line in pools_handle if line.strip()]
POOL_list += ['ACIDOLA', 'BLACTIS']


In [None]:
# Attributes
# NA threshold used when filtering taxa: a taxon is kept only if the number of pools in which
# its cutoff mean is NA is strictly below int(cutoff_NA_ratio * len(POOL_list)).
cutoff_NA_ratio = 0.35

In [None]:
# Build the pooled tables. Mean counts and mean percentages are kept in separate frames so
# that each measure can be inspected on its own.
#
# The tables that are displayed use the values of the raw per-pool tables, not the cutoff ones.
# The row filtering, however, is based on the cutoff tables: species with discordant read values
# are discarded in many pools but not in all of them, so in a raw-based heatmap they would appear
# on top even though they should be dropped for being absent from many datasets.
#
# NOTE: the per-pool tables are read from the dated folder `{dir_diversity_output}/{today}`,
# so they must exist under today's date.


def _load_pool_table(pool, kind):
    """Read one per-pool diversity table and index it by 'taxon - genus'.

    Parameters
    ----------
    pool : str
        Pool name, used in the file name and as suffix of the value columns.
    kind : str
        'cutoff' or 'raw'; selects the file `{pool}.diversity_{kind}.tsv`.

    Returns
    -------
    pandas.DataFrame
        Columns 'index', 'name', 'mean (%) {pool}' and 'mean {pool}', indexed by
        the string '<index> - <name>'.
    """
    table = pd.read_csv(f'{dir_diversity_output}/{today}/{pool}.diversity_{kind}.tsv', sep='\t', index_col='Unnamed: 0')
    table = table.reset_index()
    table = table[['index', 'name', 'mean (%)', 'mean']].rename(columns={'mean (%)': f'mean (%) {pool}', 'mean': f'mean {pool}'})
    table['taxon - genus'] = table['index'].astype(str) + ' - ' + table['name']
    return table.set_index('taxon - genus')


def _sort_by_row_median(frame):
    """Return `frame` with rows ordered by descending row median.

    The median is used because the mean is skewed by species that are very abundant in only a
    few samples. A stable sort keeps tied rows in their incoming order, so the result is
    reproducible from run to run.
    """
    return (
        frame.assign(m=frame.median(axis=1))
        .sort_values('m', ascending=False, kind='stable')
        .drop('m', axis=1)
    )


list_dfs_means_raw, list_dfs_per_raw = [], []
list_dfs_means_cutoff, list_dfs_per_raw_cutoff = [], []

list_selected_index = []


for POOL in POOL_list:
    df_POOL_cutoff = _load_pool_table(POOL, 'cutoff')
    df_POOL_raw = _load_pool_table(POOL, 'raw')

    list_dfs_means_raw.append(df_POOL_raw[f'mean {POOL}'])
    list_dfs_per_raw.append(df_POOL_raw[f'mean (%) {POOL}'])

    list_dfs_means_cutoff.append(df_POOL_cutoff[f'mean {POOL}'])
    list_dfs_per_raw_cutoff.append(df_POOL_cutoff[f'mean (%) {POOL}'])

    list_selected_index += df_POOL_cutoff.index.tolist()


# Taxa present in at least one cutoff table. dict.fromkeys de-duplicates while keeping
# first-seen order; list(set(...)) would give an order that depends on string hash
# randomisation and therefore changes between kernel restarts.
selected_index = list(dict.fromkeys(list_selected_index))

df_mean_raw = pd.concat(list_dfs_means_raw, axis=1).loc[selected_index]
df_per_raw = pd.concat(list_dfs_per_raw, axis=1).loc[selected_index]

df_mean_cutoff = pd.concat(list_dfs_means_cutoff, axis=1).loc[selected_index]
df_per_cutoff = pd.concat(list_dfs_per_raw_cutoff, axis=1).loc[selected_index]


# NA cut: keep only taxa whose number of NA pools (in the cutoff means) is below the threshold
nonNA_index = df_mean_cutoff[df_mean_cutoff.isna().sum(axis=1) < int(cutoff_NA_ratio * len(POOL_list))].index

# Cutoff tables restricted to the kept taxa, ordered by row median
df_mean_cutoff_nonNA = _sort_by_row_median(df_mean_cutoff.loc[nonNA_index])
df_per_cutoff_nonNA = _sort_by_row_median(df_per_cutoff.loc[nonNA_index])

# Raw values, restricted to the kept taxa and in the row order of the matching cutoff table.
# The mean and percentage tables are therefore NOT necessarily in the same row order.
df_mean_raw_cutoffindex_nonNA = df_mean_raw.loc[df_mean_cutoff_nonNA.index.values]
df_per_raw_cutoffindex_nonNA = df_per_raw.loc[df_per_cutoff_nonNA.index.values]

# Raw values for every selected taxon, ordered by row median.
# Despite the "_nonNA" suffix, no NA filter is applied to these two tables.
df_mean_raw_nonNA = _sort_by_row_median(df_mean_raw)
df_per_raw_nonNA = _sort_by_row_median(df_per_raw)

# Persist every table next to the per-pool inputs
df_mean_cutoff_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_cutoff_nonNA.tsv', sep='\t')
df_per_cutoff_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_cutoff_nonNA.tsv', sep='\t')
df_mean_raw_cutoffindex_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_raw_cutoffindex_nonNA.tsv', sep='\t')
df_per_raw_cutoffindex_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_raw_cutoffindex_nonNA.tsv', sep='\t')
df_mean_raw_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_raw_nonNA.tsv', sep='\t')
df_per_raw_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_raw_nonNA.tsv', sep='\t')

In [None]:
# Raw mean counts for every selected taxon, rows ordered by descending row median (no NA filter applied).
df_mean_raw_nonNA

In [None]:
# Raw mean counts restricted to the taxa, and in the row order, of df_mean_cutoff_nonNA.
df_mean_raw_cutoffindex_nonNA

In [None]:
# Cutoff mean counts for the taxa that pass the NA filter, rows ordered by descending row median.
df_mean_cutoff_nonNA

In [None]:
# Side-by-side check of the three mean-count tables: one log10 heatmap per table.
# NOTE(review): iloc[1:N] starts at the second row, so the top-ranked taxon is not drawn
# and N - 1 rows are shown -- confirm that skipping the first row is intentional.
N = 25
for counts_table in (df_mean_raw_nonNA, df_mean_raw_cutoffindex_nonNA, df_mean_cutoff_nonNA):
    fig, ax = plt.subplots(1, 1, figsize=(9, 7))
    sns.heatmap(
        np.log10(counts_table.iloc[1:N, :]),
        yticklabels=True,
        annot=True,
        cmap='Blues',
        ax=ax,
    )
    ax.set_title('log10 mean counts')
    fig.tight_layout()

In [None]:
# log10 heatmap of the raw mean counts (cutoff-selected taxa), saved twice:
# once with the cell values annotated and once without.
# NOTE(review): iloc[1:N] starts at the second row, so the top-ranked taxon is not drawn
# -- confirm that skipping the first row is intentional.
N = 100

heatmap_variants = (
    (True, f'{dir_diversity_output}/{today}/heatmap_mean_nonNA_annot.png'),
    (False, f'{dir_diversity_output}/{today}/heatmap_mean_nonNA.png'),
)
for show_values, png_path in heatmap_variants:
    fig, ax = plt.subplots(1, 1, figsize=(9, 22))
    sns.heatmap(
        np.log10(df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]),
        yticklabels=True,
        annot=show_values,
        cmap='Blues',
        ax=ax,
    )
    ax.set_title('log10 mean counts')
    fig.tight_layout()
    fig.savefig(png_path, dpi=300)


In [None]:
# log10 heatmap of the raw mean percentages (cutoff-selected taxa), saved twice:
# once with the cell values annotated and once without.
# NOTE(review): iloc[1:N] starts at the second row, so the top-ranked taxon is not drawn
# -- confirm that skipping the first row is intentional.
N = 100

heatmap_variants = (
    (True, f'{dir_diversity_output}/{today}/heatmap_per_nonNA_annot.png'),
    (False, f'{dir_diversity_output}/{today}/heatmap_per_nonNA.png'),
)
for show_values, png_path in heatmap_variants:
    fig, ax = plt.subplots(1, 1, figsize=(9, 22))
    sns.heatmap(
        np.log10(df_per_raw_cutoffindex_nonNA.iloc[1:N, :]),
        yticklabels=True,
        annot=show_values,
        cmap='Blues',
        ax=ax,
    )
    ax.set_title('log10 percentage counts')
    fig.tight_layout()
    fig.savefig(png_path, dpi=300)

In [None]:
## Quick two-sample tests: Mann-Whitney U (Wilcoxon rank-sum) and independent t-test
from scipy.stats import mannwhitneyu, ttest_ind

In [None]:
# Preview positional columns 8-11, the reference group used by the comparisons below.
# (The previous name `df_mean_nonNA` is never defined in this notebook and raised NameError
# on a fresh kernel; this is the frame the tests below actually read.)
df_mean_raw_cutoffindex_nonNA.iloc[:, 8:12]

In [None]:
# RR vs HC: positional columns 0-3 (RR) against positional columns 8-11 (HC), on raw mean counts.
# NOTE(review): group membership is purely positional and relies on the column order that
# POOL_list produces -- confirm that order before trusting the group labels.

list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []

for row in range(len(df_mean_raw_cutoffindex_nonNA)):
    condition_vals = df_mean_raw_cutoffindex_nonNA.iloc[row, :4].values
    reference_vals = df_mean_raw_cutoffindex_nonNA.iloc[row, 8:12].values

    res = mannwhitneyu(condition_vals, reference_vals)
    list_pvals_mannwhitney.append(res.pvalue)

    res = ttest_ind(condition_vals, reference_vals)
    list_pvals_ttest.append(res.pvalue)

    # log2 fold change of the group means (condition over reference)
    L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))


# .copy() so the new columns go into an independent frame rather than a selection of the
# source table (avoids SettingWithCopyWarning).
df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,2,3,8,9,10,11]].copy()
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney


df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])

df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]

# sns.heatmap raises on an empty frame, so only plot when at least one taxon is significant.
if df_pval_pos.empty:
    print('NO SIGNIFICANT SAMPLES')
else:
    fig, ax = plt.subplots(1, 1, figsize=(9, 2))
    # The last three columns are log2FC and the two p-values; the rest are mean counts.
    sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues', ax=ax)
    ax.set_title('log10 mean counts')
    fig.tight_layout()




In [None]:
# SP vs HC: positional columns 4-7 (SP) against positional columns 8-11 (HC).
# The tests and the log2 fold change use the mean percentages; the table and the heatmap
# show the mean counts of the same taxa.
# NOTE(review): group membership is purely positional and relies on the column order that
# POOL_list produces -- confirm that order before trusting the group labels.

# The percentage and mean tables hold the same taxa, but each is sorted by its own row median.
# Re-order the percentage table by label so that row i is the same taxon in both; otherwise
# the p-values would be attached to the wrong rows of df_pval.
per_aligned = df_per_raw_cutoffindex_nonNA.loc[df_mean_raw_cutoffindex_nonNA.index]

list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []

for row in range(len(per_aligned)):
    condition_vals = per_aligned.iloc[row, 4:8].values
    reference_vals = per_aligned.iloc[row, 8:12].values

    res = mannwhitneyu(condition_vals, reference_vals)
    list_pvals_mannwhitney.append(res.pvalue)

    res = ttest_ind(condition_vals, reference_vals)
    list_pvals_ttest.append(res.pvalue)

    # log2 fold change of the group means (condition over reference)
    L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))


# .copy() so the new columns go into an independent frame rather than a selection of the
# source table (avoids SettingWithCopyWarning).
df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [4,5,6,7,8,9,10,11]].copy()
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney


df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])

df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]

# sns.heatmap raises on an empty frame, so only plot when at least one taxon is significant.
if df_pval_pos.empty:
    print('NO SIGNIFICANT SAMPLES')
else:
    fig, ax = plt.subplots(1, 1, figsize=(9, 3))
    # The last three columns are log2FC and the two p-values; the rest are mean counts.
    sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues', ax=ax)
    ax.set_title('log10 mean counts')
    fig.tight_layout()

In [None]:
# RR vs SP: positional columns 0-3 (RR) against positional columns 4-7 (SP).
# The tests and the log2 fold change use the mean percentages; the table and the heatmap
# show the mean counts of the same taxa.
# NOTE(review): group membership is purely positional and relies on the column order that
# POOL_list produces -- confirm that order before trusting the group labels.

# The percentage and mean tables hold the same taxa, but each is sorted by its own row median.
# Re-order the percentage table by label so that row i is the same taxon in both; otherwise
# the p-values would be attached to the wrong rows of df_pval.
per_aligned = df_per_raw_cutoffindex_nonNA.loc[df_mean_raw_cutoffindex_nonNA.index]

list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []

for row in range(len(per_aligned)):
    condition_vals = per_aligned.iloc[row, :4].values
    reference_vals = per_aligned.iloc[row, 4:8].values

    res = mannwhitneyu(condition_vals, reference_vals)
    list_pvals_mannwhitney.append(res.pvalue)

    res = ttest_ind(condition_vals, reference_vals)
    list_pvals_ttest.append(res.pvalue)

    # log2 fold change of the group means (condition over reference)
    L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))


# .copy() so the new columns go into an independent frame rather than a selection of the
# source table (avoids SettingWithCopyWarning).
df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,2,3,4,5,6,7]].copy()
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney


df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])

df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]

# sns.heatmap raises on an empty frame, so only plot when at least one taxon is significant.
if df_pval_pos.empty:
    print('NO SIGNIFICANT SAMPLES')
else:
    fig, ax = plt.subplots(1, 1, figsize=(9, 5))
    # The last three columns are log2FC and the two p-values; the rest are mean counts.
    sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues', ax=ax)
    ax.set_title('log10 mean counts')
    fig.tight_layout()


In [None]:
# SEX: female columns [2,3, 6,7, 10,11] against male columns [0,1, 4,5, 8,9] (positional).
# The tests and the log2 fold change use the mean percentages; the table and the heatmap
# show the mean counts of the same taxa (male columns first, then female columns).
# NOTE(review): group membership is purely positional and relies on the column order that
# POOL_list produces -- confirm that order before trusting the group labels.

# The percentage and mean tables hold the same taxa, but each is sorted by its own row median.
# Re-order the percentage table by label so that row i is the same taxon in both; otherwise
# the p-values would be attached to the wrong rows of df_pval.
per_aligned = df_per_raw_cutoffindex_nonNA.loc[df_mean_raw_cutoffindex_nonNA.index]

list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []

for row in range(len(per_aligned)):
    condition_vals = per_aligned.iloc[row, [2,3, 6,7, 10,11]].values  #FEMALE
    reference_vals = per_aligned.iloc[row, [0,1, 4,5, 8,9]].values  #MALE

    res = mannwhitneyu(condition_vals, reference_vals)
    list_pvals_mannwhitney.append(res.pvalue)

    res = ttest_ind(condition_vals, reference_vals)
    list_pvals_ttest.append(res.pvalue)

    # log2 fold change of the group means (female over male)
    L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))


# .copy() so the new columns go into an independent frame rather than a selection of the
# source table (avoids SettingWithCopyWarning).
df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,4,5,8,9,2,3,6,7,10,11]].copy()
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney


df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])

df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]

# Explicit empty check instead of a bare `except:`: the old handler hid every error
# (not only "nothing significant") and left an empty figure behind.
if df_pval_pos.empty:
    print('NO SIGNIFICANT SAMPLES')
else:
    fig, ax = plt.subplots(1, 1, figsize=(9, 5))
    # The last three columns are log2FC and the two p-values; the rest are mean counts.
    sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues', ax=ax)
    ax.set_title('log10 mean counts')
    fig.tight_layout()
