# Analysis of in silico samples

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from scipy.stats import mannwhitneyu



In [None]:
# Render floats with three decimal places whenever pandas displays them.
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# VARS
# Identifiers that the input paths below are built from.
project_name = 'EM_EVPools'
date = "2023-12-05"

# Excel workbook holding the 16S analysis for this project.
file_16S = f"../data/{project_name}/16_S_datasheet/16S_analysis.xlsx"
# Tab-separated table taken from the diversity results folder of that date.
file_EVs = f"../results_diversity/{date}/df_mean_cutoff_nonNA.tsv"

In [None]:
# Load the 16S sheet with its first column as the row index and discard the
# first data row.
df_16S = pd.read_excel(file_16S, index_col=0).iloc[1:]

# Build two-level column labels: level 0 is the value found in the row that is
# now first (the labels later cells select on, e.g. 'RR' / 'HC'), level 1 is
# the original column header.
group_row = df_16S.iloc[0, :].values
header_names = df_16S.columns.values
columns = list(zip(group_row, header_names))
df_16S.columns = pd.MultiIndex.from_tuples(columns)

# The label row has been folded into the header; drop it and cast the
# remaining cells to float.
df_16S = df_16S.iloc[1:].astype(float)
df_16S

In [None]:
# Read the tab-separated table at file_EVs, using its first column as the index.
df_EVs = pd.read_table(file_EVs, index_col=0)

df_EVs

In [None]:
# Take the second ' - '-separated field of every index label except the first.
names_genres = [label.split(' - ')[1] for label in df_EVs.index[1:]]

In [None]:
# Show, for every extracted name, the 16S rows whose index label contains it.
for name in names_genres:
    display(name)
    row_mask = [name in label for label in df_16S.index]
    df_name = df_16S.loc[row_mask]
    display(df_name)


In [None]:
# Mean over all rows for each column under the 'RR' top-level label.
df_16S['RR'].astype(float).mean(axis=0)

In [None]:
# Count how many extracted names match at least one 16S row (substring match
# against the index labels). Later cells use N to size the subplot grid.
N = 0
for name in names_genres:
    row_mask = [name in label for label in df_16S.index]
    df_name = df_16S.loc[row_mask]
    if len(df_name):
        N += 1

In [None]:
# One boxplot panel per name that matches at least one 16S row. N (counted in
# an earlier cell) sizes the grid: 2 rows by N // 2 + 1 columns, which always
# leaves at least one spare panel.
n_cols = N // 2 + 1
fig, axs = plt.subplots(2, n_cols, figsize=(3 * n_cols, 3 * 2))
panels = axs.ravel()
panel_idx = 0  # next free panel; kept distinct from the comprehension variable

for name in names_genres:
    # Substring match: every 16S row whose index label contains the name.
    df_name = df_16S.loc[[name in label for label in df_16S.index]]

    if len(df_name) == 0:
        continue

    display(name)

    # NOTE: the printed mean/std describe only the FIRST matching row, whereas
    # the test and the boxplot below use per-column means over ALL matching
    # rows.
    first_rr = df_name.loc[:, 'RR'].iloc[0, :]
    first_hc = df_name.loc[:, 'HC'].iloc[0, :]
    print('RR', first_rr.mean(), first_rr.std())
    print('HC', first_hc.mean(), first_hc.std())

    # Per-column mean over the matching rows: one value per column in each
    # group. Computed once and reused for both the test and the plot.
    hc_means = df_name.loc[:, 'HC'].mean().values
    rr_means = df_name.loc[:, 'RR'].mean().values

    # Only the p-value is used; the U statistic is discarded.
    _, pval = mannwhitneyu(x=hc_means, y=rr_means)

    print('pval', pval)

    ax = panels[panel_idx]
    sns.boxplot([hc_means, rr_means], ax=ax)
    ax.set_xticklabels(['HC', 'RR'])
    ax.set_title(f"{name} (p={pval:.2f})")
    panel_idx += 1

# Hide the panels that received no plot so the figure shows no empty axes.
for spare in panels[panel_idx:]:
    spare.set_visible(False)

plt.tight_layout()

In [None]:
# One boxplot panel per name that matches at least one 16S row. N (counted in
# an earlier cell) sizes the grid: 2 rows by N // 2 + 1 columns, which always
# leaves at least one spare panel.
n_cols = N // 2 + 1
fig, axs = plt.subplots(2, n_cols, figsize=(3 * n_cols, 3 * 2))
panels = axs.ravel()
panel_idx = 0  # next free panel; kept distinct from the comprehension variable

for name in names_genres:
    # Substring match: every 16S row whose index label contains the name.
    df_name = df_16S.loc[[name in label for label in df_16S.index]]

    if len(df_name) == 0:
        continue

    display(name)

    # Per-column mean over the matching rows: one value per column in each
    # group. The same values feed the printed summary, the test and the plot.
    hc_means = df_name.loc[:, 'HC'].mean()
    rr_means = df_name.loc[:, 'RR'].mean()

    print('RR', rr_means.mean(), rr_means.std())
    print('HC', hc_means.mean(), hc_means.std())

    # Only the p-value is used; the U statistic is discarded.
    _, pval = mannwhitneyu(x=hc_means.values, y=rr_means.values)

    print('pval', pval)

    ax = panels[panel_idx]
    sns.boxplot([hc_means.values, rr_means.values], ax=ax)
    ax.set_xticklabels(['HC', 'RR'])
    ax.set_title(f"{name} (p={pval:.2f})")
    panel_idx += 1

# Hide the panels that received no plot so the figure shows no empty axes.
for spare in panels[panel_idx:]:
    spare.set_visible(False)

plt.tight_layout()