In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from pathlib import Path

# Load the count table. The path is assembled with pathlib so it resolves on
# any operating system; a backslash-separated string is only a nested path on
# Windows and is read as a single literal file name elsewhere.
ruta_counts = Path('..') / 'Data_TFG' / 'desk_deskdefault' / 'tfg_counts_desktop.csv'
df = pd.read_csv(ruta_counts, index_col=0)  # first CSV column becomes the row index
df

In [None]:
# Group label of each column of df = the text before the first '_' in its name
# (assumes the column names are strings).
# The first token is taken directly from the split names, so this also works
# when names contain different numbers of '_'-separated parts.
grupos = df.columns.str.split('_').str[0].to_list()
grupos

In [None]:
# Total of each row of df, summed across all of its columns.
df.sum(axis=1)

In [None]:
# Number of columns carrying each group label; the smallest of those counts is
# used as the filtering threshold.
unicos, cuentas = np.unique(grupos, return_counts=True)
criba = cuentas.min()

# Keep only the rows of df whose total across columns exceeds the threshold,
# then transpose so the former columns of df become the rows.
totales_fila = df.sum(axis=1)
df_criba = df.loc[totales_fila > criba].T
df_criba

Se determinan las OTUs con mayor coeficiente de variación (desviación estándar dividida entre la media).

In [None]:
# Per-column summary of df_criba: mean, standard deviation and their ratio
# (coefficient of variation = std / mean).
media = df_criba.mean(axis=0)
desv = df_criba.std(axis=0)
varotus = pd.DataFrame({
    'mean': media,
    'std': desv,
    'coefvar': desv.div(media),
})
varotus

In [None]:
# Summary statistics of the columns of varotus.
varotus.describe()

In [None]:
# Rank the rows of varotus by coefficient of variation, largest first, and
# draw the 50 highest values as a bar chart.
varotus = varotus.sort_values(by='coefvar', ascending=False)
fig, ax = plt.subplots(figsize=(20, 7))
varotus['coefvar'].head(50).plot.bar(ax=ax)
ax.grid()

In [None]:
# Restrict df_criba to the columns named by the first 50 rows of varotus.
otus_top = varotus.index[:50]
df_criba = df_criba[otus_top]
df_criba

Transformación CLR (*centered log-ratio*)

In [None]:
from scipy.stats.mstats import gmean

# Centered log-ratio: logarithm of every value divided by the geometric mean
# of its row. Zeros are replaced by 1e-9 beforehand so that the logarithm and
# the geometric mean are defined.
# NOTE: df_criba is overwritten with the transformed table, so re-running this
# cell would apply the transform to already-transformed values.
sin_ceros = df_criba.replace(to_replace=0, value=1e-9)
media_geom = sin_ceros.apply(gmean, axis=1)
df_criba = sin_ceros.div(media_geom, axis=0).apply(np.log)  # transformed table
df_criba

In [None]:
# Row sums of df_criba, a blank line, then the summary statistics of those sums.
sumas_fila = df_criba.sum(axis=1)
print(sumas_fila, '', sumas_fila.describe(), sep='\n')

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler (default settings) on df_criba and apply it to the same
# table; the fitted scaler stays available as `escala`.
escala = StandardScaler()
dfesc = escala.fit(df_criba).transform(df_criba)

## *Clustering* no supervisado mediante *KMeans*

In [None]:
from sklearn.cluster import KMeans

# Two-cluster k-means on the scaled matrix, with a fixed random_state.
# fit() returns the estimator itself, so it can be chained onto the constructor.
kmeans = KMeans(n_clusters=2, random_state=27).fit(dfesc)
kmeans

In [None]:
# Side-by-side table: group label from the column names vs. k-means cluster id.
comparacion = {'actual_group': grupos, 'predicted_group': kmeans.labels_}
pd.DataFrame(comparacion)

In [None]:
# Contingency table: group labels as rows, k-means cluster ids as columns.
pd.crosstab(index=grupos, columns=kmeans.labels_)

## Análisis de componentes principales (PCA)

In [None]:
from sklearn.decomposition import PCA
# Fit a 7-component PCA on the scaled matrix; `desc` holds the projected
# coordinates (one column per component) and `mipca` the fitted model.
mipca = PCA(n_components=7)
desc = mipca.fit_transform(dfesc)

In [None]:
# Bar chart of the share of variance captured by each principal component.
# explained_variance_ratio_ holds fractions, so it is scaled by 100 to agree
# with the "(%)" axis label. The tick labels are derived from the number of
# fitted components, so the plot keeps working if n_components changes.
porc_varianza = 100 * mipca.explained_variance_ratio_
plt.bar(x=[f'PC{i}' for i in range(1, len(porc_varianza) + 1)],
        height=porc_varianza)
plt.ylabel('varianza explicada (%)')

In [None]:
import seaborn as sns

# First vs. second PCA coordinate of every row, coloured by group label.
ax = sns.scatterplot(x=desc[:, 0], y=desc[:, 1], hue=grupos)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Análisis de componentes principales')
ax.legend()
ax.grid()

In [None]:
import seaborn as sns

# First vs. third PCA coordinate of every row, coloured by group label.
ax = sns.scatterplot(x=desc[:, 0], y=desc[:, 2], hue=grupos)
ax.set_xlabel('PC1')
ax.set_ylabel('PC3')
ax.set_title('Análisis de componentes principales')
ax.legend()
ax.grid()

In [None]:
import seaborn as sns

# Second vs. third PCA coordinate of every row, coloured by group label.
ax = sns.scatterplot(x=desc[:, 1], y=desc[:, 2], hue=grupos)
ax.set_xlabel('PC2')
ax.set_ylabel('PC3')
ax.set_title('Análisis de componentes principales')
ax.legend()
ax.grid()

## *t-distributed Stochastic Neighbor Embedding* (tSNE)

In [None]:
from sklearn.manifold import TSNE
# 2-D t-SNE embedding of the scaled matrix with a fixed random_state and
# perplexity=10; `esp2d` holds the two embedding coordinates per row.
tsne = TSNE(n_components=2, random_state=42, perplexity=10)
esp2d = tsne.fit_transform(dfesc)

In [None]:
# The two t-SNE coordinates of every row, coloured by group label.
ax = sns.scatterplot(x=esp2d[:, 0], y=esp2d[:, 1], hue=grupos)
ax.set_title('TSNE')
ax.legend()
ax.grid()

## *Gaussian Mixture Models*

In [None]:
from sklearn.mixture import GaussianMixture

# Two-component Gaussian mixture on the scaled matrix, with a fixed random_state.
# fit() returns the estimator itself, so it can be chained onto the constructor.
migmm = GaussianMixture(n_components=2, random_state=42).fit(dfesc)
migmm

In [None]:
# Contingency table: group labels as rows, mixture component assigned to each
# row of dfesc as columns.
pd.crosstab(index=grupos, columns=migmm.predict(dfesc))

# Diversidad beta

In [None]:
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances between the rows of dfesc, as a square matrix
# (kept in the original row order).
distmat = squareform(pdist(dfesc, metric='euclidean'))

# For display, rows and columns are ordered by group label so that equal labels
# sit next to each other. The matrix and the tick labels are permuted with the
# same stable ordering, so each tick stays attached to the row it describes.
orden = np.argsort(grupos, kind='stable')
etiquetas = np.asarray(grupos)[orden]
distmat_ord = distmat[np.ix_(orden, orden)]

plt.figure(figsize=(12, 8))
sns.heatmap(distmat_ord, cmap='viridis', xticklabels=etiquetas, yticklabels=etiquetas);