# z-score the results

In [4]:
from pandas import DataFrame, read_csv, to_numeric
from scipy.stats import zscore, shapiro
from json import dump
from os import path
from re import sub

def z_scored(df_path):
    """Load an interaction table from CSV, test normality, and return its z-scores.

    The table layout is inferred from substrings of ``df_path``:

    * ``"Sample"`` -- the index column is ``"Sample"`` when the path also
      contains ``"prob"`` and ``"Interval"`` otherwise; the row labelled
      ``"Name"`` is dropped, every column is coerced to numeric, and the
      ``"-ABX"`` suffix is stripped from the index labels.
    * ``"ASV"`` -- the index column is ``"ASV"``; the ``"Name"`` row is
      dropped and the columns coerced only when the path contains ``"prob"``.

    Side effects: prints ``df_path`` and writes the per-row Shapiro-Wilk
    results to ``<file name without .csv>_normalcies.json`` in the current
    working directory.

    NOTE(review): ``apply(zscore, axis="rows")`` hands each *column* to
    ``zscore``, whereas the Shapiro-Wilk test above it is run per *row* --
    confirm which orientation is intended.

    Returns:
        DataFrame of z-scores with the same index and columns as the
        processed table.
    """
    print(df_path)
    baseName = path.basename(df_path).replace(".csv", "")
    df = read_csv(df_path).fillna(0)

    if "Sample" in df_path:
        indexName = "Sample" if "prob" in df_path else "Interval"
        df = df.set_index(indexName)
        df = df.drop("Name", axis="rows")
        df = df.apply(to_numeric, errors='coerce')
        # replacing the index with a plain list loses its name, so restore it
        df.index = [label.replace("-ABX", "") for label in df.index]
        df.index.name = indexName
    elif "ASV" in df_path:
        df = df.set_index("ASV")
        if "prob" in df_path:
            df = df.drop("Name", axis="rows")
            df = df.apply(to_numeric, errors='coerce')

    # run the normality test first to assess the validity of a z-score
    normalcy_test_dic = {label: shapiro(row.values) for label, row in df.iterrows()}
    with open(f"{baseName}_normalcies.json", 'w') as jsonOut:
        dump(normalcy_test_dic, jsonOut, indent=3)

    z_scores = df.apply(zscore, axis="rows")
    return DataFrame(z_scores, columns=df.columns, index=df.index)


# generate the sample z-Score matrices
from glob import glob
for sample_table in glob("nboutput/Sample*probInteractions.csv"):
    # z-score each sample table and save it next to the input with a "_zScore" suffix
    zScore_sample = z_scored(sample_table)
    zScore_sample.to_csv(
        path_or_buf=sample_table.replace("probInteractions", "probInteractions_zScore")
    )

# zScore_samples = z_scored("SampleIntervalMetaboliteInteractions.csv")
# zScore_ASVs = z_scored("ASVMetaboliteInteractions.csv")

nboutput/Sampleuptake_probInteractions.csv
nboutput/Sampleexcretion_probInteractions.csv
nboutput/Samplegrowth_probInteractions.csv


# Visualizing the z-scored SMIPPs as t-SNE plots

## defining the color code bins based on methane flux

In [4]:
from pandas import read_excel

# Load the sample metadata workbook, show it, and save a CSV copy for the later cells.
data = read_excel(io="data/Cliff_Sample_Metadata_BGC_NMR.xlsx")
display(data)
data.to_csv(path_or_buf="data/Cliff_Sample_Metadata_BGC_NMR.csv")

Unnamed: 0,New_index,Sample_name,Sample,Site,Core,Depth,Restoration,Hydrol,Date,Lat,...,Succinate,Sucrose,Thymidine,Trehalose,Trimethylamine,Tryptophan,Tyrosine,Uracil,Uridine,Valine
0,1,Historic_R2A_A_D1,R2A_A_D1,R2A,A,D1,Reference,Tidal marsh,2014-07-11,37.496219,...,11.4,11.0,0.0,20.6,0.8,0.0,2.2,10.7,4.0,70.6
1,2,Historic_R2A_A_D2,R2A_A_D2,R2A,A,D2,Reference,Tidal marsh,2014-07-11,37.496219,...,2.0,8.8,0.0,12.1,0.2,0.0,0.0,0.0,0.0,6.6
2,3,Historic_R2A_B_D1,R2A_B_D1,R2A,B,D1,Reference,Tidal marsh,2014-07-11,37.496044,...,8.6,287.8,5.3,17.4,1.0,0.0,1.6,3.5,6.2,61.6
3,4,Historic_R2A_B_D2,R2A_B_D2,R2A,B,D2,Reference,Tidal marsh,2014-07-11,37.496044,...,1.0,454.6,0.0,22.0,0.0,0.0,0.0,0.0,0.0,8.1
4,5,Historic_R2A_C_D1,R2A_C_D1,R2A,C,D1,Reference,Tidal marsh,2014-07-11,37.496546,...,11.0,3219.9,0.0,51.7,0.0,0.0,0.0,0.0,0.0,41.1
5,6,Historic_R2A_C_D2,R2A_C_D2,R2A,C,D2,Reference,Tidal marsh,2014-07-11,37.496546,...,2.0,27.7,0.0,24.5,0.1,0.0,0.0,0.0,0.7,4.1
6,7,restored_SF2_A_D1,SF2_A_D1,SF2,A,D1,Restored,Managed pond,2014-07-17,37.497143,...,46.1,77.4,36.6,16.1,0.8,7.8,44.6,13.0,13.3,86.9
7,8,restored_SF2_A_D2,SF2_A_D2,SF2,A,D2,Restored,Managed pond,2014-07-17,37.497143,...,,,,,,,,,,
8,9,restored_SF2_B_D1,SF2_B_D1,SF2,B,D1,Restored,Managed pond,2014-07-17,37.497202,...,,,,,,,,,,
9,10,restored_SF2_B_D2,SF2_B_D2,SF2,B,D2,Restored,Managed pond,2014-07-17,37.497202,...,,,,,,,,,,


In [4]:
from sklearn.preprocessing import KBinsDiscretizer
from collections import Counter
from pandas import read_csv
from numpy import array

# Bin the methane fluxes into four k-means clusters and give every sample the
# color of its bin.
data = read_csv("data/Cliff_Sample_Metadata_BGC_NMR.csv")
methane = array(data["CH4_umol_m2_d"].to_list()).reshape(-1, 1)
binning = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
methaneBins = binning.fit_transform(methane).reshape(-1)

# color of each ordinal bin code; defined before its first use so the cell
# also runs on a fresh kernel
colorMap = {
    3.0: "purple",
    2.0: "blue",
    1.0: "green",
    0.0: "red"
}

# {flux: bin code}, ordered from the highest bin code to the lowest
orderedBins = dict(sorted(dict(zip(methane.reshape(-1), methaneBins)).items(),
                          key=lambda item: item[1], reverse=True))
display(orderedBins)

# {bin code: [fluxes that fall in that bin]}
reversedOrderedBins = {}
for flux, code in orderedBins.items():
    reversedOrderedBins.setdefault(code, []).append(flux)

print()
display(reversedOrderedBins)

# {"<min flux> to <max flux>": color}, used as the legend of the t-SNE plots
legendMap = {f"{min(fluxes)} to {max(fluxes)}": colorMap[code] for code, fluxes in reversedOrderedBins.items()}
print()
display(legendMap)

# {flux: color} and {sample name: color}
colorsList = {flux: colorMap[code] for flux, code in orderedBins.items()}
ch4_to_sample = {}
for index, row in data.iterrows():
    ch4_to_sample[row["Sample"]] = colorsList[row["CH4_umol_m2_d"]]

print()
display(ch4_to_sample)
print()
display(colorsList)

{1607.09: 3.0,
 1588.57: 3.0,
 1347.13: 2.0,
 1327.18: 2.0,
 737.32: 1.0,
 490.47: 1.0,
 -24.3: 0.0,
 -1.76: 0.0,
 -7.1: 0.0,
 4.43: 0.0,
 41.45: 0.0,
 10.5: 0.0}




{3.0: [1607.09, 1588.57],
 2.0: [1347.13, 1327.18],
 1.0: [737.32, 490.47],
 0.0: [-24.3, -1.76, -7.1, 4.43, 41.45, 10.5]}




{'1588.57 to 1607.09': 'purple',
 '1327.18 to 1347.13': 'blue',
 '490.47 to 737.32': 'green',
 '-24.3 to 41.45': 'red'}




{'R2A_A_D1': 'red',
 'R2A_A_D2': 'red',
 'R2A_B_D1': 'red',
 'R2A_B_D2': 'red',
 'R2A_C_D1': 'red',
 'R2A_C_D2': 'red',
 'SF2_A_D1': 'red',
 'SF2_A_D2': 'red',
 'SF2_B_D1': 'red',
 'SF2_B_D2': 'red',
 'SF2_C_D1': 'red',
 'SF2_C_D2': 'red',
 'R1_A_D1': 'green',
 'R1_A_D2': 'green',
 'R1_B_D1': 'blue',
 'R1_B_D2': 'blue',
 'R1_C_D1': 'green',
 'R1_C_D2': 'green',
 'R2_A_D1': 'blue',
 'R2_A_D2': 'blue',
 'R2_B_D1': 'purple',
 'R2_B_D2': 'purple',
 'R2_C_D1': 'purple',
 'R2_C_D2': 'purple'}




{1607.09: 'purple',
 1588.57: 'purple',
 1347.13: 'blue',
 1327.18: 'blue',
 737.32: 'green',
 490.47: 'green',
 -24.3: 'red',
 -1.76: 'red',
 -7.1: 'red',
 4.43: 'red',
 41.45: 'red',
 10.5: 'red'}

## producing the t-SNE plots

In [None]:
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot, colors, cm
from pandas import read_csv, Series
from adjustText import adjust_text
from sklearn.manifold import TSNE
from json import load, dump
from glob import glob
from re import sub

# Function to create and plot t-SNE
# Maybe make the above ASV matrix binary: 1 or 0
def create_tsne_plot(data_path, labels=None, title=None, export_name=None, taxonomical_level="order", perplexity=None,
                     titleFont=30, axesFont=20, annotationSize=12, nodeSize=30, legendSize=8, markers=None, nodeColors=None,
                     colorScheme="turbo", colorsHash=None, legendMap=None):
    """Embed a CSV table in two dimensions with t-SNE, draw an annotated scatter plot and save it.

    The table type is inferred from substrings of ``data_path``:

    * ``"Fluxes"`` -- indexed by ``"name"``; the ``"equation"`` and ``"id"``
      columns are dropped, each column name is split on ``"_"`` into
      (asv, diet, day), and the table is transposed. The group label of a
      column is ``taxonomy[asv][taxonomical_level]``, where ``taxonomy`` is
      looked up as a global that is not defined in this function.
    * ``"Sample"`` -- indexed by ``"Sample"`` (or ``"Interval"`` when the path
      contains ``"Interval"``); the ``"Name"`` row is dropped unless the path
      contains ``"zScore"``. Rows are labelled ``"Restored"`` (index contains
      ``"SF"``), ``"Reference"`` (contains ``"R2A"``) or ``"Unrestored"``.

    Args:
        data_path: path of the CSV file to embed.
        labels: ignored; the group labels are always derived from the table.
        title: figure title; defaults to ``"t-SNE of <base name>"``.
        export_name: output path handed to ``pyplot.savefig``; defaults to
            ``"<base name>_tSNE"``.
        taxonomical_level: key used in the ``taxonomy`` lookup for flux tables.
        perplexity: t-SNE perplexity; a falsy value selects 5 for flux and
            interval tables and 15 for the other sample tables.
        titleFont, axesFont, annotationSize, legendSize: font sizes.
        nodeSize: marker size; a falsy value selects a per-table default.
        markers: optional ``{group label: marker}``; when given, points are
            drawn one by one and ``colorsHash`` / ``legendMap`` are required.
        nodeColors, colorScheme: currently unused; kept so existing calls work.
        colorsHash: ``{row label: color}`` of each point (with ``markers``).
        legendMap: ``{legend text: color}`` of the legend (with ``markers``).

    Raises:
        ValueError: if ``data_path`` contains neither ``"Fluxes"`` nor
            ``"Sample"``, or if ``markers`` is given without ``colorsHash``
            and ``legendMap``.
    """
    # derive a short figure name from the file name
    if "_probInteractions" in data_path:
        baseName = data_path.replace("_probInteractions", "")
    elif "MetaboliteInteractions" in data_path:
        baseName = data_path.replace("MetaboliteInteractions", "")
    else:
        baseName = data_path
    baseName = baseName.replace(".csv", "").split("/")[-1]

    # table-specific index column and default plotting parameters
    if "Fluxes" in data_path:
        indexName, default_perplexity, default_size = "name", 5, 40
    elif "Sample" in data_path:
        if "Interval" in data_path:
            indexName, default_perplexity, default_size = "Interval", 5, 120
        else:
            indexName, default_perplexity, default_size = "Sample", 15, 70
    else:
        raise ValueError(f"Unsupported table {data_path!r}: the path must contain 'Fluxes' or 'Sample'.")
    perplexity = perplexity or default_perplexity
    s = nodeSize or default_size
    if markers is not None and (colorsHash is None or legendMap is None):
        raise ValueError("colorsHash and legendMap are required when markers is provided.")
    print(f"The {baseName} figure at {perplexity} perplexity is being generated")

    # load and process the df
    df = read_csv(data_path).set_index(indexName).fillna(0)
    if "Fluxes" in data_path:
        df = df.drop(["equation", "id"], axis=1)
        legend_title = f"Taxonomical {taxonomical_level.capitalize()}"
        text_labels, labels = [], []
        # every column name must split into exactly (asv, diet, day)
        for asv, _diet, day in (col.split("_") for col in df.columns):
            text_labels.append(day)
            labels.append(taxonomy[asv][taxonomical_level])
        df.columns = list(map(str, labels))
        df.index = list(map(str, df.index))
        df = df.T
    else:
        if "zScore" not in data_path:
            df = df.drop("Name", axis=0)
        legend_title = "CH4 fluxes"
        labels = ["Restored" if "SF" in i else "Reference" if "R2A" in i else "Unrestored" for i in df.index]
        text_labels = list(df.index)

    # define the chart labels and corresponding color codes
    df = df.astype(float)
    chart_labels = Series(labels).astype('category')

    # create the tSNE mapping; newer scikit-learn names the iteration limit
    # `max_iter` and older releases `n_iter`, so use whichever is available
    iter_kwarg = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"
    tsne_model = TSNE(n_components=2, random_state=42, perplexity=perplexity, learning_rate=50, **{iter_kwarg: 5000})
    tsne_results = tsne_model.fit_transform(df)

    # visualize the tSNE embedding
    title = f"t-SNE of {baseName}" if title is None else title
    pyplot.figure(figsize=(18, 18))
    pyplot.xlabel('t-SNE Dimension 1', fontsize=axesFont)
    pyplot.ylabel('t-SNE Dimension 2', fontsize=axesFont)
    pyplot.title(title, fontsize=titleFont)
    export_name = export_name or f"{baseName}_tSNE"
    # all colormap options are here https://matplotlib.org/stable/users/explain/colors/colormaps.html
    xs, ys = tsne_results[:, 0], tsne_results[:, 1]
    if markers is None:
        ## one scatter call, colored by group, with an automatic legend
        scatter = pyplot.scatter(xs, ys, c=chart_labels.cat.codes, cmap='brg', alpha=0.5, s=s)
        handles, _ = scatter.legend_elements()
        pyplot.legend(handles=handles, labels=chart_labels.cat.categories.tolist(),
                      title=legend_title, prop={'size': legendSize}, title_fontsize=axesFont)
    else:
        ## one scatter call per point: marker by group label, color by row label
        for x, y, index, label in zip(xs, ys, df.index, labels):
            pyplot.scatter(x, y, s=s, label=label, marker=markers[label],
                           color=colorsHash[index], edgecolor="black")
        handles = [pyplot.Line2D([0], [0], color=color, marker="s", markersize=s/20, label=sample, linestyle="None")
                   for sample, color in legendMap.items()]
        pyplot.legend(handles=handles, title=legend_title, prop={'size': legendSize}, title_fontsize=axesFont, labelspacing=1.5)

    # Add labels for each point
    from adjustText import adjust_text
    texts = [pyplot.text(xs[i], ys[i], label, fontsize=annotationSize, ha='center', va='center') for i, label in enumerate(text_labels)]
    adjust_text(texts, arrowprops=dict(arrowstyle='->', color='black'), expand=(2.3, 2.3))
    print(export_name)
    pyplot.savefig(export_name, bbox_inches='tight')#, pad_inches=10)

    
# generate the sample plots
for sample_table in glob("nboutput/Sample*probInteractions_zScore.csv"):
    df = read_csv(sample_table)
    # simulation name taken from the file name (last path component), e.g.
    # "nboutput/Sampleuptake_probInteractions_zScore.csv" -> "uptake";
    # taking the first component would yield the directory name instead
    simulation = sample_table.split("/")[-1].replace("Sample", '').replace("_probInteractions_zScore.csv", '')
    # sweep the perplexity downwards from (number of rows - 1) in steps of 5
    for perplexity in range(len(df.index)-1, 3, -5):
        baseName = f"{sample_table.replace('.csv', '')}_tSNE_{perplexity}"
        create_tsne_plot(sample_table, perplexity=perplexity, title=f"{simulation.capitalize()} MMIPPs", export_name=baseName,
                         titleFont=30, axesFont=20, nodeSize=400, annotationSize=15, legendSize=15,
                         markers={"Restored":'^', "Unrestored":'o', "Reference": 'X'}, colorsHash=ch4_to_sample,
                         legendMap=legendMap
                        )
    # NOTE(review): this break stops after the first matching table, so the
    # remaining sample tables are never plotted -- confirm it is intentional
    break
        
        

The Sampleuptake_zScore figure at 23 perplexity is being generated
nboutput/Sampleuptake_probInteractions_zScore_tSNE_23
The Sampleuptake_zScore figure at 18 perplexity is being generated
nboutput/Sampleuptake_probInteractions_zScore_tSNE_18
The Sampleuptake_zScore figure at 13 perplexity is being generated
nboutput/Sampleuptake_probInteractions_zScore_tSNE_13
