In [1]:
# Include sslearn folder
import sys
import pickle as pkl
import numpy as np
import os
from SequenceEncoding import SequenceEncoding
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, weightedtau
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from itertools import combinations, permutations
from baycomp import two_on_multiple, HierarchicalTest

In [2]:
# Root folder that contains one sub-folder of pickled predictions per experiment run.
results_dir = "results"

# Sequence encodings to load; each name is embedded verbatim in the pickle file names.
encodings = [
    "One_hot",
    "One_hot_6_bit",
    "Binary_5_bit",
    "Hydrophobicity_matrix",
    "Meiler_parameters",
    "Acthely_factors",
    "PAM250",
    "BLOSUM62",
    "Miyazawa_energies",
    "Micheletti_potentials",
    "AESNN3",
    "ANN4D",
]

# Mask identifiers, generated from their naming pattern. The order is significant:
# it is the column order of the comparative grid figure.
masks = [
    # Base masks followed by their scaled variants ("", x2, x10, x0.5, x0.1).
    *(
        f"{source}{scale}"
        for source in ("relative", "shannon", "lockless")
        for scale in ("", "x2", "x10", "x0.5", "x0.1")
    ),
    "1-shannon",
    # inverted_* then normalized_* versions of the three base masks.
    *(
        f"{prefix}_{source}"
        for prefix in ("inverted", "normalized")
        for source in ("relative", "shannon", "lockless")
    ),
    # Plain then gaussian "variants emphasis" masks, one per weight.
    *(
        f"variants_{kind}emphasis_weight_{weight}"
        for kind in ("", "gaussian_")
        for weight in ("0.25", "0.5", "0.75", "1.5", "2", "5")
    ),
    "random",
]

# Labeled-size settings to load. Currently disabled values: 0.75, 0.5, 0.25, 0.1, 0.05, 0.03, 0.01
labeled_sizes_list = [1]

In [4]:
experiments_type = "masking_experiments"

# Column order of the results table (and therefore of the CSV exported from it).
result_columns = ['Dataset', 'Labeled', 'Train_size', 'Encoding', 'MSE',
                  'Spearman_r', 'Weighted_tau', 'Mask', 'RMSE']

# One dict per (dataset, encoding, mask, labeled size, fold). The DataFrame is built
# once at the end: concatenating inside the loop copies the whole table on every row.
result_rows = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for dataset_results_folder in sorted(os.listdir(results_dir)):
    dataset_results_dir = os.path.join(results_dir, dataset_results_folder)
    # Keep only directories that belong to the selected experiment type.
    if not (dataset_results_folder.startswith(experiments_type)
            and os.path.isdir(dataset_results_dir)):
        continue

    # Strip the experiment prefix and the last "_"-separated token of the folder name,
    # e.g. "masking_experiments_0.6M_BMIMI_Ridge" -> "0.6M_BMIMI".
    dataset_name = "_".join(dataset_results_folder.split(experiments_type+"_")[1].split("_")[:-1])

    # Load every prediction pickle of this dataset, keyed by "<enc>" (unmasked) or
    # "<enc>_masked_<mask>", then by labeled size.
    # NOTE: pickle can execute arbitrary code while loading; only use trusted result files.
    global_pred_dict = dict()
    for enc in encodings:
        global_pred_dict[enc] = dict()
        for mask in masks:
            global_pred_dict[f'{enc}_masked_{mask}'] = dict()

        for labeled_size in labeled_sizes_list:
            with open(f'{dataset_results_dir}/pred_dict_{enc}_{labeled_size}.pickle', 'rb') as f:
                global_pred_dict[enc][labeled_size] = pkl.load(f)
            for mask in masks:
                with open(f'{dataset_results_dir}/pred_dict_{enc}_masked_{mask}_{labeled_size}.pickle', 'rb') as f:
                    global_pred_dict[f'{enc}_masked_{mask}'][labeled_size] = pkl.load(f)

    # Score every fold of every loaded prediction set.
    for encoding, labeled_sizes in global_pred_dict.items():
        # Split the key back into encoding and mask; keys without the separator are unmasked runs.
        enc_value, separator, mask_value = encoding.partition("_masked_")
        if not separator:
            mask_value = "unmasked"

        for labeled_size, folds in labeled_sizes.items():
            for results in folds.values():
                y_proba = results["y_proba"]
                y_test = results["original_y_test"]
                mse = mean_squared_error(y_test, y_proba)
                result_rows.append({
                    'Dataset': dataset_name,
                    'Labeled': labeled_size,
                    'Train_size': results["train_len"],
                    'Encoding': enc_value,
                    'Mask': mask_value,
                    'MSE': mse,
                    'RMSE': np.sqrt(mse),
                    'Spearman_r': spearmanr(y_test, y_proba)[0],
                    'Weighted_tau': weightedtau(y_test, y_proba)[0],
                })

df = pd.DataFrame(result_rows, columns=result_columns)

FileNotFoundError: [Errno 2] No such file or directory: 'results/masking_experiments_0.6M_BMIMI_Ridge/pred_dict_One_hot_masked_relative/2_1.pickle'

In [None]:
# Export the results table to CSV inside the configured results folder
# (built from results_dir so the location follows the configuration cell).
df.to_csv(os.path.join(results_dir, "masking_experiments_results.csv"), index=False)

In [None]:
# Reload the results table from the CSV in the configured results folder
# (built from results_dir so the location follows the configuration cell).
df = pd.read_csv(os.path.join(results_dir, "masking_experiments_results.csv"))

In [None]:
# nest_asyncio patches asyncio so an event loop can be re-entered while another one is running.
# NOTE(review): presumably needed because something executed later (likely the sampling behind
# HierarchicalTest) starts its own event loop inside the notebook's loop — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
from baycomp_plotting import tern

selected_metric = "Spearman_r"

n_masks = len(df["Mask"].unique().tolist())
n_encodings = len(df["Encoding"].unique().tolist())

# savefig does not create missing folders, so make sure the output folder exists.
figs_dir = "results/figs"
os.makedirs(figs_dir, exist_ok=True)

# One figure per (encoding, mask): hierarchical Bayesian comparison of unmasked vs. masked scores.
for encoding in df["Encoding"].unique().tolist():
    # Rows of this encoding with Labeled == 1; identical for every mask, so filter once.
    df_encoding = df[(df["Labeled"] == 1) & (df["Encoding"] == encoding)]

    for mask in df["Mask"].unique().tolist():
        # "unmasked" is the baseline every mask is compared against, not a mask itself.
        if mask == "unmasked":
            continue

        df_mask = df_encoding[df_encoding["Mask"].isin(["unmasked", mask])]
        # Collapse each (Dataset, Mask) group into one list of values per column.
        df_mask = df_mask.groupby(["Dataset", "Mask"]).agg(list).reset_index()

        # Stack the per-dataset score lists and transpose them.
        # NOTE(review): confirm this orientation is the one HierarchicalTest expects.
        unmasked_matrix = np.array([np.array(x) for x in df_mask[df_mask["Mask"] == "unmasked"][selected_metric].values]).T
        masked_matrix = np.array([np.array(x) for x in df_mask[df_mask["Mask"] == mask][selected_metric].values]).T

        posterior = HierarchicalTest(unmasked_matrix, masked_matrix, rope=0.01)
        fig = tern(posterior, l_tag="unmasked", r_tag=mask)
        # Change title
        fig.axes[0].set_title(f"{encoding}\n{mask}", fontsize=50)
        # Make figure bigger
        fig.set_size_inches(12, 12)
        # Put title closer to the plot
        fig.subplots_adjust(top=0.85)
        fig.savefig(os.path.join(figs_dir, "bayesian_"+encoding+"_"+mask+"_"+selected_metric+".png"))
        # The figure is on disk; close it so hundreds of figures do not pile up in memory.
        plt.close(fig)


In [None]:
import cv2
def generateBigPlot(encodings, masks, metric_name, figs_dir):
    """Assemble the per-(encoding, mask) Bayesian figures into one grid image.

    Reads ``bayesian_<encoding>_<mask>_<metric_name>.png`` from ``figs_dir`` for every
    combination, lays them out with one row per encoding and one column per mask, and
    writes the grid to ``bayesian_comparative_<metric_name>.png`` in the same folder.

    A figure that is missing or unreadable is replaced by a plain light-grey tile and
    reported through a warning, so one absent figure does not abort the whole grid.

    Parameters
    ----------
    encodings : sequence of str
        Encoding names; one grid row each, in the given order.
    masks : sequence of str
        Mask names; one grid column each, in the given order.
    metric_name : str
        Metric suffix used in the figure file names.
    figs_dir : str
        Folder holding the individual figures; the grid is written there too.

    Raises
    ------
    ValueError
        If ``encodings`` or ``masks`` is empty.
    FileNotFoundError
        If none of the expected figures can be read.
    OSError
        If the grid image cannot be written.
    """
    import warnings

    if len(encodings) == 0 or len(masks) == 0:
        raise ValueError("encodings and masks must both be non-empty")

    def tile_path(encoding, mask):
        # File name convention of the individual figures.
        return os.path.join(figs_dir, "bayesian_"+encoding+"_"+mask+"_"+metric_name+".png")

    # cv2.imread reports a missing/unreadable file by returning None, not by raising.
    # The first readable figure fixes the shape and dtype of the grey placeholder tile.
    readable = (
        img
        for img in (cv2.imread(tile_path(encoding, mask))
                    for encoding in encodings for mask in masks)
        if img is not None
    )
    reference = next(readable, None)
    if reference is None:
        raise FileNotFoundError(
            f"No readable figure for metric {metric_name!r} found in {figs_dir!r}"
        )
    placeholder = np.full_like(reference, 242)

    missing = []
    rows = []
    for encoding in encodings:
        tiles = []
        for mask in masks:
            img = cv2.imread(tile_path(encoding, mask))
            if img is None:
                missing.append(tile_path(encoding, mask))
                img = placeholder
            tiles.append(img)
        # Concatenate each row in one call instead of growing it tile by tile.
        rows.append(cv2.hconcat(tiles))
    final = cv2.vconcat(rows)

    if missing:
        warnings.warn(
            f"{len(missing)} figure(s) could not be read and were replaced by a "
            f"placeholder tile: {missing}"
        )

    output_path = os.path.join(figs_dir, 'bayesian_comparative_'+metric_name+'.png')
    # cv2.imwrite returns False instead of raising when the image cannot be written.
    if not cv2.imwrite(output_path, final):
        raise OSError(f"Could not write {output_path!r}")

In [None]:
# Build the comparative grid (rows: encodings, columns: masks) for the Spearman correlation figures.
generateBigPlot(
    encodings=encodings,
    masks=masks,
    metric_name="Spearman_r",
    figs_dir="results/figs/",
)

In [None]:
# Rows with Labeled == 1 and the One_hot encoding, restricted to the unmasked baseline
# and the normalized_relative mask.
df_one_hot = df[
    (df["Labeled"] == 1)
    & (df["Encoding"] == "One_hot")
    & (df["Mask"].isin(["unmasked", "normalized_relative"]))
]
# numeric_only=True: the frame still holds the string column 'Encoding', which makes a
# plain .mean() raise TypeError on pandas >= 2.0.
df_one_hot.groupby(["Dataset", "Mask"]).mean(numeric_only=True)

In [None]:
# Mean of every numeric column per (Dataset, Mask), taken over all rows of df
# (i.e. across encodings, labeled sizes and folds).
# numeric_only=True: the string column 'Encoding' makes a plain .mean() raise
# TypeError on pandas >= 2.0.
best_df = df.groupby(["Dataset", "Mask"]).mean(numeric_only=True)

# Print, for each dataset, the mask with the highest mean of the selected metric.
for dataset in best_df.index.get_level_values(0).unique().tolist():
    # Select the metric first so idxmax runs on that single column only.
    print(dataset, best_df.loc[dataset][selected_metric].idxmax())

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; consider removing one of them.
# Mean of every numeric column per (Dataset, Mask), taken over all rows of df
# (i.e. across encodings, labeled sizes and folds).
# numeric_only=True: the string column 'Encoding' makes a plain .mean() raise
# TypeError on pandas >= 2.0.
best_df = df.groupby(["Dataset", "Mask"]).mean(numeric_only=True)

# Print, for each dataset, the mask with the highest mean of the selected metric.
for dataset in best_df.index.get_level_values(0).unique().tolist():
    # Select the metric first so idxmax runs on that single column only.
    print(dataset, best_df.loc[dataset][selected_metric].idxmax())

# Extrapolation/generalization

In [None]:
experiments_type = "masking_extrapolation_experiments"

# Column order of the extrapolation results table.
extrapolation_columns = ['Dataset', 'Labeled', 'Train_size', 'Encoding', 'MSE',
                         'Spearman_r', 'Weighted_tau', 'Mask', 'RMSE']

# One dict per (dataset, encoding, mask, labeled size, fold). The DataFrame is built
# once at the end: concatenating inside the loop copies the whole table on every row.
extrapolation_rows = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for dataset_results_folder in sorted(os.listdir(results_dir)):
    dataset_results_dir = os.path.join(results_dir, dataset_results_folder)
    # Keep only directories that belong to the selected experiment type.
    if not (dataset_results_folder.startswith(experiments_type)
            and os.path.isdir(dataset_results_dir)):
        continue

    # Strip the experiment prefix and the last "_"-separated token of the folder name.
    dataset_name = "_".join(dataset_results_folder.split(experiments_type+"_")[1].split("_")[:-1])

    # Load every prediction pickle of this dataset, keyed by "<enc>" (unmasked) or
    # "<enc>_masked_<mask>", then by labeled size.
    # NOTE: pickle can execute arbitrary code while loading; only use trusted result files.
    global_pred_dict = dict()
    for enc in encodings:
        global_pred_dict[enc] = dict()
        for mask in masks:
            global_pred_dict[f'{enc}_masked_{mask}'] = dict()

        for labeled_size in labeled_sizes_list:
            with open(f'{dataset_results_dir}/pred_dict_{enc}_{labeled_size}.pickle', 'rb') as f:
                global_pred_dict[enc][labeled_size] = pkl.load(f)
            for mask in masks:
                with open(f'{dataset_results_dir}/pred_dict_{enc}_masked_{mask}_{labeled_size}.pickle', 'rb') as f:
                    global_pred_dict[f'{enc}_masked_{mask}'][labeled_size] = pkl.load(f)

    # Score every fold of every loaded prediction set.
    for encoding, labeled_sizes in global_pred_dict.items():
        # Split the key back into encoding and mask; keys without the separator are unmasked runs.
        enc_value, separator, mask_value = encoding.partition("_masked_")
        if not separator:
            mask_value = "unmasked"

        for labeled_size, folds in labeled_sizes.items():
            for results in folds.values():
                y_proba = results["y_proba"]
                y_test = results["original_y_test"]
                mse = mean_squared_error(y_test, y_proba)
                extrapolation_rows.append({
                    'Dataset': dataset_name,
                    'Labeled': labeled_size,
                    'Train_size': results["train_len"],
                    'Encoding': enc_value,
                    'Mask': mask_value,
                    'MSE': mse,
                    'RMSE': np.sqrt(mse),
                    'Spearman_r': spearmanr(y_test, y_proba)[0],
                    'Weighted_tau': weightedtau(y_test, y_proba)[0],
                })

extrapolation_df = pd.DataFrame(extrapolation_rows, columns=extrapolation_columns)

In [None]:
# Display the extrapolation results table built in the previous cell for a quick visual check.
extrapolation_df

In [None]:
from baycomp_plotting import tern

# encodings = ["One_hot"]
encodings = ["One_hot", "One_hot_6_bit", "Binary_5_bit", "Hydrophobicity_matrix",
             "Meiler_parameters", "Acthely_factors", "PAM250", "BLOSUM62",
             "Miyazawa_energies", "Micheletti_potentials", "AESNN3",
             "ANN4D"]
masks = ["relative", "shannon", "lockless", "inverted_relative", "inverted_shannon", "inverted_lockless", "normalized_relative", "normalized_shannon", "normalized_lockless", "random"]

selected_metric = "Spearman_r"

# This section analyses the extrapolation experiments, so every statistic below is
# taken from extrapolation_df (not from the df of the standard masking experiments).
n_masks = len(extrapolation_df["Mask"].unique().tolist())
n_encodings = len(extrapolation_df["Encoding"].unique().tolist())

# savefig does not create missing folders, so make sure the output folder exists.
extrapolation_figs_dir = "results/figs/extrapolation"
os.makedirs(extrapolation_figs_dir, exist_ok=True)

# One figure per (encoding, mask): hierarchical Bayesian comparison of unmasked vs. masked scores.
for encoding in extrapolation_df["Encoding"].unique().tolist():
    # Rows of this encoding with Labeled == 1; identical for every mask, so filter once.
    df_encoding = extrapolation_df[(extrapolation_df["Labeled"] == 1) & (extrapolation_df["Encoding"] == encoding)]

    for mask in extrapolation_df["Mask"].unique().tolist():
        # "unmasked" is the baseline every mask is compared against, not a mask itself.
        if mask == "unmasked":
            continue

        df_mask = df_encoding[df_encoding["Mask"].isin(["unmasked", mask])]
        # Collapse each (Dataset, Mask) group into one list of values per column.
        df_mask = df_mask.groupby(["Dataset", "Mask"]).agg(list).reset_index()

        # Stack the per-dataset score lists and transpose them.
        # NOTE(review): confirm this orientation is the one HierarchicalTest expects.
        unmasked_matrix = np.array([np.array(x) for x in df_mask[df_mask["Mask"] == "unmasked"][selected_metric].values]).T
        masked_matrix = np.array([np.array(x) for x in df_mask[df_mask["Mask"] == mask][selected_metric].values]).T

        posterior = HierarchicalTest(unmasked_matrix, masked_matrix, rope=0.01)
        fig = tern(posterior, l_tag="unmasked", r_tag=mask)
        # Change title
        fig.axes[0].set_title(f"{encoding}\n{mask}", fontsize=50)
        # Make figure bigger
        fig.set_size_inches(12, 12)
        # Put title closer to the plot
        fig.subplots_adjust(top=0.85)
        fig.savefig(os.path.join(extrapolation_figs_dir, "bayesian_"+encoding+"_"+mask+"_"+selected_metric+".png"))
        # The figure is on disk; close it so the figures do not pile up in memory.
        plt.close(fig)
