In [3]:
%load_ext autoreload
%autoreload 2

import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# get data
# All three tables are read from the same dataset directory.
DATA_DIR = "./data/datasets"
data = pd.read_pickle(f"{DATA_DIR}/islets_rna_prot_dataset.pkl")
sample_metadata = pd.read_pickle(f"{DATA_DIR}/islets_rna_prot_sample_metadata.pkl")
feature_metadata = pd.read_pickle(f"{DATA_DIR}/islets_rna_prot_feature_metadata.pkl")

# Make the project's `src` module importable.
# sys.path entries are searched as directories (or zip archives), so the directory
# that contains `src` is added rather than the module file itself.
# Assumes `src` sits in the notebook's working directory - TODO confirm.
if "." not in sys.path:
    sys.path.append(".")
from src import Utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### [F09] U-Plot of correlation coefficients between proteomics and transcriptomics + correlation significance 

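**U-plot**: a volcano-style plot that shows, for each protein, the correlation coefficient between its proteomics and transcriptomics profiles (x-axis) against the significance of that correlation (y-axis, −log10 p-value). An adjusted p-value is preferred for the significance axis.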

In [4]:
# Marker table: tab-separated, no header row; first column is the gene, second the id.
marker_ids = pd.read_csv(
    "./marker_ids.csv",
    sep = "\t",
    header = None,
    names = ["gene", "id"],
)

# Lookup tables in both directions (gene -> id and id -> gene).
# If a key occurs more than once, the last occurrence in the file wins.
gene_to_id = dict(zip(marker_ids['gene'], marker_ids['id']))
id_to_gene = dict(zip(marker_ids['id'], marker_ids['gene']))

In [8]:
# this has to be done per cell type!!!

# maximum tolerated fraction of missing values per feature (i.e. 70 % data completeness)
max_missing = 0.3

# correlation table of every cell type, keyed by cell type, so that each result stays
# available after the loop instead of being overwritten by the next iteration
pairwise_correlations = {}

for ct in ['alpha', 'beta', 'delta']:

    # select data for cell type (boolean row selection already yields new objects,
    # so the full dataset does not need to be copied first)
    is_cell_type = sample_metadata['sample'] == ct
    data_filtered = data.loc[is_cell_type]
    sample_metadata_filtered = sample_metadata.loc[is_cell_type]

    # filter features for 70 % data completeness; the missing fraction is computed
    # over all rows of this cell type, i.e. both readouts together
    mval_percentage = data_filtered.isna().mean()
    missingness = mval_percentage >= max_missing
    data_filtered = data_filtered.loc[:, ~missingness]

    # split data: proteomics
    # the readout suffix is stripped so both readouts share the same row labels;
    # regex = False makes the literal replacement independent of the pandas version
    df_p = data_filtered.loc[sample_metadata_filtered['readout'] == 'proteomics']
    df_p.index = df_p.index.str.replace("_proteomics", "", regex = False)

    # split data: transcriptomics
    df_t = data_filtered.loc[sample_metadata_filtered['readout'] == 'transcriptomics']
    df_t.index = df_t.index.str.replace("_transcriptomics", "", regex = False)

    # compute pairwise correlations: each protein's transcriptomics vs proteomics profile
    pairwise_correlation_df = Utils.pairwise_correlation(
        data_a = df_p,
        data_b = df_t,
    )

    # add protein labels and gene names (ids without a marker entry map to NaN)
    pairwise_correlation_df['protein'] = pairwise_correlation_df.index
    pairwise_correlation_df['gene'] = pairwise_correlation_df['protein'].map(id_to_gene, na_action = 'ignore')

    # keep the result for this cell type
    pairwise_correlations[ct] = pairwise_correlation_df

    # NOTE(review): plotting and saving are disabled. `Utils` has no attribute
    # `new_volcano` (calling it raises AttributeError), and `p` used for saving is not
    # defined in the cells shown here. Re-enable once both are available, and confirm
    # whether column 'p' holds the adjusted p-value.

    # # visualize in scatterplot
    # f, a, _ = Utils.new_volcano(
    #     data = pairwise_correlation_df,
    #     x_col = 'r',
    #     y_col = 'p',
    #     neg_log10_y = True,
    #     selected_labels = list(marker_ids['id'].values),
    #     label_display_col = 'gene',
    #     label_lookup_col = 'protein',
    #     lim_lower = -0.75,
    #     lim_upper = 0.75,
    #     y_threshold = -np.log10(0.05),
    #     max_labels = 0,
    #     title = ct,
    #     xlabel = "PCC (Proteomics/RNAseq)",
    #     ylabel = f"-log10(p)",
    #     dodge_labels = 'sides',
    #     left_right_label_kwargs = {
    #         'y_spacing_factor' : 1,
    #         'xlim_padding_factor' : 0.5,
    #         'highest_label_y_fraction' : 0.5,
    #     },
    # )

    # # save plot
    # p.save_figure(
    #     fig = f,
    #     filename = f"figure_09_protein_protein_corr_volcano_{ct}_new.svg",
    #     output_dir = "./assets/20250423_revision_figures/",
    #     dpi = 300,
    #     nature_width = "single",
    #     nature_height = "single",
    # )


AttributeError: type object 'Utils' has no attribute 'new_volcano'