In [1]:
"""
The purpose of this Jupyter notebook is to identify individual genes/
proteins for which PPI-guided intensity refinement is successful.

This is accomplished by computing the differences between unrefined and
refined intensities. Proteins whose intensity difference exceeds a
certain threshold and are present in the test set are eligible.
"""

'\nThe purpose of this Jupyter notebook is to identify individual genes/\nproteins for which PPI-guided intensity refinement is successful.\n\nThis is accomplished by computing the differences between unrefined and\nrefined intensities. Proteins whose intensity difference exceeds a\ncertain threshold and are present in the test set are eligible.\n'

In [23]:
import os

import napari
import numpy as np
import pandas as pd
from PIL import Image

### Loading Data into DataFrames

In [3]:
# Read the tab-separated file describing the proteins/genes shared by
# the screen subset and the test set
intersection_screen_subset_test_set_path = (
    "gene_and_protein_labels_intersection_screen_subset_test_set.tsv"
)
intersection_screen_subset_test_set_df = pd.read_csv(
    intersection_screen_subset_test_set_path, sep="\t"
)

In [4]:
# Read the IQM-normalized intensities prior to refinement
# Only the two identifier columns (which become the MultiIndex) and the
# two intensity columns are loaded
iqm_norm_unrefined_ints_path = (
    "/Users/jacobanter/Documents/Code/VACV_screen/Processing_Dharmacon_"
    "pooled_genome_1_and_2_subset/refined_intensities_and_related_data/"
    "IQM-normalized_intensities_min-max_normalized.tsv"
)

unrefined_index_cols = ["Name", "UniProt_IDs"]
unrefined_intensity_cols = [
    "dIntensity_cPathogen_eMean_oVoronoiCells_nZScore",
    "dIntensity_cLatePathogen_eMean_oVoronoiCells_nZScore"
]

iqm_norm_unrefined_ints_df = pd.read_csv(
    iqm_norm_unrefined_ints_path,
    sep="\t",
    usecols=unrefined_index_cols + unrefined_intensity_cols,
    index_col=unrefined_index_cols
)

In [5]:
# Read the IQM-normalized intensities after median refinement
# Only the two identifier columns (which become the MultiIndex) and the
# two intensity columns are loaded
iqm_norm_median_refined_ints_path = (
    "/Users/jacobanter/Documents/Code/VACV_screen/Processing_Dharmacon_"
    "pooled_genome_1_and_2_subset/refined_intensities_and_related_data/"
    "IQM-normalized_intensities_median_refined.tsv"
)

refined_index_cols = ["Name", "UniProt_IDs"]
refined_intensity_cols = [
    "dIntensity_cPathogen_eMean_oVoronoiCells_nZScore",
    "dIntensity_cLatePathogen_eMean_oVoronoiCells_nZScore"
]

iqm_norm_median_refined_ints_df = pd.read_csv(
    iqm_norm_median_refined_ints_path,
    sep="\t",
    usecols=refined_index_cols + refined_intensity_cols,
    index_col=refined_index_cols
)

### Computing Intensity Differences

In [6]:
# The unrefined and the refined DataFrame share the same MultiIndex and
# the same column names, so an element-wise subtraction (unrefined minus
# refined) directly yields the per-protein differences
int_diffs_df = iqm_norm_unrefined_ints_df.sub(
    iqm_norm_median_refined_ints_df
)

In [7]:
# Determine the maximum as well as the minimum differences
# Note that the `.min()` and `.max()` methods return one value per
# column in the form of a Series
# The values are looked up by column label rather than by position:
# `pd.read_csv` ignores the order of the `usecols` list and keeps the
# column order of the file, so positional access would silently swap
# early and late intensities if the file's column order differed
early_intensity_col = "dIntensity_cPathogen_eMean_oVoronoiCells_nZScore"
late_intensity_col = "dIntensity_cLatePathogen_eMean_oVoronoiCells_nZScore"

max_series = int_diffs_df.max()
max_early_diff = max_series[early_intensity_col]
max_late_diff = max_series[late_intensity_col]

min_series = int_diffs_df.min()
min_early_diff = min_series[early_intensity_col]
min_late_diff = min_series[late_intensity_col]

print(
    f"Maximum difference for early intensities: {max_early_diff:.3f}\n"
    f"Minimum difference for early intensities: {min_early_diff:.3f}\n"
    "\n"
    f"Maximum difference for late intensities: {max_late_diff:.3f}\n"
    f"Minimum difference for late intensities: {min_late_diff:.3f}"
)

Maximum difference for early intensities: 0.514
Minimum difference for early intensities: 0.000

Maximum difference for late intensities: 0.166
Minimum difference for late intensities: 0.000


### Identifying Eligible Proteins

In [8]:
# First, restrict the differences to proteins that also occur in the
# test set, i.e. whose UniProt ID is listed in the intersection
test_set_uniprot_ids = intersection_screen_subset_test_set_df["UniProt_IDs"]
uniprot_ids_in_index = int_diffs_df.index.get_level_values("UniProt_IDs")
in_test_set_mask = uniprot_ids_in_index.isin(test_set_uniprot_ids)

int_diffs_df = int_diffs_df[in_test_set_mask]

In [9]:
# Sanity check: the intersection between the screen subset and the test
# set comprises 62 proteins, so exactly 62 rows must remain after
# filtering
expected_n_proteins = 62
n_retained_proteins = len(int_diffs_df)
assert n_retained_proteins == expected_n_proteins, (
    "The filtering step was not successful!"
)

In [10]:
# Determine the maximum as well as the minimum differences of the
# filtered DataFrame
# The values are looked up by column label rather than by position:
# `pd.read_csv` ignores the order of the `usecols` list and keeps the
# column order of the file, so positional access would silently swap
# early and late intensities if the file's column order differed
early_intensity_col = "dIntensity_cPathogen_eMean_oVoronoiCells_nZScore"
late_intensity_col = "dIntensity_cLatePathogen_eMean_oVoronoiCells_nZScore"

filtered_max_series = int_diffs_df.max()
filtered_max_early_diff = filtered_max_series[early_intensity_col]
filtered_max_late_diff = filtered_max_series[late_intensity_col]

filtered_min_series = int_diffs_df.min()
filtered_min_early_diff = filtered_min_series[early_intensity_col]
filtered_min_late_diff = filtered_min_series[late_intensity_col]

# The last label used to read "differece"; the typo is corrected here
print(
    f"Maximum difference for early intensities: {filtered_max_early_diff:.3f}\n"
    f"Minimum difference for early intensities: {filtered_min_early_diff:.3f}\n"
    "\n"
    f"Maximum difference for late intensities: {filtered_max_late_diff:.3f}\n"
    f"Minimum difference for late intensities: {filtered_min_late_diff:.3f}"
)

Maximum difference for early intensities: 0.339
Minimum difference for early intensities: 0.006

Maximum difference for late intensities: 0.061
Minimum differece for late intensities: 0.001


In [11]:
# Even after filtering, some proteins retain large intensity
# differences; these occur for the early intensities
# Collect the UniProt IDs of all proteins whose early intensity
# difference reaches the threshold
early_intensity_col = "dIntensity_cPathogen_eMean_oVoronoiCells_nZScore"
min_significant_diff = 0.304

is_high_diff = int_diffs_df[early_intensity_col] >= min_significant_diff
high_diff_prots = (
    int_diffs_df[is_high_diff].index.get_level_values("UniProt_IDs")
)

n_high_diff_prots = len(high_diff_prots)
print(
    "Number of proteins with significant intensity differences: "
    f"{n_high_diff_prots}"
)

Number of proteins with significant intensity differences: 6


In [12]:
# List every selected UniProt ID next to its PPI label (i.e. 0 or 1)
test_set_ids = intersection_screen_subset_test_set_df["UniProt_IDs"]
test_set_labels = intersection_screen_subset_test_set_df["Label"]

for uniprot_id in high_diff_prots:
    # The label of the first row matching the current UniProt ID is used
    labels_of_current_id = test_set_labels[test_set_ids == uniprot_id]
    ppi_label = labels_of_current_id.iloc[0]
    print(f"{uniprot_id.ljust(12)}{ppi_label}")

Q53HB9      0
Q9NY93      0
B2RD09      0
A8K0P8      0
Q68CQ4      0
E9PS41      0


### Generating Microscopy Images for Proteins Associated with<br>Significant Intensity Differences

In [13]:
# Conveniently enough, the proteins associated with the largest
# intensity differences all have negative PPI labels
# This implies that PPI-guided intensity refinement correctly reduces
# the measured intensities, thereby reducing false discoveries
# Now, generate microscopy images for these proteins
# In order to retrieve the images, the location of the corresponding
# proteins/genes in the microwell plates must be known
# Note that Q53HB9 and Q9NY93 are encoded by the same gene
# A8K0P8 and Q68CQ4 are also encoded by the same gene
#
# Q53HB9 (Gene DDX56):
# Barcode (i.e. plate ID): DZ08-2M
# Position: N15
#
# B2RD09 (Gene NSUN5):
# Barcode: DZ11-2M
# Position: C09
#
# A8K0P8 (Gene UTP25):
# Barcode: DZ48-2M
# Position: I14
#
# E9PS41 (Gene WDR74):
# Barcode: DZ56-2M
# Position: K04

In [19]:
# Root directory holding one sub-directory of raw images per protein/gene
images_dir = "microscopy_images_successful_refinements"

# Sub-directories follow the naming scheme
# `<UniProt ID>_plate_<plate barcode>_well_<well position>`
raw_dir_list = [
    "Q53HB9_plate_DZ08-2M_well_N15",
    "A8K0P8_plate_DZ48-2M_well_I14",
    "B2RD09_plate_DZ11-2M_well_C09",
    "E9PS41_plate_DZ56-2M_well_K04",
]

# Per-channel display settings for the four channels
# `name_in_file` is the channel tag as it appears in the image file names
visual_settings = {
    "DAPI": dict(
        colormap="blue",
        contrast_limits=[0, 371],
        opacity=1,
        gamma=2,
        name_in_file="DAPI",
    ),
    "eGFP": dict(
        colormap="green",
        contrast_limits=[62, 2175],
        opacity=1,
        gamma=1.3,
        name_in_file="GFP",
    ),
    "mCherry": dict(
        colormap="red",
        contrast_limits=[56, 3300],
        opacity=1,
        gamma=1.3,
        name_in_file="RFP",
    ),
    "Cy5": dict(
        colormap="gray",
        contrast_limits=[190, 1200],
        opacity=1,
        gamma=1,
        name_in_file="CY5",
    ),
}

In [15]:
# Create the napari viewer to which the channel images are added as
# layers and from which the screenshots are taken further below
viewer = napari.Viewer()

In [29]:
# Generate one superimposed PNG per field of view for each selected well
# Layers left over from an earlier (possibly interrupted) run would end
# up in the first screenshot, so start from an empty viewer
viewer.layers.clear()

# Iterate over the four image directories
# The loop variable is deliberately not named `dir`, as this would
# shadow the built-in function of the same name for the remainder of
# the session
for raw_dir_name in raw_dir_list:
    # Directory names follow the scheme
    # `<UniProt ID>_plate_<plate ID>_well_<well name>`
    dir_name_parts = raw_dir_name.split("_")
    uniprot_id = dir_name_parts[0]
    plate_id = dir_name_parts[2]
    well_name = dir_name_parts[-1]

    raw_image_dir_path = os.path.join(images_dir, raw_dir_name)

    superimposed_image_dir_path = os.path.join(
        images_dir,
        f"{uniprot_id}_superimposed"
    )

    # `exist_ok=True` makes a separate existence check unnecessary
    os.makedirs(superimposed_image_dir_path, exist_ok=True)

    # For one well, nine different images/fields of view have been taken
    # A superimposed image is generated for each of them
    # Iterate over the nine fields of view
    for i in range(1, 10):
        # Iterate over the four channels (DAPI, Cy5, eGFP and mCherry)
        for channel, settings in visual_settings.items():
            channel_name_in_file = settings["name_in_file"]
            image_file_name = (
                f"b{plate_id}_w{well_name}_s{i}_z1_t1_"
                f"c{channel_name_in_file}_u1.jp2"
            )
            path_image_of_current_channel = os.path.join(
                raw_image_dir_path,
                image_file_name
            )

            # Load the image via the Pillow library and convert it into
            # a NumPy array
            # The latter is necessary as Napari operates on NumPy arrays
            # The conversion takes place while the file is still open
            # and the result gets its own name instead of rebinding the
            # Pillow image object
            with Image.open(path_image_of_current_channel) as pil_image:
                channel_image = np.array(pil_image)

            # Add the image of the current channel as layer to Napari
            viewer.add_image(
                channel_image,
                name=channel,
                colormap=settings["colormap"],
                blending="additive",
                contrast_limits=settings["contrast_limits"],
                opacity=settings["opacity"],
                gamma=settings["gamma"]
            )

        # All four channels have been superimposed, so the final image
        # can now be taken
        # Save the image only as PNG
        # Unfortunately, saving a composited SVG of all layers is not
        # supported by Napari
        viewer.reset_view()
        viewer.screenshot(
            os.path.join(
                superimposed_image_dir_path,
                f"superimposed_image_uniprot_id_{uniprot_id}_view_{i}.png"
            ),
            canvas_only=True
        )

        # Clear all current layers so that the next image can be
        # generated
        viewer.layers.clear()

In [30]:
# The following fields of view are chosen for the publication:
# Q53HB9: 5
# B2RD09: 4
# A8K0P8: 2
# E9PS41: 7