In [1]:
"""
The purpose of this Jupyter notebook is to check for the presence of
plate subsets in the screen, i.e. sets of plates with identical
contents. This allows to perform conventional Z-scoring rather than
IQM-normalization. The latter turned out to reshuffle the gene intensity
ranks, thereby making hit identification less robust.
"""

'\nThe purpose of this Jupyter notebook is to check for the presence of\nplate subsets in the screen, i.e. sets of plates with identical\ncontents. This allows to perform conventional Z-scoring rather than\nIQM-normalization. The latter turned out to reshuffle the gene intensity\nranks, thereby making hit identification less robust.\n'

In [16]:
from collections import defaultdict

import numpy as np
import pandas as pd

### Investigating the Original Version of the Screen

In [17]:
# Load the original version of the screen TSV file
path_VACV_screen = (
    "/Users/jacobanter/Documents/Code/VACV_screen/VacciniaReport_"
    "20170223-0958_ZScored.csv"
)

screen_df = pd.read_csv(
    path_VACV_screen,
    sep="\t"
)

  screen_df = pd.read_csv(


In [18]:
# Remove control plates, i.e. retain plates with PLATE_TYPE
# "ScreeningPlate"
screen_df = screen_df[
    screen_df["PLATE_TYPE"] == "ScreeningPlate"
]

In [19]:
# Additionally, only retain plates belonging to the "VACCINIA-DP-G1" and
# "VACCINIA-DP-G2" experiments
screen_df = screen_df[
    (screen_df["Experiment"] == "VACCINIA-DP-G1")
    |
    (screen_df["Experiment"] == "VACCINIA-DP-G2")
]

In [20]:
# Plate IDs are stored in the `Barcode` column
# Identify unique plate IDs
plate_ids = screen_df["Barcode"].unique()
n_plates = len(plate_ids)

print(
    f"Number of plates comprised in the screen: {n_plates}"
)

Number of plates comprised in the screen: 114


In [7]:
# Unfortunately, the `Name` column is not fully populated
# Instead, it is populated depending on whether the respective
# experiment was successful or not
# Thus, empty cells in the `Name` column are filled with the
# corresponding gene names
# To this end, the `ID_openBIS` column is utilized, which is fully
# populated irrespective of the experiment's outcome
# As a first step, a dictionary is generated mapping the values in
# `ID_openBIS` to the corresponding values in `Name`
mapping = (
    screen_df
    .dropna(subset=["Name"])
    .set_index("ID_openBIS")["Name"]
    .to_dict()
)

In [8]:
# As a second step, use the mapping generated above to fill missing
# `Name` values
screen_df["Name"] = screen_df["Name"].fillna(
    screen_df["ID_openBIS"].map(mapping)
)

In [9]:
assert (~screen_df["Name"].isna()).all(), (
    "Some cells in `Name` still are empty!"
)

AssertionError: Some cells in `Name` still are empty!

In [10]:
# As it turns out, some cells in the `Name` column still are empty
# Determine what the `ID_openBIS` values associated with these cells are
empty_cells_ID_openBIS = screen_df.loc[
    screen_df["Name"].isna(),
    "ID_openBIS"
].unique()

print(empty_cells_ID_openBIS)
print(f"{len(empty_cells_ID_openBIS):,}")

['ATP6V1A' 'DHARMACON_L-013610-00' 'DHARMACON_L-006337-00' ...
 'DHARMACON_L-025253-01' 'DHARMACON_L-020237-02' 'DHARMACON_L-020515-01']
1,592


In [21]:
# Trying to fill the `Name` column is not straightforward, which is why
# the `ID_openBIS` column itself is used
# Quickly verify that the `ID_openBIS` column has no empty cells
assert (~screen_df["ID_openBIS"].isna()).all(), (
    "The 'ID_openBIS' column does have empty cells!"
)

In [22]:
# Now, based on the `ID_openBIS` column, determine the plate content for
# each and every plate
content_per_plate = [
    frozenset(screen_df.loc[
        screen_df["Barcode"] == plate_id,
        "ID_openBIS"
    ])
    for plate_id in plate_ids
]

In [23]:
# Now, in case there indeed are plate subsets with identical plate
# content, the set of the `content_per_plate` list must be shorter than
# the list
list_length = len(content_per_plate)
set_length = len(set(content_per_plate))

print(
    f"The list has a length of {list_length:,}, whereas the "
    f"set has a length of {set_length:,}."
)

The list has a length of 114, whereas the set has a length of 57.


In [24]:
# Fortunately, there indeed are plate subsets with identical plate
# content
# Specifically, there apparently are 57 plate subsets
# Now, as a follow-up step, determine which plates are identical, i.e.
# determine the identity of plates having identical contents
# To this end, the `defaultdict` class is leveraged
plates_by_content = defaultdict(list)

for plate_id in plate_ids:
    content = frozenset(
        screen_df.loc[
            screen_df["Barcode"] == plate_id,
            "ID_openBIS"
        ]
    )
    plates_by_content[content].append(plate_id)

In [26]:
print(len(plates_by_content))
for key, value in plates_by_content.items():
    print(value)

57
['DZ01-2M', 'DZ01-2N']
['DZ02-2M', 'DZ02-2N']
['DZ03-2M', 'DZ03-2N']
['DZ04-2M', 'DZ04-2N']
['DZ05-2M', 'DZ05-2N']
['DZ06-2M', 'DZ06-2N']
['DZ07-2M', 'DZ07-2N']
['DZ08-2M', 'DZ08-2N']
['DZ09-2M', 'DZ09-2N']
['DZ10-2M', 'DZ10-2N']
['DZ11-2M', 'DZ11-2N']
['DZ12-2M', 'DZ12-2N']
['DZ13-2M', 'DZ13-2N']
['DZ14-2M', 'DZ14-2N']
['DZ15-2M', 'DZ15-2N']
['DZ16-2M', 'DZ16-2N']
['DZ17-2M', 'DZ17-2N']
['DZ18-2M', 'DZ18-2N']
['DZ19-2M', 'DZ19-2N']
['DZ20-2M', 'DZ20-2N']
['DZ21-2M', 'DZ21-2N']
['DZ22-2M', 'DZ22-2N']
['DZ23-2M', 'DZ23-2N']
['DZ24-2M', 'DZ24-2N']
['DZ25-2M', 'DZ25-2N']
['DZ26-2M', 'DZ26-2N']
['DZ27-2M', 'DZ27-2N']
['DZ28-2M', 'DZ28-2N']
['DZ29-2M', 'DZ29-2N']
['DZ30-2M', 'DZ30-2N']
['DZ31-2M', 'DZ31-2N']
['DZ32-2M', 'DZ32-2N']
['DZ33-2M', 'DZ33-2N']
['DZ34-2M', 'DZ34-2N']
['DZ35-2M', 'DZ35-2N']
['DZ36-2M', 'DZ36-2N']
['DZ37-2M', 'DZ37-2N']
['DZ38-2M', 'DZ38-2N']
['DZ39-2M', 'DZ39-2N']
['DZ40-2M', 'DZ40-2N']
['DZ41-2M', 'DZ41-2N']
['DZ42-2M', 'DZ42-2N']
['DZ43-2M', 'DZ43-2N']
['DZ44-2

In [34]:
# When incubating microtiter plates, the non-uniform thermal
# conductivity gives rise to a temperature gradient, with peripheral
# parts of the microtiter plate reaching a higher temperature than the
# center part (bathtub effect/edge effect)
# For this reason, replicates of specific treatments often have
# different locations
# Verify whether this indeed is the case
# This is only done for a couple of treatments

def pick_siRNA_treatment(dataframe):
    treatment_name = dataframe.loc[
        dataframe["WellType"] == "POOLED_SIRNA",
        "ID_openBIS"
    ].iloc[0]

    treatment_pos = dataframe.loc[
        dataframe["ID_openBIS"] == treatment_name,
        "WellName"
    ].iloc[0]

    return treatment_name, treatment_pos

for _, plate_id_pair in plates_by_content.items():
    # Extract the individual plates
    plate_1_df = screen_df[screen_df["Barcode"] == plate_id_pair[0]]
    plate_2_df = screen_df[screen_df["Barcode"] == plate_id_pair[1]]

    # Pick the name of the first siRNA treatment and check whether the
    # location of the replicate is different
    treatment_name, treatment_pos_1 = pick_siRNA_treatment(plate_1_df)
    treatment_pos_2 = plate_2_df.loc[
        plate_2_df["ID_openBIS"] == treatment_name,
        "WellName"
    ].iloc[0]

    if treatment_pos_1 == treatment_pos_2:
        print(
            f"For plates {plate_id_pair[0]} and {plate_id_pair[1]}, "
            f"the position of the first treatment {treatment_name} is "
            "identical across plates!"
        )
    else:
        print(
            f"For plates {plate_id_pair[0]} and {plate_id_pair[1]}, "
            f"the position of the first treatment {treatment_name} is "
            "different across plates!"
        )

For plates DZ01-2M and DZ01-2N, the position of the first treatment DHARMACON_L-009271-00 is identical across plates!
For plates DZ02-2M and DZ02-2N, the position of the first treatment DHARMACON_L-007591-01 is identical across plates!
For plates DZ03-2M and DZ03-2N, the position of the first treatment DHARMACON_L-010544-00 is identical across plates!
For plates DZ04-2M and DZ04-2N, the position of the first treatment DHARMACON_L-006466-00 is identical across plates!
For plates DZ05-2M and DZ05-2N, the position of the first treatment DHARMACON_L-015793-00 is identical across plates!
For plates DZ06-2M and DZ06-2N, the position of the first treatment DHARMACON_L-011008-00 is identical across plates!
For plates DZ07-2M and DZ07-2N, the position of the first treatment DHARMACON_L-016313-01 is identical across plates!
For plates DZ08-2M and DZ08-2N, the position of the first treatment DHARMACON_L-011689-00 is identical across plates!
For plates DZ09-2M and DZ09-2N, the position of the firs

In [41]:
# As it turns out, the position of the first treatment is identical
# across plates
# Thus, the investigation is extended to all treatments
for _, (plate_id_1, plate_id_2) in plates_by_content.items():
    # Extract the individual plates
    plate_1_df = screen_df[screen_df["Barcode"] == plate_id_1]
    plate_2_df = screen_df[screen_df["Barcode"] == plate_id_2]
    
    # For each plate, create a dictionary mapping the treatments to
    # their position on the respective plate
    treatment_dict_1 = (
        plate_1_df.set_index("ID_openBIS")["WellName"].to_dict()
    )
    treatment_dict_2 = (
        plate_2_df.set_index("ID_openBIS")["WellName"].to_dict()
    )

    # Check whether the locations differ
    if treatment_dict_1 != treatment_dict_2:
        print(
            f"For plates {plate_id_1} and {plate_id_2}, the locations "
            "of some treatments differ!"
        )

### Investigating the Processed Screen Subset

In [42]:
# The Dharmacon Pooled Genome 1 & 2 subset has been subjected to some
# processing, such as updating gene names and adding UniProt accessions
# The presence of plate subsets is also checked for this updated version
# of the screen subset
updated_screen_subset_path = (
    "/Users/jacobanter/Documents/Code/VACV_screen/Processing_Dharmacon_"
    "pooled_genome_1_and_2_subset/Dharmacon_pooled_G1_G2_screening_"
    "plates_subset_with_missing_UniProt_IDs_Z-scored.tsv"
)

updated_screen_subset_df = pd.read_csv(
    updated_screen_subset_path,
    sep="\t"
)

In [43]:
# Identify unique plate IDs, which are stored in the `Barcode` column
updated_plate_ids = updated_screen_subset_df["Barcode"].unique()
n_plates_updated = len(updated_plate_ids)

print(
    "Number of plates comprised in the updated screen subset: "
    f"{n_plates_updated}"
)

Number of plates comprised in the updated screen subset: 114


In [44]:
# As above, the plate contents are determined based on the `ID_openBIS`
# column
content_per_plate_updated = [
    frozenset(
        updated_screen_subset_df.loc[
            updated_screen_subset_df["Barcode"] == plate_id,
            "ID_openBIS"
        ]
    )
    for plate_id in updated_plate_ids
]

In [45]:
updated_list_length = len(content_per_plate_updated)
updated_set_length = len(set(content_per_plate_updated))

print(
    f"The list has a length of {updated_list_length:,}, whereas the "
    f"set has a length of {updated_set_length:,}."
)

The list has a length of 114, whereas the set has a length of 57.


In [46]:
# Determine which plates are identical, i.e. determine the identity of
# plates having identical contents
plates_by_content_updated = defaultdict(list)

for plate_id in updated_plate_ids:
    content = frozenset(
        updated_screen_subset_df.loc[
            updated_screen_subset_df["Barcode"] == plate_id,
            "ID_openBIS"
        ]
    )
    plates_by_content_updated[content].append(plate_id)

In [47]:
assert len(plates_by_content_updated) == 57, (
    "Something went wrong while determining plate subsets!"
)

In [55]:
# Now, as a final step, check whether the plate subsets are identical
# between the original screen subset and the updated screen subset
orig_plate_subsets = list(plates_by_content.values())
updated_plate_subsets = list(plates_by_content_updated.values())

assert orig_plate_subsets == updated_plate_subsets, (
    "The plate subsets are not identical between the original screen "
    "subset and the updated screen subset!"
)