In [1]:
"""
The purpose of this Jupyter notebook is to investigate the controls
comprised in the subsets "Dharmacon pooled Genome 1" ("Experiment" value
"VACCINIA-GP-G1") and "Dharmacon pooled Genome 2" ("Experiment" value
"VACCINIA-GP-G2").

To be more precise, it is checked which controls are present in the
individual assay plates (note the distinction between chequerboard
plates and assay plates/screening plates: the former are characterised
by the fact of exclusively containing controls, whereas the latter
harbour both controls and siRNA interrogations).

Subsequently, the variability of the controls across the individual
plates is determined in two ways: The first way involves using the raw
intensity values. For each plate, control and column, the mean intensity
as well as the standard deviation are computed. Based on ...
"""

'\nThe purpose of this Jupyter notebook is to investigate the controls\ncomprised in the subsets "Dharmacon pooled Genome 1" ("Experiment" value\n"VACCINIA-GP-G1") and "Dharmacon pooled Genome 2" ("Experiment" value\n"VACCINIA-GP-G2").\n\nTo be more precise, it is checked which controls are present in the\nindividual assay plates (note the distinction between chequerboard\nplates and assay plates/screening plates: the former are characterised\nby the fact of exclusively containing controls, whereas the latter\nharbour both controls and siRNA interrogations).\n\nSubsequently, the variability of the controls across the individual\nplates is determined in two ways: The first way involves using the raw\nintensity values. For each plate, control and column, the mean intensity\nas well as the standard deviation are computed. Based on ...\n'

In [2]:
import numpy as np
import pandas as pd

In [3]:
path_to_VACV_report = (
    "/Users/jacobanter/Documents/Code/VACV_screen/VacciniaReport_"
    "20170223-0958_ZScored_conc_and_NaN_adjusted.csv"
)

VACV_report_df = pd.read_csv(
    path_to_VACV_report,
    sep="\t"
)

  VACV_report_df = pd.read_csv(


In [4]:
# Filter out the two Dharmacon pooled subsets (note that DP-G1 and DP-G2
# represent technical replicates/duplicates, i.e. DP-G1 and DP-G2
# represent the first and second experiment, respectively)
# Only screening plates are supposed to be considered
DP_G1_df = VACV_report_df[
    (VACV_report_df["Experiment"] == "VACCINIA-DP-G1")
    &
    (VACV_report_df["PLATE_TYPE"] == "ScreeningPlate")
]

DP_G2_df = VACV_report_df[
    (VACV_report_df["Experiment"] == "VACCINIA-DP-G2")
    &
    (VACV_report_df["PLATE_TYPE"] == "ScreeningPlate")
]

In [5]:
# Verify that these two subsets encompass exclusively screening plates
print(np.unique(DP_G1_df["PLATE_TYPE"]))
print(np.unique(DP_G2_df["PLATE_TYPE"]))

['ScreeningPlate']
['ScreeningPlate']


In [6]:
# Now, determine the individual plates comprised in each subset
plates_DP_G1 = np.unique(DP_G1_df["Barcode"])
plates_DP_G2 = np.unique(DP_G2_df["Barcode"])

print(
    "Amount of plates comprised in each subset:\n"
    f"DP-G1: {len(plates_DP_G1)}\n"
    f"DP-G2: {len(plates_DP_G2)}"
)

Amount of plates comprised in each subset:
DP-G1: 57
DP-G2: 57


In [7]:
# As DP-G1 and DP-G2 represent technical replicates/duplicates, it is
# conceivable that they encompass the same plates; this is checked
# Set operations are utilised for this purpose
plates_DP_G1_set = set(plates_DP_G1)
plates_DP_G2_set = set(plates_DP_G2)

plates_intersection = plates_DP_G1_set & plates_DP_G2_set

if (
    (len(plates_intersection) == len(plates_DP_G1))
    and
    (len(plates_intersection) == len(plates_DP_G2))
):
    print("The two replicates share the same plates!")
elif len(plates_intersection) > 0:
    print(
        "The two replicates share some, but not all plates!"
    )
else:
    print(
        "The two replicates do not share any plates!"
    )

The two replicates do not share any plates!


In [8]:
# Now, determine the controls contained in each plate
controls_per_plate_DP_G1 = []
controls_per_plate_DP_G2 = []

n_controls_per_plate_DP_G1 = []
n_controls_per_plate_DP_G2 = []

for plate_id_G1, plate_id_G2 in zip(plates_DP_G1, plates_DP_G2):
    subset_G1_df = DP_G1_df[
        (DP_G1_df["Barcode"] == plate_id_G1)
        &
        (DP_G1_df["WellType"] == "CONTROL")
    ]

    controls_per_plate_DP_G1.append(
        np.unique(subset_G1_df["Name"])
    )

    n_controls_G1 = len(np.unique(subset_G1_df["Name"]))
    n_controls_per_plate_DP_G1.append(n_controls_G1)

    subset_G2_df = DP_G2_df[
        (DP_G2_df["Barcode"] == plate_id_G2)
        &
        (DP_G2_df["WellType"] == "CONTROL")
    ]

    controls_per_plate_DP_G2.append(
        np.unique(subset_G2_df["Name"])
    )

    n_controls_G2 = len(np.unique(subset_G2_df["Name"]))
    n_controls_per_plate_DP_G2.append(n_controls_G2)

In [9]:
assert (
    np.array(n_controls_per_plate_DP_G1)
    ==
    np.array(n_controls_per_plate_DP_G2)
).all(), (
    "The amounts of controls per plate do not match between the "
    "duplicates!"
)

print(n_controls_per_plate_DP_G1)
print(n_controls_per_plate_DP_G2)

[12, 12, 12, 13, 13, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13]
[12, 12, 12, 13, 13, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13]


In [10]:
# In addition to a numerical check, the precise identity of the controls
# is also determined
# Bear in mind that when comparisong two ragged arrays via `==`,
# comparison of the arrays as a whole rather than element-wise
# comparison is performed, returning a single boolean value instead of
# the usual boolean array
# Therefore, element-wise comparison has to be accomplished e.g. by
# means of list comprehensions
assert np.array([
    element_G1 == element_G2
    for plate_list_G1, plate_list_G2 in
    zip(controls_per_plate_DP_G1, controls_per_plate_DP_G2)
    for element_G1, element_G2 in zip(plate_list_G1, plate_list_G2)
]).all(), (
    "The two replicates do not have all or no controls at all in "
    "common!"
)

In [11]:
# Astonishingly, there is a complete match regarding the controls for
# each and every plate
# The "core controls", i.e. controls shared by all plates are determined
control_sets = [
    set(control_per_plate) for control_per_plate in (
        controls_per_plate_DP_G1 + controls_per_plate_DP_G2
    )
]

assert len(control_sets) == (
    len(controls_per_plate_DP_G1)
    +
    len(controls_per_plate_DP_G2)
), "Something went wrong during the generation of the sets!"

core_controls_set = set.intersection(*control_sets)

print(
    f"The \"core controls\" encompass {len(core_controls_set)} controls."
)

The "core controls" encompass 12 controls.


In [12]:
print(
    "The precise identity of the \"core controls\" is as follows:"
)

for control in core_controls_set:
    print(control)

The precise identity of the "core controls" is as follows:
RAC1
ARPC3
MOCK
PSMC3
PAK1
SCRAMBLED
CDC42
ATP6V1A
GFP Duplex III
PSMA6
TSG101
KIF11


In [13]:
# For each of the "core controls", the mean intensity as well as the
# standard deviation are computed on a per-plate as well as on a
# per-column basis
# Also bear in mind that duplicates are available (Genome 1 and Genome
# 2)
# To this end, the columns containing the raw intensities are determined
early_raw_int_cols = [
    column for column in VACV_report_df.columns
    if ("dIntensity" in column) and ("Late" not in column)
    and ("nZScore" not in column)
]
late_raw_int_cols = [
    column for column in VACV_report_df.columns
    if ("dIntensity" in column) and ("Late" in column)
    and ("nZScore" not in column)
]

In [14]:
# Create a Pandas DataFrame to populate with the computed values
# The DataFrame is supposed to encompass the following information for
# each control on a per-replicate, per-plate and per-column basis:
# mean intensity and standard deviation
# To this end, a MultiIndex is created
indices = [
    ["DP-G1"] * len(core_controls_set) * len(plates_DP_G1)
    +
    ["DP-G2"] * len(core_controls_set) * len(plates_DP_G2),
    [
        plate_id
        for plate_id in plates_DP_G1
        for _ in range(len(core_controls_set))
    ]
    +
    [
        plate_id
        for plate_id in plates_DP_G2
        for _ in range(len(core_controls_set))
    ],
    (list(core_controls_set) * len(plates_DP_G1)
    +
    list(core_controls_set) * len(plates_DP_G2))
]

# Verify that all sublists encompass the same amount of elements
assert len(set(list(map(len, indices)))) == 1, (
    "The indices for the MultiIndex have not been constructed "
    "correctly!"
)

multi_index = pd.MultiIndex.from_arrays(
    indices,
    names=["Replicate", "Plate_ID", "Control_name"]
)

df_col_names = [
    "_".join([col_name, suffix])
    for col_name in (early_raw_int_cols + late_raw_int_cols)
    for suffix in ["mean", "std"]
]

raw_control_vals_df = pd.DataFrame(
    columns=df_col_names,
    index=multi_index
)

In [15]:
for core_control in core_controls_set:
    for plate_id in plates_DP_G1:
        for i, col_name in enumerate(early_raw_int_cols + late_raw_int_cols):
            current_subset = DP_G1_df.loc[
                (DP_G1_df["Barcode"] == plate_id)
                &
                (DP_G1_df["Name"] == core_control),
                col_name
            ]

            current_mean = np.nanmean(current_subset)
            current_std = np.nanstd(current_subset)

            raw_control_vals_df.loc[
                ("DP-G1", plate_id, core_control),
                df_col_names[2 * i]
            ] = current_mean
            raw_control_vals_df.loc[
                ("DP-G1", plate_id, core_control),
                df_col_names[2 * i + 1]
            ] = current_std

    for plate_id in plates_DP_G2:
        for i, col_name in enumerate(early_raw_int_cols + late_raw_int_cols):
            current_subset = DP_G2_df.loc[
                (DP_G2_df["Barcode"] == plate_id)
                &
                (DP_G2_df["Name"] == core_control),
                col_name
            ]

            current_mean = np.nanmean(current_subset)
            current_std = np.nanstd(current_subset)

            raw_control_vals_df.loc[
                ("DP-G2", plate_id, core_control),
                df_col_names[2 * i]
            ] = current_mean
            raw_control_vals_df.loc[
                ("DP-G2", plate_id, core_control),
                df_col_names[2 * i + 1]
            ] = current_std

In [16]:
# Verify that the DataFrame does not contain any `NaN` values
assert np.count_nonzero(
    np.isnan(raw_control_vals_df.to_numpy(dtype=np.float64))
) == 0, (
    "The DataFrame contains `NaN` entries!"
)

In [17]:
raw_control_vals_df.to_csv(
    "mean_and_std_of_intensity_per_col_DP-G1_and_DP-G2_raw_vals.tsv",
    sep="\t",
    header=True,
    index=True
)

In [18]:
# Compute the variability, i.e. the standard deviation across the plates
# for each control
# Additionally, the mean of the mean intensities across all plates is
# computed for each control
# Both the mean and the variability are computed for each control on a
# per-column basis
# Again, the results are kept track of in a DataFrame
# While the columns of the DataFrame are the different early and late
# intensities, the row labels (indices) are the different controls
plate_variability_raw_vals_df = pd.DataFrame(
    columns=df_col_names,
    index=list(core_controls_set)
)

In [19]:
for control in core_controls_set:
    for early_col in early_raw_int_cols:
        # Gather the mean raw intensities for the current column across
        # all plates and replicates
        mean_raw_ints_across_plates = raw_control_vals_df.loc[
            (slice(None), slice(None), control),
            "_".join([early_col, "mean"])
        ]

        mean_raw_ints_mean = np.nanmean(
            mean_raw_ints_across_plates.to_numpy()
        )
        plate_variability_raw_vals_df.loc[
            control, early_col + "_mean"
        ] = mean_raw_ints_mean

        mean_raw_ints_std = np.nanstd(
            mean_raw_ints_across_plates.to_numpy()
        )
        plate_variability_raw_vals_df.loc[
            control, early_col + "_std"
        ] = mean_raw_ints_std

    for late_col in late_raw_int_cols:
        mean_raw_ints_across_plates = raw_control_vals_df.loc[
            (slice(None), slice(None), control),
            "_".join([late_col, "mean"])
        ]

        mean_raw_ints_mean = np.nanmean(
            mean_raw_ints_across_plates.to_numpy()
        )
        plate_variability_raw_vals_df.loc[
            control, late_col + "_mean"
        ] = mean_raw_ints_mean

        mean_raw_ints_std = np.nanstd(
            mean_raw_ints_across_plates.to_numpy()
        )
        plate_variability_raw_vals_df.loc[
            control, late_col + "_std"
        ] = mean_raw_ints_std

In [20]:
plate_variability_raw_vals_df.to_csv(
    "mean_and_std_across_plates_of_controls_DP-G1_and_DP-G2_raw_"\
    "intensity_vals.tsv",
    sep="\t",
    header=True,
    index=True
)

In [21]:
# Now, the variability of the controls across plates is computed based
# on Z-scores
# To this end, the Z-scores have to be computed in an initial step
# Computation of Z-scores is performed on a per-plate basis as well as
# on a per-column basis
# Compute the mean as well as the standard deviation
early_mean_and_std_per_plate_and_per_col_G1 = []
late_mean_and_std_per_plate_and_per_col_G1 = []

early_mean_and_std_per_plate_and_per_col_G2 = []
late_mean_and_std_per_plate_and_per_col_G2 = []


for plate_id in plates_DP_G1:
    current_plate_early_vals = []
    current_plate_late_vals = []

    for early_col in early_raw_int_cols:
        current_subset = DP_G1_df.loc[
            DP_G1_df["Barcode"] == plate_id,
            early_col
        ]
        
        current_early_mean = np.nanmean(current_subset)
        current_early_std = np.nanstd(current_subset)

        current_plate_early_vals.append(
            (current_early_mean, current_early_std)
        )
    
    early_mean_and_std_per_plate_and_per_col_G1.append(
        current_plate_early_vals
    )

    for late_col in late_raw_int_cols:
        current_subset = DP_G1_df.loc[
            DP_G1_df["Barcode"] == plate_id,
            late_col
        ]

        current_late_mean = np.nanmean(current_subset)
        current_late_std = np.nanstd(current_subset)

        current_plate_late_vals.append(
            (current_late_mean, current_late_std)
        )
    
    late_mean_and_std_per_plate_and_per_col_G1.append(
        current_plate_late_vals
    )


for plate_id in plates_DP_G2:
    current_plate_early_vals = []
    current_plate_late_vals = []

    for early_col in early_raw_int_cols:
        current_subset = DP_G2_df.loc[
            DP_G2_df["Barcode"] == plate_id,
            early_col
        ]

        current_early_mean = np.nanmean(current_subset)
        current_early_std = np.nanstd(current_subset)

        current_plate_early_vals.append(
            (current_early_mean, current_early_std)
        )

    early_mean_and_std_per_plate_and_per_col_G2.append(
        current_plate_early_vals
    )

    for late_col in late_raw_int_cols:
        current_subset = DP_G2_df.loc[
            DP_G2_df["Barcode"] == plate_id,
            late_col
        ]

        current_late_mean = np.nanmean(current_subset)
        current_late_std = np.nanstd(current_subset)

        current_plate_late_vals.append(
            (current_late_mean, current_late_std)
        )
    
    late_mean_and_std_per_plate_and_per_col_G2.append(
        current_plate_late_vals
    )

In [22]:
# Perform a couple of sanity checks

# The length of the top-level lists is supposed to equal the respective
# amount of plates
assert (
    (
        len(early_mean_and_std_per_plate_and_per_col_G1)
        ==
        len(plates_DP_G1)
    )
    and
    (
        len(late_mean_and_std_per_plate_and_per_col_G1)
        ==
        len(plates_DP_G1)
    )
    and
    (
        len(early_mean_and_std_per_plate_and_per_col_G2)
        ==
        len(plates_DP_G2)
    )
    and
    (
        len(late_mean_and_std_per_plate_and_per_col_G2)
        ==
        len(plates_DP_G2)
    )
), "The top-level lists do not have the expected lengths!"

# The length of the sublists is supposed to equal the amount of early
# and late columns, respectively
assert (
    all([
        len(sublist) == len(early_raw_int_cols)
        for sublist in early_mean_and_std_per_plate_and_per_col_G1
    ])
    and
    all([
        len(sublist) == len(late_raw_int_cols)
        for sublist in late_mean_and_std_per_plate_and_per_col_G1
    ])
    and
    all([
        len(sublist) == len(early_raw_int_cols)
        for sublist in early_mean_and_std_per_plate_and_per_col_G2
    ])
    and
    all([
        len(sublist) == len(late_raw_int_cols)
        for sublist in late_mean_and_std_per_plate_and_per_col_G2
    ])
), "The sublists do not have the expected length!"

In [23]:
# Now that the mean and standard deviation values have been computed and
# the sanity checks have successfully been passed, the actual Z-scoring
# is performed via vectorised operations

# However, prior to Z-scoring, the names of columns harbouring Z-scored
# values must be determined
early_Z_scored_int_cols = [
    column for column in VACV_report_df.columns
    if ("dIntensity" in column) and ("Late" not in column)
    and ("nZScore" in column)
]

late_Z_scored_int_cols = [
    column for column in VACV_report_df.columns
    if ("dIntensity" in column) and ("Late" in column)
    and ("nZScore" in column)
]

In [24]:
# Z-scoring for the first replicate
for plate_id, early_vals_per_col, late_vals_per_col in zip(
    plates_DP_G1,
    early_mean_and_std_per_plate_and_per_col_G1,
    late_mean_and_std_per_plate_and_per_col_G1
):
    for early_val_pair, early_raw_col, early_Z_scored_col in zip(
        early_vals_per_col, early_raw_int_cols, early_Z_scored_int_cols
    ):
        # Bear in mind that in order to avoid label alignment issues,
        # the right-hand side of the assignment statement should be
        # converted to a NumPy array
        DP_G1_df.loc[
            DP_G1_df["Barcode"] == plate_id,
            early_Z_scored_col
        ] = (
            (DP_G1_df.loc[
                DP_G1_df["Barcode"] == plate_id,
                early_raw_col
            ] - early_val_pair[0])
            /
            early_val_pair[1]
        ).to_numpy()

    for late_val_pair, late_raw_col, late_Z_scored_col in zip(
        late_vals_per_col, late_raw_int_cols, late_Z_scored_int_cols
    ):
        DP_G1_df.loc[
            DP_G1_df["Barcode"] == plate_id,
            late_Z_scored_col
        ] = (
            (DP_G1_df.loc[
                DP_G1_df["Barcode"] == plate_id,
                late_raw_col
            ] - late_val_pair[0])
            /
            late_val_pair[1]
        ).to_numpy()

# Z-scoring for the second replicate
for plate_id, early_vals_per_col, late_vals_per_col in zip(
    plates_DP_G2,
    early_mean_and_std_per_plate_and_per_col_G2,
    late_mean_and_std_per_plate_and_per_col_G2
):
    for early_val_pair, early_raw_col, early_Z_scored_col in zip(
        early_vals_per_col, early_raw_int_cols, early_Z_scored_int_cols
    ):
        DP_G2_df.loc[
            DP_G2_df["Barcode"] == plate_id,
            early_Z_scored_col
        ] = (
            (DP_G2_df.loc[
                DP_G2_df["Barcode"] == plate_id,
                early_raw_col
            ] - early_val_pair[0])
            /
            early_val_pair[1]
        ).to_numpy()

    for late_val_pair, late_raw_col, late_Z_scored_col in zip(
        late_vals_per_col, late_raw_int_cols, late_Z_scored_int_cols
    ):
        DP_G2_df.loc[
            DP_G2_df["Barcode"] == plate_id,
            late_Z_scored_col
        ] = (
            (DP_G2_df.loc[
                DP_G2_df["Barcode"] == plate_id,
                late_raw_col
            ] - late_val_pair[0])
            /
            late_val_pair[1]
        ).to_numpy()

In [25]:
# Now, the procedure is analogous to that of the raw intensity values,
# i.e. a DataFrame is created storing for each control the mean
# intensity as well as the standard deviation on a per-plate basis
df_Z_scored_col_names = [
    "_".join([col_name, suffix])
    for col_name in early_Z_scored_int_cols + late_Z_scored_int_cols
    for suffix in ["mean", "std"]
]

Z_scored_control_vals_df = pd.DataFrame(
    columns=df_Z_scored_col_names,
    index=multi_index
)

for core_control in core_controls_set:
    for plate_id in plates_DP_G1:
        for i, col_name in enumerate(
            early_Z_scored_int_cols + late_Z_scored_int_cols
        ):
            current_subset = DP_G1_df.loc[
                (DP_G1_df["Barcode"] == plate_id)
                &
                (DP_G1_df["Name"] == core_control),
                col_name
            ]

            current_mean = np.nanmean(current_subset)
            current_std = np.nanstd(current_subset)

            Z_scored_control_vals_df.loc[
                ("DP-G1", plate_id, core_control),
                df_Z_scored_col_names[2 * i]
            ] = current_mean
            Z_scored_control_vals_df.loc[
                ("DP-G1", plate_id, core_control),
                df_Z_scored_col_names[2 * i + 1]
            ] = current_std

    for plate_id in plates_DP_G2:
        for i, col_name in enumerate(
            early_Z_scored_int_cols + late_Z_scored_int_cols
        ):
            current_subset = DP_G2_df.loc[
                (DP_G2_df["Barcode"] == plate_id)
                &
                (DP_G2_df["Name"] == core_control),
                col_name
            ]

            current_mean = np.nanmean(current_subset)
            current_std = np.nanstd(current_subset)

            Z_scored_control_vals_df.loc[
                ("DP-G2", plate_id, core_control),
                df_Z_scored_col_names[2 * i]
            ] = current_mean
            Z_scored_control_vals_df.loc[
                ("DP-G2", plate_id, core_control),
                df_Z_scored_col_names[2 * i + 1]
            ] = current_std

In [26]:
# Verify that the DataFrame does not contain any `NaN` values
assert np.count_nonzero(
    np.isnan(Z_scored_control_vals_df.to_numpy(dtype=np.float64))
) == 0, "The DataFrame contains `NaN` entries!"

In [27]:
Z_scored_control_vals_df.to_csv(
    "mean_and_std_of_intensity_per_col_DP-G1_and_DP-G2_Z_scored_vals.tsv",
    sep="\t",
    header=True,
    index=True
)

In [28]:
# Compute the variability, i.e. the standard deviation across the plates
# for each control
plate_variability_Z_scored_vals_df = pd.DataFrame(
    columns=df_Z_scored_col_names,
    index=list(core_controls_set)
)

In [29]:
for control in core_controls_set:
    for column in early_Z_scored_int_cols + late_Z_scored_int_cols:
        # Gather the mean Z-scored intensities for the current column
        # across all plates and replicates
        mean_Z_scored_ints_across_plates = Z_scored_control_vals_df.loc[
            (slice(None), slice(None), control),
            "_".join([column, "mean"])
        ]

        mean_Z_scored_ints_mean = np.nanmean(
            mean_Z_scored_ints_across_plates.to_numpy()
        )
        plate_variability_Z_scored_vals_df.loc[
            control, column + "_mean"
        ] = mean_Z_scored_ints_mean

        mean_Z_scored_ints_std = np.nanstd(
            mean_Z_scored_ints_across_plates.to_numpy()
        )
        plate_variability_Z_scored_vals_df.loc[
            control, column + "_std"
        ] = mean_Z_scored_ints_std

In [30]:
plate_variability_Z_scored_vals_df.to_csv(
    "mean_and_std_across_plates_of_controls_DP-G1_and_DP-G2_Z_scored_"\
    "intensity_vals.tsv",
    sep="\t",
    header=True,
    index=True
)

In [31]:
# As a final step, the variability based on raw intensity values is
# compared to the variability based on Z-scored intensity values
# This is done for each core control
# To be more precise, a dictionary is created in order to keep track of
# the columns with the lowest variability for each control
# Thus, the dictionary keys are the core controls, whereas the
# dictionary values are tuples containing the column names with the
# lowest early and late variability, respectively
lowest_var_dict = {}

for control in core_controls_set:
    # Determine the column with the lowest variability for early
    # intensities
    # Bear in mind that the comparison has to be performed for both raw
    # and Z-scored values
    raw_early_variability_per_col = plate_variability_raw_vals_df.loc[
        control, [col + "_mean" for col in early_raw_int_cols]
    ]
    Z_scored_early_variability_per_col = plate_variability_Z_scored_vals_df.loc[
        control, [col + "_mean" for col in early_Z_scored_int_cols]
    ]

    if (
        raw_early_variability_per_col.min()
        <
        Z_scored_early_variability_per_col.min()
    ):
        min_early_col = early_raw_int_cols[
            np.argmin(raw_early_variability_per_col)
        ]
    else:
        min_early_col = early_Z_scored_int_cols[
            np.argmin(Z_scored_early_variability_per_col)
        ]

    # Determine the column with the lowest variability for late
    # intensities
    raw_late_variability_per_col = plate_variability_raw_vals_df.loc[
        control, [col + "_mean" for col in late_raw_int_cols]
    ]
    Z_scored_late_variability_per_col = plate_variability_Z_scored_vals_df.loc[
        control, [col + "_mean" for col in late_Z_scored_int_cols]
    ]

    if (
        raw_late_variability_per_col.min()
        <
        Z_scored_late_variability_per_col.min()
    ):
        min_late_col = late_raw_int_cols[
            np.argmin(raw_late_variability_per_col)
        ]
    else:
        min_late_col = late_Z_scored_int_cols[
            np.argmin(Z_scored_late_variability_per_col)
        ]
    
    lowest_var_dict[control] = (min_early_col, min_late_col)

In [32]:
# Create yet another dictionary to count the occurrences of the
# individual columns
col_counts_dict = {
    "early": {},
    "late": {}
}

for _, (lowest_early_col, lowest_late_col) in lowest_var_dict.items():
    if lowest_early_col in col_counts_dict["early"].keys():
        col_counts_dict["early"][lowest_early_col] += 1
    else:
        col_counts_dict["early"][lowest_early_col] = 1

    if lowest_late_col in col_counts_dict["late"].keys():
        col_counts_dict["late"][lowest_late_col] += 1
    else:
        col_counts_dict["late"][lowest_late_col] = 1

In [33]:
# Determine the column with the highest frequency for early and late
# respectively
for time in ["early", "late"]:
    columns = list(col_counts_dict[time].keys())
    frequencies = list(col_counts_dict[time].values())

    most_frequent_col = columns[np.argmax(frequencies)]

    print(
        "The most frequent column with the lowest variability in the\n"
        f"case of {time} intensities is {most_frequent_col}."
    )

The most frequent column with the lowest variability in the
case of early intensities is dIntensity_cPathogen_eMean_oCells.
The most frequent column with the lowest variability in the
case of late intensities is dIntensity_cLatePathogen_eMean_oCells.


In [15]:
# Out of curiosity, the maximum as well as the minimum raw intensity
# values are determined for each early and late category (nucleus,
# perinucleus, whole cell and Voronoi) within the DP-G1 and DP-G2
# subsets

# Minimum and maximum for DP-G1
print("Minimum and maximum raw intensities for DP-G1:\n")
for early_raw_col in early_raw_int_cols:
    min_val = DP_G1_df[early_raw_col].min()
    max_val = DP_G1_df[early_raw_col].max()

    print(
        f"{early_raw_col}: minimum: {min_val}, maximum: {max_val}"
    )
print()

# Minimum and maximum for DP-G2
print("Minimum and maximum raw intensities for DP-G2:\n")
for late_raw_col in late_raw_int_cols:
    min_val = DP_G2_df[late_raw_col].min()
    max_val = DP_G2_df[late_raw_col].max()

    print(
        f"{late_raw_col}: minimum: {min_val}, maximum: {max_val}"
    )

Minimum and maximum raw intensities for DP-G1:

dIntensity_cPathogen_eMean_oNuclei: minimum: 0.00896937, maximum: 0.22475199
dIntensity_cPathogen_eMean_oPeriNuclei: minimum: 0.00865132, maximum: 0.17134869
dIntensity_cPathogen_eMean_oCells: minimum: 0.00816141, maximum: 0.10740024
dIntensity_cPathogen_eMean_oVoronoiCells: minimum: 0.00861938, maximum: 0.14279419

Minimum and maximum raw intensities for DP-G2:

dIntensity_cLatePathogen_eMean_oNuclei: minimum: 0.01554131, maximum: 0.08299077
dIntensity_cLatePathogen_eMean_oPeriNuclei: minimum: 0.01479614, maximum: 0.0729166
dIntensity_cLatePathogen_eMean_oCells: minimum: 0.01163862, maximum: 0.05874576
dIntensity_cLatePathogen_eMean_oVoronoiCells: minimum: 0.01481506, maximum: 0.0693099


In [13]:
# Out of curiosity, it is investigated whether the individual plates
# also have the actual siRNA investigations (non-controls) in common
investigations_per_plate_DP_G1 = []
investigations_per_plate_DP_G2 = []

n_investigations_per_plate_DP_G1 = []
n_investigations_per_plate_DP_G2 = []

for plate_id_G1, plate_id_G2 in zip(plates_DP_G1, plates_DP_G2):
    subset_G1_df = DP_G1_df[
        (DP_G1_df["Barcode"] == plate_id_G1)
        &
        (DP_G1_df["WellType"] == "POOLED_SIRNA")
    ]

    investigations_per_plate_DP_G1.append(
        np.unique(subset_G1_df["Name"])
    )

    n_investigations_G1 = len(np.unique(subset_G1_df["Name"]))
    n_investigations_per_plate_DP_G1.append(n_investigations_G1)

    subset_G2_df = DP_G2_df[
        (DP_G2_df["Barcode"] == plate_id_G2)
        &
        (DP_G2_df["WellType"] == "POOLED_SIRNA")
    ]

    investigations_per_plate_DP_G2.append(
        np.unique(subset_G2_df["Name"])
    )

    n_investigations_G2 = len(np.unique(subset_G2_df["Name"]))
    n_investigations_per_plate_DP_G2.append(n_investigations_G2)

In [14]:
assert (
    np.array(n_investigations_G1)
    ==
    np.array(n_investigations_G2)
).all(), (
    "The amounts of investigations per plate do not match between the "
    "duplicates!"
)

print(n_controls_per_plate_DP_G1)
print(n_controls_per_plate_DP_G2)

[12, 12, 12, 13, 13, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13]
[12, 12, 12, 13, 13, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13]


In [None]:
# In addition to a numerical check, the precise identity if the
# investigations is also determined
# Bear in mind that when comparing two ragged arrays via `==`,
# comparison of the arrays as a whole rather than element-wise
# comparison is performed, returning a single boolean value instead of
# the usual boolean array
# Therefore, element-wise comparison has to be accomplished e.g. by
# means of list comprehensions
assert np.array([
    element_G1 == element_G2
    for plate_list_G1, plate_list_G2 in
    zip(investigations_per_plate_DP_G1, investigations_per_plate_DP_G2)
    for element_G1, element_G2 in zip(plate_list_G1, plate_list_G2)
]).all(), (
    "The two replicates do not have all or no investigations at all in "
    "common!"
)