In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import dask.dataframe as dd



In [2]:
def compute_Z_prime(pos_mean, pos_std, neg_mean, neg_std):
    """
    This function computes the Z' score for a positive and a negative
    control according to the formula provided in the publication
    "Plaque2.0—A High-Throughput Analysis Framework to Score Virus-Cell
    Transmission and Clonal Cell Expansion" by Yakimovich et al.

    Parameters
    ----------
    pos_mean: float
        Mean of the positive control.
    pos_std: float
        Standard deviation of the positive control.
    neg_mean: float
        Mean of the negative control.
    neg_std: float
        Standard deviation of the negative control.

    Returns
    -------
    Z_prime: float
        Z' score computed according to the abovementioned publication.
    """
    Z_prime = (
        1
        -
        (2 * pos_std + 2 * neg_std)
        /
        abs(pos_mean - neg_mean)
    )

    return Z_prime
    

In [3]:
# As a preliminary step, the Z' scores are computed for the control
# wells of the kinome subscreen
# Hence, the first step involves loading the respective subscreen
# Bear in mind that for certain columns, the data type has to be
# manually specified
dtype_dict = {
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87": str,
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB": str,
    "Gene_Description": str,
    "ID": str,
    "ID_OnTarget_Ensembl_GRCh38_release_87": str,
    "ID_OnTarget_Merge": str,
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB": str,
    "ID_OnTarget_RefSeq_20170215": str,
    "ID_manufacturer": str,
    "Name_alternatives": str,
    "PLATE_QUALITY_DESCRIPTION": str,
    "RefSeq_ID_OnTarget_RefSeq_20170215": str,
    "Seed_sequence_common": str,
    "WELL_QUALITY_DESCRIPTION": str,
    "siRNA_error": str,
    "siRNA_number": str,
    "Precursor_Name": str
}

main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    sep="\t",
    dtype=dtype_dict
)

In [4]:
# Now, extract the subscreen of interest, i.e. the control wells of the
# kinome subscreen
# It is also important to keep in mind that two different types of
# plates have been employed within the framework of the screen, which
# are "CheckerBoard" and "ScreeningPlate"
# The difference is that "CheckerBoard" plates serve the purpose of
# verifying the proper functioning of the hardware, whereas plates of
# the type "ScreeningPlate" have been used for biological tests in lieu
# of hardware tests, i.e. for the actual siRNA-mediated gene knockdowns
# To be more precise, plates of the type "ScreeningPlates" contain two
# types of wells: On the one hand, they harbour control wells, which, as
# opposed to the controls in the case of "CheckerBoard" plates, do not
# deal with the proper functioning of the hardware, but with the
# knockdown of target genes the effect of which is known; hence, this
# type of control is not about the hardware aspect, but the biology
# aspect
# On the other hand, plates of the type "ScreeningPlate" harbour
# treatment wells, i.e. in these wells, genes are knocked down the role
# of which in the pathway/process of interest is not known yet
# Plates of the type "ScreeningPlate" are selected for the purpose of
# Z' score computation
# Although reliable measurements have not been obtained from all wells,
# i.e. there are wells whose value for the feature "WELL_QUALITY_STATUS"
# is "BAD", unreliable measurements are included for now
# Note that due to operator precedence, i.e. "&" having precedence over
# equality checks, the equality checks must be surrounded by equality
# checks
kinome_control_screening_plates_df = main_csv_df.loc[
    (main_csv_df["GENESET"] == "Kinome")
    &
    (main_csv_df["WellType"] == "CONTROL")
    &
    (main_csv_df["PLATE_TYPE"] == "ScreeningPlate")
]

In [5]:
ID_list = (
    np.unique(kinome_control_screening_plates_df["ID_openBIS"]).tolist()
)

In [None]:
"""
Questions to ask Artur:
1.) The feature "ID_openBIS" distinguishes between "eGFP" on the one
hand and ordinary "GFP" on the other hand. I thought that early genes
are connected to eGFP and late genes to mCherry as fluorescent markers,
so why is ordinary GFP targeted in addition? And what is the difference
between eGFP and ordinary GFP?
2.) Artur gave two different explanations regarding the control type
"MOCK": On the one hand, he stated that the MOCK control involved the
addition of no viruses such that no intensity is expected. On the other
hand, he stated that the MOCK control consisted of infecting cells
without siRNA treatment. The mean intensities of early and late genes
are comparable to those attained by the SCRAMBLED control, thereby
confirming that the latter of the two explanations applies to the MOCK
control.
3.) Artur told me late gene intensity has been measured with another
fluorescence channel; what kind of implications does this have regarding
Z' score computation?
"""

In [6]:
# Now, extract the individual controls comprised in the kinome subscreen
# Controls for which their effect is known and hence for which
# expectations regarding their intensities exist are the following:
# ARAC is the abbreviation for cytosine arabinoside, it is a so-called
# antimetabolite which is incorporated into DNA during DNA replication
# in lieu of cytidine triphosphate; it inhibits DNA repair, thereby
# ultimately resulting in apoptosis; Vaccinia virus does not incorporate
# its genome into the host's genome as e.g. retroviruses do and does
# therefore not rely on host cell replication for the production and
# assembly of progeny constituents; nevertheless, the Wikipedia page
# about cytosine arabinoside claimes (without reference) that ARAC is
# able to inhibit Vaccinia virus replication; consequently, it might
# serve as a (weak) negative control
#
# ARPC3 is the abbreviation for Actin-related protein 2/3 complex
# subunit 3; as its name already suggests, the protein represents one of
# seven subunits of the Arp2/3 protein complex; the Vaccinia virus is
# known to interact with the actin cytoskeleton and inhibition of the
# Arp2/3 complex has been shown to limit vaccinia virus infection;
# hence, ARPC3 might serve as a negative control
#
# ATP6V1A: This gene encodes a subunit of the enzyme vascuolar ATPase,
# which effects the acidification of eukaryotic intracellular
# organelles; siRNA-mediated knockdown of ATP6V1A prevents virus entry,
# which makes insofar sense as both mature virions (MV) and
# extracellular virions (EV) of vaccinia viruses rely on the
# acidification of the macropinosome for fusion with the macropinosome
# membrane to take place
# Thus, it is sensible to employ ATP6V1A as negative control, i.e.
# control for an unsuccessful host infection
#
# CDC42 is the abbreviation for cell division control protein 42 homolog
# and, as its name already suggests, is involved in the regulation of
# the cell cycle; in contrast to other viruses, such as retroviruses,
# Vaccinia virus does not incorporate its genome into the host cell's
# genome and does therefore not rely on host cell replication for the
# poduction of progeny constituents; instead, replication exclusively
# takes place in the cytoplasm; hence, while knockdown of CDC42
# presumably does not inhibit Vaccinia virus replication altogether, it
# is conceivable that the knockdown at least impairs viral replication
# as less host cells are available
#
# CDH4 is the abbreviation for Cadherin-4; hence, it is a so-called
# cadherin, i.e. a cell adhesion molecule; cadherins are characterised
# by the fact that their activity depends on calcium, hence their name
# (the term cadherin is a blend word/portmanteau of "calcium-dependent
# adhesion"); ...
#
# CHX is the abbreviation for cyclohexamine; it inhibits translation, or
# more precisely translation elongation, in eukaryotic organisms; hence,
# even though infection of host cells with Vaccinia virus may be
# successful, production and assembly of virus progeny constituents does
# not take place or is at least severely impaired; it is therefore
# sensible to utilise CHX as negative control
#
# EGFP is the abbreviation for ...
#
# FRAP1 is the gene encoding the protein mTOR, i.e. "mammalian Target of
# Rapamycin"; a brief glance at the scientific literature reveals that
# mTOR is intertwined with the vaccinia virus life cycle in a multitude
# of ambiguous ways; however, as mTOR is a downstream target of PI3K/Akt
# signalling, it is sensible to assume that just as with ITGB1
# knockdown, knockdow of mTOR will impede virus entry; this assumption
# is corroborated by the mean intensities for both early and late genes,
# thereby justifying the usage of FRAP1 as a negative control
#
# GFP is the abbreviation for ...
#
# ITGB1 is the abbreviation for integrin beta 1, which is a cell surface
# receptor; integrin beta 1 is known to mediate vaccinia virus entry
# through activation of PI3K/Akt signalling
# (https://journals.asm.org/doi/full/10.1128/jvi.06860-11); hence,
# siRNA-mediated knowkdown of ITGB1 impedes vaccinia virus entry and it
# is sensible to employ ITGB1 as negative control
#
# PSMA6 is the abbreviation for Proteasome subunit alpha type-6, which ...
#
# PXN is the abbreviation for paxillin, the function of which is to
# adhere cells to the extracellular matrix; ...
#
# RAC1 is the abbreviation for Ras-related C3 botulinum toxin substrate
# 1, which ...
#
# SCRAMBLED: These wells involve transfection of cells with
# non-targeting siRNAs, i.e. siRNAs that do not effect any gene
# knockdown
# Consequently, infection of the cells with vaccinia virus is not
# impaired, and it is sensible to employ the SCRAMBLED control wells as
# positive control, i.e. control for a successful host infection

In [7]:
# In order to render the typing process less arduous, the extraction of
# the individual controls is not accomplished explicitly, but by looping
# through the list containing the IDs
individual_control_dfs = []

for ID in ID_list:
    individual_control_dfs.append(
        kinome_control_screening_plates_df.loc[
            kinome_control_screening_plates_df["ID_openBIS"] == ID
        ]
    )

In [8]:
# Extract the intensity values of the entire cell for both early and
# late genes
# Analogous to the extraction of the individua control data frames, the
# extraction of intensity values is not performed explicitly, but by
# looping
early_int_cells_list = []
late_int_cells_list = []

for control_df in individual_control_dfs:
    early_int_cells_list.append(
        control_df["dIntensity_cPathogen_eMean_oCells_nZScore"]
    )

    late_int_cells_list.append(
        control_df["dIntensity_cLatePathogen_eMean_oCells_nZScore"]
    )

In [9]:
# Now, compute for each and every control for both early and late genes
# the mean as well as the standard deviation
early_mean_list = []
early_std_list = []

for early_intensity_series in early_int_cells_list:
    # Bear in mind that in order to obtain actual values, computation of
    # Dask objects has to be triggered
    early_mean_list.append(np.mean(early_intensity_series).compute())
    early_std_list.append(np.std(early_intensity_series).compute())

late_mean_list = []
late_std_list = []

for late_intensity_series in late_int_cells_list:
    late_mean_list.append(np.mean(late_intensity_series).compute())
    late_std_list.append(np.std(late_intensity_series).compute())

In [16]:
# The mean intensities for early and late genes are visually inspected
# for all controls
# Determine the longest gene name in order to appropriately set the
# width of the gene name column
max_gene_name_len = max([len(name) for name in ID_list])
second_column_name = "Early mean:"
second_column_len = len(second_column_name)
third_column_name = "Late mean:"
third_column_len = len(third_column_name)

# Bear in mind that the default value of the "sep" argument of the print
# function is used, which is a single space character
# Hence, in order to determine the total amount of characters one row
# encompasses, two characters have to be added in order to take into
# account the transition from the first to the second column and the
# transition from the second to the third column, respectively
# Also keep in mind the two additional spaces incorporated between
# columns
total_row_len = (
    max_gene_name_len + 2 + 1 + second_column_len + 2 + 1 + third_column_len
)

print(
    "Gene:".ljust(max_gene_name_len + 2),
    second_column_name.ljust(second_column_len + 2),
    third_column_name
)

for i, control_name in enumerate(ID_list):
    current_control_early_mean = early_mean_list[i]
    current_control_late_mean = late_mean_list[i]
    print("-" * total_row_len)
    print(
        control_name.ljust(max_gene_name_len + 2),
        str(np.round(
            current_control_early_mean, 3
        )).ljust(second_column_len + 2),
        np.round(current_control_late_mean, 3)
    )

Gene:          Early mean:   Late mean:
---------------------------------------
ABI1           1.051         0.674
---------------------------------------
ALLSTARDEATH   -1.241        4.323
---------------------------------------
ARPC3          -0.502        -0.236
---------------------------------------
ATP6V1A        0.233         0.499
---------------------------------------
CDC42          0.714         1.388
---------------------------------------
CDH4           0.71          0.229
---------------------------------------
EGFP           -2.806        -0.2
---------------------------------------
FRAP1          -1.113        -0.29
---------------------------------------
GFP            -2.641        0.007
---------------------------------------
ITGB1          -0.784        -0.152
---------------------------------------
KIF11          0.504         3.049
---------------------------------------
MAP3K7         0.904         0.966
---------------------------------------
MET            0.83

In [19]:
# Now, a couple of Z' scores are computed
# The only two positive controls identified thus far are SCRAMBLED and
# MOCK
SCRAMBLED_index = ID_list.index("SCRAMBLED")
SCRAMBLED_early_mean = early_mean_list[SCRAMBLED_index]
SCRAMBLED_early_std = early_std_list[SCRAMBLED_index]
SCRAMBLED_late_mean = late_mean_list[SCRAMBLED_index]
SCRAMBLED_late_std = late_std_list[SCRAMBLED_index]

MOCK_index = ID_list.index("MOCK")
MOCK_early_mean = early_mean_list[MOCK_index]
MOCK_early_std = early_std_list[MOCK_index]
MOCK_late_mean = late_mean_list[MOCK_index]
MOCK_late_std = late_std_list[MOCK_index]

# Z' score for ARPC3 as negative control
ARPC3_index = ID_list.index("ARPC3")
ARPC3_early_mean = early_mean_list[ARPC3_index]
ARPC3_early_std = early_std_list[ARPC3_index]
ARPC3_late_mean = late_mean_list[ARPC3_index]
ARPC3_late_std = late_std_list[ARPC3_index]

Z_prime_early_ARPC3_SCRAMBLED = compute_Z_prime(
    SCRAMBLED_early_mean, SCRAMBLED_early_std,
    ARPC3_early_mean, ARPC3_early_std
)
Z_prime_late_ARPC3_SCRAMBLED = compute_Z_prime(
    SCRAMBLED_late_mean, SCRAMBLED_late_std,
    ARPC3_late_mean, SCRAMBLED_late_std
)

Z_prime_early_ARPC3_MOCK = compute_Z_prime(
    MOCK_early_mean, MOCK_early_std,
    ARPC3_early_mean, ARPC3_early_std
)
Z_prime_late_ARPC3_MOCK = compute_Z_prime(
    MOCK_late_mean, MOCK_late_std,
    ARPC3_late_mean, ARPC3_late_std
)

print(
    "Z' scores for the following combinations of positive and "
    "negative controls:\n"
    f"SCRAMBLED-ARPC3 early: {np.round(Z_prime_early_ARPC3_SCRAMBLED, 3)}\n"
    f"SCRAMBLED-ARPC3 late: {np.round(Z_prime_late_ARPC3_SCRAMBLED, 3)}\n"
    f"MOCK-ARPC3 early: {np.round(Z_prime_early_ARPC3_MOCK, 3)}\n"
    f"MOCK-ARPC3 late: {np.round(Z_prime_late_ARPC3_MOCK, 3)}"
)
print()

# Z' score for ATP6V1A as negative control
ATP6V1A_index = ID_list.index("ATP6V1A")
ATP6V1A_early_mean = early_mean_list[ATP6V1A_index]
ATP6V1A_early_std = early_std_list[ATP6V1A_index]
ATP6V1A_late_mean = late_mean_list[ATP6V1A_index]
ATP6V1A_late_std = late_std_list[ATP6V1A_index]

Z_prime_early_ATP6V1A_SCRAMBLED = compute_Z_prime(
    SCRAMBLED_early_mean, SCRAMBLED_early_std,
    ATP6V1A_early_mean, ATP6V1A_early_std
)
Z_prime_late_ATP6V1A_SCRAMBLED = compute_Z_prime(
    SCRAMBLED_late_mean, SCRAMBLED_late_std,
    ATP6V1A_late_mean, ATP6V1A_late_std
)

Z_prime_early_ATP6V1A_MOCK = compute_Z_prime(
    MOCK_early_mean, MOCK_early_std,
    ATP6V1A_early_mean, ATP6V1A_early_std
)
Z_prime_late_ATP6V1A_MOCK = compute_Z_prime(
    MOCK_late_mean, MOCK_late_std,
    ATP6V1A_late_mean, ATP6V1A_late_std
)

print(
    "Z' scores for the following combinations of positive and "
    "negative controls:\n"
    f"SCRAMBLED-ATP6V1A early: {np.round(Z_prime_early_ATP6V1A_SCRAMBLED, 3)}\n"
    f"SCRAMBLED-ATP6V1A late: {np.round(Z_prime_late_ATP6V1A_SCRAMBLED, 3)}\n"
    f"MOCK-ATP6V1A early: {np.round(Z_prime_early_ATP6V1A_MOCK, 3)}\n"
    f"MOCK-ATP6V1A late: {np.round(Z_prime_late_ATP6V1A_MOCK, 3)}"
)
print()

# Z' score for FRAP1 as negative control
FRAP1_index = ID_list.index("FRAP1")
FRAP1_early_mean = early_mean_list[FRAP1_index]
FRAP1_early_std = early_std_list[FRAP1_index]
FRAP1_late_mean = late_mean_list[FRAP1_index]
FRAP1_late_std = late_std_list[FRAP1_index]

Z_prime_early_FRAP1_SCRAMBLED = compute_Z_prime(
    SCRAMBLED_early_mean, SCRAMBLED_early_std,
    FRAP1_early_mean, FRAP1_early_std
)
Z_prime_late_FRAP1_SCRAMBLED = compute_Z_prime(
    SCRAMBLED_late_mean, SCRAMBLED_late_std,
    FRAP1_late_mean, FRAP1_late_std
)

Z_prime_early_FRAP1_MOCK = compute_Z_prime(
    MOCK_early_mean, MOCK_early_std,
    FRAP1_early_mean, FRAP1_early_std
)
Z_prime_late_FRAP1_MOCK = compute_Z_prime(
    MOCK_late_mean, MOCK_late_std,
    FRAP1_late_mean, FRAP1_late_std
)

print(
    "Z' scores for the following combinations of positive and "
    "negative controls:\n"
    f"SCRAMBLED-FRAP1 early: {np.round(Z_prime_early_FRAP1_SCRAMBLED, 3)}\n"
    f"SCRAMBLED-FRAP1 late: {np.round(Z_prime_late_FRAP1_SCRAMBLED, 3)}\n"
    f"MOCK-FRAP1 early: {np.round(Z_prime_early_FRAP1_MOCK, 3)}\n"
    f"MOCK-FRAP1 late: {np.round(Z_prime_late_FRAP1_MOCK, 3)}"
)
print()

# Z' score for ITGB1 as negative control
ITGB1_index = ID_list.index("ITGB1")
ITGB1_early_mean = early_mean_list[ITGB1_index]
ITGB1_early_std = early_std_list[ITGB1_index]
ITGB1_late_mean = late_mean_list[ITGB1_index]
ITGB1_late_std = late_std_list[ITGB1_index]

Z_prime_early_ITGB1_SCRAMBLED = compute_Z_prime(
    SCRAMBLED_early_mean, SCRAMBLED_early_std,
    ITGB1_early_mean, ITGB1_early_std
)
Z_prime_late_ITGB1_SCRAMBLED = compute_Z_prime(
    SCRAMBLED_late_mean, SCRAMBLED_late_std,
    ITGB1_late_mean, ITGB1_late_std
)

Z_prime_early_ITGB1_MOCK = compute_Z_prime(
    MOCK_early_mean, MOCK_early_std,
    ITGB1_early_mean, ITGB1_early_std
)
Z_prime_late_ITGB1_MOCK = compute_Z_prime(
    MOCK_late_mean, MOCK_late_std,
    ITGB1_late_mean, ITGB1_late_std
)

print(
    "Z' scores for the following combinations of positive and "
    "negative controls:\n"
    f"SCRAMBLED-ITGB1 early: {np.round(Z_prime_early_ITGB1_SCRAMBLED, 3)}\n"
    f"SCRAMBLED-ITGB1 late: {np.round(Z_prime_late_ITGB1_SCRAMBLED, 3)}\n"
    f"MOCK-ITGB1 early: {np.round(Z_prime_early_ITGB1_MOCK, 3)}\n"
    f"MOCK-ITGB1 late: {np.round(Z_prime_late_ITGB1_MOCK, 3)}"
)

Z' scores for the following combinations of positive and negative controls:
SCRAMBLED-ARPC3 early: -2.191
SCRAMBLED-ARPC3 late: -2.586
MOCK-ARPC3 early: -2.257
MOCK-ARPC3 late: -5.052

Z' scores for the following combinations of positive and negative controls:
SCRAMBLED-ATP6V1A early: -11.219
SCRAMBLED-ATP6V1A late: -12.981
MOCK-ATP6V1A early: -9.906
MOCK-ATP6V1A late: -112.396

Z' scores for the following combinations of positive and negative controls:
SCRAMBLED-FRAP1 early: -0.356
SCRAMBLED-FRAP1 late: -1.806
MOCK-FRAP1 early: -0.452
MOCK-FRAP1 late: -3.574

Z' scores for the following combinations of positive and negative controls:
SCRAMBLED-ITGB1 early: -0.895
SCRAMBLED-ITGB1 late: -2.304
MOCK-ITGB1 early: -0.995
MOCK-ITGB1 late: -4.718


In [2]:
# Load the CSV file
train_test_df = pd.read_csv("180828_train_test.csv")

# Retrieve rows bearing the name "SCRAMBLED", i.e. representing siRNA
# having no effect -> negative control
# Note that in the context of the manuscript, this data type is referred
# to as "average"
average_df = train_test_df.loc[train_test_df["Name"] == "SCRAMBLED"]

# Retrieve rows bearing the name "GFP Duplex III", i.e. representing the
# targeted knock-down of the GFP gene
# Consequently, "GFP Duplex III" represents the positive control
gfp_duplex_df = train_test_df.loc[train_test_df["Name"] == "GFP Duplex III"]

# Likewise, retrieve columns for the remaining categories, which are
# early and late genes low, early genes high and late genes low, early
# genes low and late genes high, early and late genes high, cell death
# and mock
class_2_df = train_test_df.loc[train_test_df["Class"] == 2]
class_3_df = train_test_df.loc[train_test_df["Class"] == 3]
class_4_df = train_test_df.loc[train_test_df["Class"] == 4]
# Note that due to operator precedence, i.e. "&" having precedence over
# equality checks, the equality checks must be surrounded by parentheses
class_5_df = train_test_df.loc[
    (train_test_df["Class"] == 5) & (train_test_df["Name"] != "Mock")
]
cell_death_df = train_test_df.loc[train_test_df["Class"] == 6]
mock_df = train_test_df.loc[train_test_df["Name"] == "Mock"]

In [4]:
# Compute the Z' score exemplarily for "GFP Duplex III" as positive
# control and "SCRAMBLED" as negative control
# As a preliminary step, the intensity values of the peri-nucleus are
# extracted for both early and late genes
# Additionally, for the sake of convenience, the extracted data series
# are converted into NumPy arrays
gfp_duplex_early_int_perinuc = gfp_duplex_df["dIntensity_cPathogen_eMean_oPeriNuclei_nZScore"].to_numpy()
gfp_duplex_late_int_perinuc = gfp_duplex_df["dIntensity_cLatePathogen_eMean_oPeriNuclei_nZScore"].to_numpy()

average_early_int_perinuc = average_df["dIntensity_cPathogen_eMean_oPeriNuclei_nZScore"].to_numpy()
average_late_int_perinuc = average_df["dIntensity_cLatePathogen_eMean_oPeriNuclei_nZScore"].to_numpy()

gfp_duplex_early_mean = np.mean(gfp_duplex_early_int_perinuc)
gfp_duplex_early_std = np.std(gfp_duplex_early_int_perinuc)

gfp_duplex_late_mean = np.mean(gfp_duplex_late_int_perinuc)
gfp_duplex_late_std = np.std(gfp_duplex_late_int_perinuc)

average_early_mean = np.mean(average_early_int_perinuc)
average_early_std = np.std(average_early_int_perinuc)

average_late_mean = np.mean(average_late_int_perinuc)
average_late_std = np.std(average_late_int_perinuc)

print(f"Positive control (GFP Duplex) early mean and std: {gfp_duplex_early_mean, gfp_duplex_early_std}")
print(f"Negative control (SCRAMBLED) early mean and std: {average_early_mean, average_early_std}")
print()
print(f"Positive control (GFP Duplex) late mean and std: {gfp_duplex_late_mean, gfp_duplex_late_std}")
print(f"Negative control (SCRAMBLED) late mean and std: {average_late_mean, average_late_std}")

Z_prime_early = compute_Z_prime(
    gfp_duplex_early_mean,
    gfp_duplex_early_std,
    average_early_mean,
    average_early_std
)
Z_prime_late = compute_Z_prime(
    gfp_duplex_late_mean,
    gfp_duplex_late_std,
    average_late_mean,
    average_late_std
)

print(Z_prime_early)
print(Z_prime_late)

Positive control (GFP Duplex) early mean and std: (0.07858209514814343, 0.024904375138366207)
Negative control (SCRAMBLED) early mean and std: (0.2693598656935352, 0.04430401822853731)

Positive control (GFP Duplex) late mean and std: (0.05958844995024221, 0.01686860452587924)
Negative control (SCRAMBLED) late mean and std: (0.06502387037258052, 0.014771989411226984)
0.274460612795171
-10.642372246706357


In [5]:
# For each category, extract the intensity values of the entire cell for
# both early and late genes
# For the sake of convenience, the extracted data series are converted
# into NumPy arrays
average_early_int_cell = average_df["dIntensity_cPathogen_eMean_oCells_nZScore"].to_numpy()
average_late_int_cell = average_df["dIntensity_cLatePathogen_eMean_oCells_nZScore"].to_numpy()

class_2_early_int_cell = class_2_df["dIntensity_cPathogen_eMean_oCells_nZScore"].to_numpy()
class_2_late_int_cell = class_2_df["dIntensity_cLatePathogen_eMean_oCells_nZScore"].to_numpy()

class_3_early_int_cell = class_3_df["dIntensity_cPathogen_eMean_oCells_nZScore"].to_numpy()
class_3_late_int_cell = class_3_df["dIntensity_cLatePathogen_eMean_oCells_nZScore"].to_numpy()

class_4_early_int_cell = class_4_df["dIntensity_cPathogen_eMean_oCells_nZScore"].to_numpy()
class_4_late_int_cell = class_4_df["dIntensity_cLatePathogen_eMean_oCells_nZScore"].to_numpy()

class_5_early_int_cell = class_5_df["dIntensity_cPathogen_eMean_oCells_nZScore"].to_numpy()
class_5_late_int_cell = class_5_df["dIntensity_cLatePathogen_eMean_oCells_nZScore"].to_numpy()

cell_death_early_int_cell = cell_death_df["dIntensity_cPathogen_eMean_oCells_nZScore"].to_numpy()
cell_death_late_int_cell = cell_death_df["dIntensity_cLatePathogen_eMean_oCells_nZScore"].to_numpy()

mock_early_int_cell = mock_df["dIntensity_cPathogen_eMean_oCells_nZScore"].to_numpy()
mock_late_int_cell = mock_df["dIntensity_cLatePathogen_eMean_oCells_nZScore"].to_numpy()

In [6]:
# Compute Z' score exemplarily for class 2 with average as negative
# control
class_2_early_mean = np.mean(class_2_early_int_cell)
class_2_early_std = np.std(class_2_early_int_cell)

class_2_late_mean = np.mean(class_2_late_int_cell)
class_2_late_std = np.std(class_2_late_int_cell)

average_early_mean = np.mean(average_early_int_cell)
average_early_std = np.std(average_early_int_cell)

average_late_mean = np.mean(average_late_int_cell)
average_late_std = np.std(average_late_int_cell)

Z_prime_early = (
    1
    -
    (3 * class_2_early_std + 3 * average_early_std)
    /
    abs(class_2_early_mean - average_early_mean)
)
Z_prime_late = (
    1
    -
    (3 * class_2_late_std + 3 * average_late_std)
    /
    abs(class_2_late_mean - average_late_mean)
)

print(f"Z' score class 2 early genes: {Z_prime_early}")
print(f"Z' score class 2 late genes: {Z_prime_late}")

Z' score class 2 early genes: -6.558581961073892
Z' score class 2 late genes: -12.057857737179544
