In [1]:
# The technical support of Thermo Fisher has information on whether
# their siRNAs exhibit off-target effects or whether they hit the
# alleged target at all
# Thus, an attempt is made to obtain this kind of information for all
# siRNAs from Thermo Fisher (Ambion has been acquired by Thermo Fisher)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Every column listed here is forced to the str dtype when the report
# is parsed; all remaining columns are left to pandas' type inference
dtype_dict = dict.fromkeys(
    (
        "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
        "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "Gene_Description",
        "ID",
        "ID_OnTarget_Ensembl_GRCh38_release_87",
        "ID_OnTarget_Merge",
        "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "ID_OnTarget_RefSeq_20170215",
        "ID_manufacturer",
        "Name_alternatives",
        "PLATE_QUALITY_DESCRIPTION",
        "RefSeq_ID_OnTarget_RefSeq_20170215",
        "Seed_sequence_common",
        "WELL_QUALITY_DESCRIPTION",
        "siRNA_error",
        "siRNA_number",
        "Precursor_Name",
    ),
    str,
)

# Despite its .csv extension, the report is parsed as tab-separated
siRNA_df = pd.read_csv(
    "VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA.csv",
    dtype=dtype_dict,
    sep="\t",
)

In [3]:
# Keep only the rows whose manufacturer is Ambion (Thermo Fisher)
ambion_subset = siRNA_df.loc[siRNA_df["Manufacturer"].eq("Ambion")]

# Distinct well types occurring in that subset
ambion_siRNA_types = ambion_subset.loc[:, "WellType"].unique()

print(
    "Unique siRNA types comprised in the Ambion (Thermo Fisher) subset: "
    + ", ".join(ambion_siRNA_types)
)

Unique siRNA types comprised in the Ambion (Thermo Fisher) subset: SIRNA


In [4]:
# Collect every distinct catalogue number of the Ambion subset
# According to the original author's note, the 10 catalogue numbers
# Thermo Fisher has already provided information about are not part of
# the DataFrame and therefore do not have to be filtered out, because
# only targets that are valid (i.e. not withdrawn by NCBI) and named
# are included in the DataFrame
ambion_unique_cat_numbers = ambion_subset.loc[:, "Catalog_number"].unique()

print(
    "Amount of unique catalogue numbers comprised in the Ambion subset: "
    + str(len(ambion_unique_cat_numbers))
)

Amount of unique catalogue numbers comprised in the Ambion subset: 5864


In [11]:
# Save the unique Ambion catalogue numbers to a text file, one catalogue
# number per line and without a trailing newline after the last one
# str.join() places the newline character only BETWEEN the elements, so
# no helper separator character and no subsequent split are required
# This also means that no character occurring in a catalogue number can
# be mistaken for a separator and get stripped from the output
ambion_cat_nums_str = "\n".join(ambion_unique_cat_numbers)

# The "with" context manager closes the file automatically, even in
# case of errors/exceptions
with open("ambion_catalogue_numbers.txt", "w") as f:
    f.write(ambion_cat_nums_str)

In [7]:
# Determine which vendors/manufacturers occur in the whole data set
vendors = siRNA_df.loc[:, "Manufacturer"].unique()

print("Unique siRNA/esiRNA vendors: " + ", ".join(vendors))

Unique siRNA/esiRNA vendors: Dharmacon, Qiagen, Ambion, Sigma


In [8]:
# As a next step, the customer service of Qiagen is contacted
# For this purpose, keep only the rows whose manufacturer is Qiagen
qiagen_subset = siRNA_df.loc[siRNA_df["Manufacturer"].eq("Qiagen")]

# Distinct well types occurring in that subset
qiagen_siRNA_types = qiagen_subset.loc[:, "WellType"].unique()

print(
    "Unique siRNA types comprised in the Qiagen subset: {}".format(
        ", ".join(qiagen_siRNA_types)
    )
)

Unique siRNA types comprised in the Qiagen subset: SIRNA


In [9]:
# Collect every distinct catalogue number of the Qiagen subset
qiagen_unique_cat_numbers = qiagen_subset.loc[:, "Catalog_number"].unique()

print(
    "Amount of unique catalogue numbers comprised in the Qiagen subset: "
    + str(len(qiagen_unique_cat_numbers))
)

Amount of unique catalogue numbers comprised in the Qiagen subset: 70301


In [13]:
# Check whether the string "Not available" occurs among the Qiagen
# catalogue numbers (presumably the placeholder used when an siRNA has
# no catalogue number -- TODO confirm against the data source)
print(f"{'Not available' in qiagen_unique_cat_numbers}")

False


In [16]:
# Save the unique Qiagen catalogue numbers to a text file, one catalogue
# number per line and without a trailing newline after the last one
# str.join() places the newline character only BETWEEN the elements, so
# no helper separator character and no subsequent split are required
# This also means that no character occurring in a catalogue number can
# be mistaken for a separator and get stripped from the output
qiagen_cat_nums_str = "\n".join(qiagen_unique_cat_numbers)

# The "with" context manager closes the file automatically, even in
# case of errors/exceptions
with open("qiagen_catalogue_numbers.txt", "w") as f:
    f.write(qiagen_cat_nums_str)

In [17]:
# Now, the customer service of Sigma is turned to
# TODO: a note was started at this point but never completed
# Keep only the rows whose manufacturer is Sigma
sigma_subset = siRNA_df.loc[siRNA_df["Manufacturer"].eq("Sigma")]

# Distinct well types occurring in that subset
sigma_siRNA_types = sigma_subset.loc[:, "WellType"].unique()

print(
    "Unique siRNA types comprised in the Sigma subset: {}".format(
        ", ".join(sigma_siRNA_types)
    )
)

Unique siRNA types comprised in the Sigma subset: ESIRNA


In [19]:
# Collect every distinct catalogue number of the Sigma subset
sigma_unique_cat_nums = sigma_subset.loc[:, "Catalog_number"].unique()

print(
    "Amount of unique catalogue numbers comprised in the Sigma subset: "
    + str(len(sigma_unique_cat_nums))
)

Amount of unique catalogue numbers comprised in the Sigma subset: 258


In [24]:
# Check whether the string "Not available" occurs among the Sigma
# catalogue numbers (presumably the placeholder used when an esiRNA has
# no catalogue number -- TODO confirm against the data source)
print(f"{'Not available' in sigma_unique_cat_nums}")

False


In [25]:
# Save the unique Sigma catalogue numbers to a text file, one catalogue
# number per line and without a trailing newline after the last one
# str.join() places the newline character only BETWEEN the elements, so
# no helper separator character and no subsequent split are required
# This also means that no character occurring in a catalogue number can
# be mistaken for a separator and get stripped from the output
sigma_cat_nums_str = "\n".join(sigma_unique_cat_nums)

# The "with" context manager closes the file automatically, even in
# case of errors/exceptions
with open("sigma_catalogue_numbers.txt", "w") as f:
    f.write(sigma_cat_nums_str)

In [27]:
# Finally, the catalogue numbers of Dharmacon are addressed
# Keep only the rows whose manufacturer is Dharmacon
dharmacon_subset = siRNA_df.loc[siRNA_df["Manufacturer"].eq("Dharmacon")]

# Distinct well types occurring in that subset
dharmacon_siRNA_types = dharmacon_subset.loc[:, "WellType"].unique()

print(
    "Unique siRNA types comprised in the Dharmacon subset: {}".format(
        ", ".join(dharmacon_siRNA_types)
    )
)

Unique siRNA types comprised in the Dharmacon subset: POOLED_SIRNA, SIRNA


In [28]:
# Collect every distinct catalogue number of the Dharmacon subset
dharmacon_unique_cat_nums = dharmacon_subset.loc[:, "Catalog_number"].unique()

print(
    "Amount of unique catalogue numbers comprised in the Dharmacon subset: "
    + str(len(dharmacon_unique_cat_nums))
)

Amount of unique catalogue numbers comprised in the Dharmacon subset: 20856


In [31]:
# Check whether the string "Not available" occurs among the Dharmacon
# catalogue numbers (presumably the placeholder used when an siRNA has
# no catalogue number -- TODO confirm against the data source)
print(f"{'Not available' in dharmacon_unique_cat_nums}")

False


In [32]:
# Save the unique Dharmacon catalogue numbers to a text file, one
# catalogue number per line and without a trailing newline after the
# last one
# Some Dharmacon catalogue numbers contain the underscore, so any
# approach relying on a helper separator character is fragile
# str.join() places the newline character only BETWEEN the elements and
# hence needs neither a helper separator nor a subsequent split, so no
# character of a catalogue number can get stripped from the output
dharmacon_cat_nums_str = "\n".join(dharmacon_unique_cat_nums)

# The "with" context manager closes the file automatically, even in
# case of errors/exceptions
with open("dharmacon_catalogue_numbers.txt", "w") as f:
    f.write(dharmacon_cat_nums_str)