In [1]:
import numpy as np
import dask.dataframe as dd
import mechanicalsoup



In [2]:
# Read the tab-separated screen report into a Dask DataFrame
# The columns enumerated below are explicitly read as strings, as their
# data type has to be specified manually
dtype_dict = dict.fromkeys(
    (
        "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
        "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "Gene_Description",
        "ID",
        "ID_OnTarget_Ensembl_GRCh38_release_87",
        "ID_OnTarget_Merge",
        "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "ID_OnTarget_RefSeq_20170215",
        "ID_manufacturer",
        "Name_alternatives",
        "PLATE_QUALITY_DESCRIPTION",
        "RefSeq_ID_OnTarget_RefSeq_20170215",
        "Seed_sequence_common",
        "WELL_QUALITY_DESCRIPTION",
        "siRNA_error",
        "siRNA_number",
        "Precursor_Name",
    ),
    str,
)

# Despite the ".csv" extension, the fields are separated by tabs
main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    dtype=dtype_dict,
    sep="\t",
)

In [6]:
# Determine the unique target genes knocked down via siRNA in this
# screen
# To this end, the feature "ID_openBIS" is employed
# The unique values are determined by Dask itself instead of handing the
# Dask Series to np.unique, which would first convert the entire column
# to a NumPy array
# Sorting reproduces the ordering np.unique would return
ID_list = sorted(main_csv_df["ID_openBIS"].unique().compute())

# The list is way too large to display it in the IDE, which is why it is
# saved to a file (one ID per line, without a trailing newline)
# Bear in mind that with statements are preferred for the purpose of
# working with files as they automatically take care of closing files,
# even in case of exceptions/errors
with open("unique_gene_IDs_entire_screen.txt", "w") as f:
    f.write("\n".join(ID_list))

In [7]:
# Repeat this procedure for the features "Name" and
# "PublicationLink_material"
def _save_unique_values(feature, file_name):
    """Write the sorted unique values of a feature to a text file.

    Parameters
    ----------
    feature : str
        Name of the column of ``main_csv_df`` to evaluate.
    file_name : str
        Path of the text file to (over)write. It receives one value per
        line, without a trailing newline.

    Returns
    -------
    list
        The sorted unique values of the column.
    """
    # Dask determines the unique values; sorting reproduces the ordering
    # np.unique would return
    unique_values = sorted(main_csv_df[feature].unique().compute())

    with open(file_name, "w") as f:
        f.write("\n".join(unique_values))

    return unique_values


name_list = _save_unique_values(
    "Name", "unique_gene_names_entire_screen.txt"
)

publink_mat_list = _save_unique_values(
    "PublicationLink_material",
    "unique_entries_PublicationLink_material_entire_screen.txt",
)

In [9]:
# The feature "ID_openBIS" contains the IDs with which the respective
# siRNAs can be looked up on the manufacturer's website
# The feature "WellType" indicates the type of agent used for the
# perturbation (e.g. SIRNA, CONTROL, COMPOUND, MIRNA_INHIBITOR, etc.)

In [3]:
# As a first step, only data is worked with originating from treatment
# with single siRNAs
# Single siRNAs have been obtained from three different vendors, namely
# Qiagen, Ambion and Dharmacon
# For each ID in the feature "ID_openBIS", the corresponding gene name
# is determined by looking it up on the respective manufacturer's
# website

# Rows of wells whose "WellType" is "SIRNA"; this mask is shared by the
# selections for all three vendors
is_single_siRNA = main_csv_df["WellType"] == "SIRNA"


def _single_siRNA_IDs(vendor_tag):
    """Select the "ID_openBIS" entries of one vendor's single siRNAs.

    Parameters
    ----------
    vendor_tag : str
        Literal substring identifying the vendor within the feature
        "ID_openBIS", e.g. "QIAGEN".

    Returns
    -------
    The "ID_openBIS" column of ``main_csv_df``, restricted to the rows
    whose "WellType" equals "SIRNA" and whose "ID_openBIS" contains
    ``vendor_tag``.
    """
    # The tag is matched as plain text rather than as a regular
    # expression, and rows without an ID count as non-matching, so that
    # the resulting mask is strictly Boolean
    from_vendor = main_csv_df["ID_openBIS"].str.contains(
        vendor_tag, regex=False, na=False
    )
    # Rows and column are selected in a single .loc call instead of
    # chaining two indexing operations
    return main_csv_df.loc[is_single_siRNA & from_vendor, "ID_openBIS"]


# Extract the IDs for single siRNAs from Qiagen, Ambion and Dharmacon
single_siRNA_Qiagen = _single_siRNA_IDs("QIAGEN")
single_siRNA_Ambion = _single_siRNA_IDs("AMBION")
single_siRNA_Dharmacon = _single_siRNA_IDs("DHARMACON")

In [4]:
# The Qiagen entries carry the manufacturer's name as a prefix
# Prefix and actual ID are separated by an underscore, so splitting each
# entry at underscores and keeping the second piece yields the actual ID
single_siRNA_Qiagen_IDs = list(
    map(
        lambda prefixed_ID: prefixed_ID.split("_")[1],
        single_siRNA_Qiagen
    )
)

# For Ambion and Dharmacon, the manufacturer's name is likewise
# prepended to the actual IDs; their processing is not implemented yet
pass

In [31]:
# Address at which the gene names for single siRNAs from Qiagen are
# looked up
Qiagen_url = "https://geneglobe.qiagen.com"

# The page is requested with a so-called headless browser, i.e. a web
# browser without GUI
# The default "user_agent" string might signalise to the server that the
# request does not stem from a real web browser, potentially causing the
# server to block access
# Hence, a user agent string is passed that pretends the request to be
# made by a web browser
browser = mechanicalsoup.StatefulBrowser(user_agent="Mozilla/5.0")

# Open the page and keep the "soup" attribute of the response for
# subsequent inspection
Qiagen_geneglobe_page = browser.open(Qiagen_url)
Qiagen_geneglobe_html = Qiagen_geneglobe_page.soup

In [27]:
# TODO: ... = Qiagen_geneglobe_html.select_one("")

In [28]:
# NOTE(review): called without arguments, print() only emits an empty
# line; the output recorded for this cell does not correspond to that,
# so this presumably is a leftover from debugging - confirm and remove
print()

None


In [32]:
# Print the complete content of Qiagen_geneglobe_html for manual
# inspection
# NOTE(review): this dumps the whole page into the cell output; consider
# printing only an excerpt once the inspection is done
print(Qiagen_geneglobe_html)

<!DOCTYPE html>
<html lang="en"><head>
<base href="/" id="baseHref"/>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<meta content="QIAGEN" name="author"/>
<meta content="ctat7fZ36roG8Mbl5UXfRfWu5gH7vpIgae2TmVCS85o" name="google-site-verification"/>
<meta content="to04i9ksp48smuby0b62dqaaophq0x" name="facebook-domain-verification"/>
<link href="/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="/favicon.ico" rel="icon"/>
<link href="https://fonts.gstatic.com" rel="preconnect"/>
<link as="style" href="/sfc/fonts.css" onload="this.onload=null;this.rel='stylesheet'" rel="preload"/>
<noscript><link href="/sfc/fonts.css" rel="stylesheet"/></noscript>
<link as="style" href="/sfc/glyphicons.css" onload="this.onload=nul