In [2]:
import psycopg

# Open one connection per local PostgreSQL database: "dgidb" is bound to
# conn_v5 and "dgidb_v4" to conn_v4. The query cells below run the same SQL
# against both connections to compare per-source counts.
conn_v5 = psycopg.connect("postgresql://postgres@localhost:5432/dgidb")
conn_v4 = psycopg.connect("postgresql://postgres@localhost:5432/dgidb_v4")

In [3]:
import pandas as pd

### Plot colors

In [4]:
# Hex colors, one per source category. make_dataframe() copies the matching
# value into each row's "source colors" column, which the bar charts use as
# marker_color.
NEW_SOURCE_COLOR = "#fa198b"
EXISTING_SOURCE_COLOR = "#480a77"
UPDATED_SOURCE_COLOR = "#8075ff"

### counts to get

* interactions
* gene claims
* drug claims
* gene categories

In [5]:
# Sources classified as "Updated" by make_dataframe().
# Entries must be spelled exactly like the keys of SHORT_SOURCE_NAMES (the raw
# source_db_name values), because make_dataframe() tests membership against
# the raw names, not the display names.
UPDATE_SOURCE_NAMES = [
    "CIViC",
    "DrugBank",
    "ChEMBL",
    "GuideToPharmacology",  # displayed as "GtoPdb"
    "PharmGKB",
    "Wikidata",
]

# Sources classified as "New" by make_dataframe(). Same spelling rule as above.
NEW_SOURCE_NAMES = [
    "ChemIDplus",
    "Drugs@FDA",
    "HemOnc",  # displayed as "HemOnc.org"
    "NCIt",
    "RxNorm",
    "HGNC",
]

# Raw source name -> short label used on plot axes. make_dataframe() indexes
# this dict directly, so every source returned by the queries needs an entry.
SHORT_SOURCE_NAMES = {
    "BaderLab": "BaderLab",
    "CGI": "CGI",
    "CIViC": "CIViC",
    "COSMIC": "COSMIC",
    "CancerCommons": "CancerCommons",
    "CarisMolecularIntelligence": "CMI",
    "ChEMBL": "ChEMBL",
    "ChemIDplus": "ChemIDplus",
    "ClearityFoundationBiomarkers": "CF: Biomarkers",
    "ClearityFoundationClinicalTrial": "CF: Clinical Trials",
    "DTC": "DTC",
    "DoCM": "DoCM",
    "DrugBank": "DrugBank",
    "Drugs@FDA": "Drugs@FDA",
    "Ensembl": "Ensembl",
    "FDA": "FDA",
    "HemOnc": "HemOnc.org",
    "FoundationOneGenes": "FO: Genes",
    "GO": "GO",
    "GuideToPharmacology": "GtoPdb",
    "HGNC": "HGNC",
    "HingoraniCasas": "Hingorani/Casas",
    "HopkinsGroom": "Hopkins/Groom",
    "HumanProteinAtlas": "Human Protein Atlas",
    "IDG": "IDG",
    "JAX-CKB": "JAX-CKB",
    "MskImpact": "MSK Impact",
    "MyCancerGenome": "MCG",
    "MyCancerGenomeClinicalTrial": "MCG: Clinical Trials",
    "NCBI": "NCBI Gene",
    "NCI": "NCI",
    "NCIt": "NCIt",
    "OncoKB": "OncoKB",
    "Oncomine": "Oncomine",
    "PharmGKB": "PharmGKB",
    "Pharos": "Pharos",
    "RxNorm": "RxNorm",
    "RussLampel": "Russ/Lampel",
    "TALC": "TALC",
    "TEND": "TEND",
    "TTD": "TTD",
    "TdgClinicalTrial": "TDG: Clinical Trials",
    "Tempus": "Tempus",
    "dGene": "dGene",
    "Wikidata": "Wikidata",
}

In [6]:
def make_dataframe(v5_counts, v4_counts):
    """Merge per-source v5 and v4 counts into one annotated, sorted table.

    Args:
        v5_counts: mapping of source name -> count for v5.
        v4_counts: mapping of source name -> count for v4.

    Returns:
        A DataFrame indexed by source name (sorted), with integer columns
        "v5" and "v4" (a source absent from one mapping gets 0 there) plus
        "display names", "source type" ("Updated", "New" or "Existing") and
        "source colors".

    Raises:
        KeyError: if a source name has no entry in SHORT_SOURCE_NAMES.
    """
    table = pd.concat([pd.Series(v5_counts), pd.Series(v4_counts)], axis=1)
    table.columns = ["v5", "v4"]
    # Missing sources come through as NaN after alignment; zero them first so
    # the integer cast is valid.
    table = table.fillna(0.0).astype(int).sort_index()

    table["display names"] = [SHORT_SOURCE_NAMES[source] for source in table.index]

    def classify(source):
        # "Updated" takes precedence over "New"; everything else is "Existing".
        if source in UPDATE_SOURCE_NAMES:
            return "Updated", UPDATED_SOURCE_COLOR
        if source in NEW_SOURCE_NAMES:
            return "New", NEW_SOURCE_COLOR
        return "Existing", EXISTING_SOURCE_COLOR

    labels = [classify(source) for source in table.index]
    table["source type"] = [kind for kind, _ in labels]
    table["source colors"] = [color for _, color in labels]
    return table

### interactions

In [7]:
# Row count of interaction_claims grouped by the joined source's
# source_db_name; the same statement is run against both databases.
interactions_query = """
SELECT s.source_db_name, COUNT(1) FROM interaction_claims
LEFT JOIN sources s
ON interaction_claims.source_id = s.id
GROUP BY s.source_db_name
"""

with conn_v5.cursor() as cur:
    cur.execute(interactions_query)
    result = cur.fetchall()
    # Each row is a (source_db_name, count) pair, so dict() maps name -> count.
    v5_interaction_claims = dict(result)

with conn_v4.cursor() as cur:
    cur.execute(interactions_query)
    result = cur.fetchall()
    v4_interaction_claims = dict(result)

In [8]:
# Move the v4 "ChemblInteractions" count under the "ChEMBL" key so it lines up
# with the v5 source name.
v4_interaction_claims["ChEMBL"] = v4_interaction_claims.pop("ChemblInteractions")

In [9]:
# Build the combined v5/v4 table, save a CSV copy, and display it.
interaction_claims = make_dataframe(
    v5_counts=v5_interaction_claims,
    v4_counts=v4_interaction_claims,
)
interaction_claims.to_csv("interaction_claims.csv")
interaction_claims

Unnamed: 0,v5,v4,display names,source type,source colors
CGI,368,372,CGI,Existing,#480a77
CIViC,1083,959,CIViC,Updated,#8075ff
COSMIC,37,37,COSMIC,Existing,#480a77
CancerCommons,109,109,CancerCommons,Existing,#480a77
ChEMBL,16938,7610,ChEMBL,Updated,#8075ff
ClearityFoundationBiomarkers,163,163,CF: Biomarkers,Existing,#480a77
ClearityFoundationClinicalTrial,240,281,CF: Clinical Trials,Existing,#480a77
DTC,23879,23879,DTC,Existing,#480a77
DoCM,76,76,DoCM,Existing,#480a77
FDA,427,427,FDA,Existing,#480a77


### gene claims

TODO:

 * manually fill in DrugBank counts (from v4 paper?)

In [10]:
# Row count of gene_claims grouped by the joined source's source_db_name;
# the same statement is run against both databases.
genes_query = """
SELECT s.source_db_name, COUNT(1) FROM gene_claims
LEFT JOIN sources s
ON gene_claims.source_id = s.id
GROUP BY s.source_db_name
"""

with conn_v5.cursor() as cur:
    cur.execute(genes_query)
    result = cur.fetchall()
    # Each row is a (source_db_name, count) pair, so dict() maps name -> count.
    v5_gene_claims = dict(result)

with conn_v4.cursor() as cur:
    cur.execute(genes_query)
    result = cur.fetchall()
    v4_gene_claims = dict(result)

In [11]:
# Re-key v4 source names that differ from their v5 spelling.
v4_gene_claims["BaderLab"] = v4_gene_claims.pop("BaderLabGenes")
v4_gene_claims["ChEMBL"] = v4_gene_claims.pop("ChemblInteractions")
v4_gene_claims["NCBI"] = 43741  # manually supply

In [12]:
# Build the combined v5/v4 table, save a CSV copy, and display it.
gene_claims = make_dataframe(
    v5_counts=v5_gene_claims,
    v4_counts=v4_gene_claims,
)
gene_claims.to_csv("gene_claims.csv")
gene_claims

Unnamed: 0,v5,v4,display names,source type,source colors
BaderLab,48,300,BaderLab,Existing,#480a77
CGI,123,118,CGI,Existing,#480a77
CIViC,278,252,CIViC,Updated,#8075ff
COSMIC,15,15,COSMIC,Existing,#480a77
CancerCommons,48,48,CancerCommons,Existing,#480a77
CarisMolecularIntelligence,608,608,CMI,Existing,#480a77
ChEMBL,1779,1085,ChEMBL,Updated,#8075ff
ClearityFoundationBiomarkers,34,34,CF: Biomarkers,Existing,#480a77
ClearityFoundationClinicalTrial,93,108,CF: Clinical Trials,Existing,#480a77
DTC,1016,1016,DTC,Existing,#480a77


### drug claims

In [13]:
# Row count of drug_claims grouped by the joined source's source_db_name;
# the same statement is run against both databases.
drug_query = """
SELECT s.source_db_name, COUNT(1) FROM drug_claims
LEFT JOIN sources s
ON drug_claims.source_id = s.id
GROUP BY s.source_db_name
"""

with conn_v5.cursor() as cur:
    cur.execute(drug_query)
    result = cur.fetchall()
    # Each row is a (source_db_name, count) pair, so dict() maps name -> count.
    v5_drug_claims = dict(result)

with conn_v4.cursor() as cur:
    cur.execute(drug_query)
    result = cur.fetchall()
    v4_drug_claims = dict(result)

In [14]:
# v4 has two ChEMBL-derived entries; add them together under the single
# "ChEMBL" key, then drop the originals.
v4_drug_claims["ChEMBL"] = sum(
    v4_drug_claims[legacy_name]
    for legacy_name in ("ChemblDrugs", "ChemblInteractions")
)
del v4_drug_claims["ChemblDrugs"], v4_drug_claims["ChemblInteractions"]

In [15]:
# Build the combined v5/v4 table, save a CSV copy, and display it.
drug_claims = make_dataframe(
    v5_counts=v5_drug_claims,
    v4_counts=v4_drug_claims,
)
drug_claims.to_csv("drug_claims.csv")
drug_claims

Unnamed: 0,v5,v4,display names,source type,source colors
CGI,148,155,CGI,Existing,#480a77
CIViC,416,367,CIViC,Updated,#8075ff
COSMIC,28,28,COSMIC,Existing,#480a77
CancerCommons,80,80,CancerCommons,Existing,#480a77
ChEMBL,5659,17037,ChEMBL,Updated,#8075ff
ChemIDplus,5879,0,ChemIDplus,New,#fa198b
ClearityFoundationBiomarkers,64,64,CF: Biomarkers,Existing,#480a77
ClearityFoundationClinicalTrial,114,115,CF: Clinical Trials,Existing,#480a77
DTC,6290,6290,DTC,Existing,#480a77
DoCM,39,39,DoCM,Existing,#480a77


## Base counts, broken down by source type

In [16]:
import plotly.graph_objects as go

In [25]:
def make_int_base():
    """Build the "Interaction Claims by Source" bar chart and save it as a PNG.

    Reads the module-level ``interaction_claims`` table and adds one bar trace
    per source type (New, Updated, Existing), plotting each source's v5 count
    against its display name on a log-scaled y axis.

    Side effects:
        Writes the figure to "interactions.png".

    Returns:
        The plotly ``Figure``.
    """
    fig = go.Figure()
    fig.update_layout(title="Interaction Claims by Source")

    trace_specs = (
        ("New", "New sources"),
        ("Updated", "Updated sources"),
        ("Existing", "Existing sources"),
    )
    for source_type, trace_name in trace_specs:
        subset = interaction_claims[interaction_claims["source type"] == source_type]
        fig.add_trace(go.Bar(
            x=subset["display names"],
            y=subset["v5"],
            name=trace_name,
            marker_color=subset["source colors"],
        ))

    fig.update_layout(xaxis_tickangle=-45, yaxis_title="Count (log scale)")
    # Apply the log axis BEFORE exporting: the image previously got written
    # with a linear axis even though the axis title says "log scale".
    fig.update_yaxes(type="log")
    fig.write_image("interactions.png")
    return fig

make_int_base()


In [24]:
def make_gene_base():
    """Build the "Gene Claims by Source" bar chart and save it as a PNG.

    Reads the module-level ``gene_claims`` table and adds one bar trace per
    source type (New, Updated, Existing), plotting each source's v5 count
    against its display name on a log-scaled y axis.

    Side effects:
        Writes the figure to "genes.png".

    Returns:
        The plotly ``Figure``.
    """
    fig = go.Figure()
    fig.update_layout(title="Gene Claims by Source")

    # Trace order (New, Updated, Existing) fixes the legend and bar order.
    for source_type in ("New", "Updated", "Existing"):
        rows = gene_claims[gene_claims["source type"] == source_type]
        bars = go.Bar(
            x=rows["display names"],
            y=rows["v5"],
            name=source_type + " sources",
            marker_color=rows["source colors"],
        )
        fig.add_trace(bars)

    fig.update_layout(xaxis_tickangle=-45, yaxis_title="Count (log scale)")
    fig.update_yaxes(type="log")
    fig.write_image("genes.png")
    return fig

make_gene_base()


In [23]:
def make_drug_base():
    """Build the "Drug Claims by Source" bar chart and save it as a PNG.

    Reads the module-level ``drug_claims`` table and adds one bar trace per
    source type (New, Updated, Existing), plotting each source's v5 count
    against its display name on a log-scaled y axis.

    Side effects:
        Writes the figure to "drugs.png".

    Returns:
        The plotly ``Figure``.
    """
    def bars_for(source_type, legend_label):
        # One bar trace holding only the rows of the given source type.
        rows = drug_claims[drug_claims["source type"] == source_type]
        return go.Bar(
            x=rows["display names"],
            y=rows["v5"],
            name=legend_label,
            marker_color=rows["source colors"],
        )

    fig = go.Figure()
    fig.update_layout(title="Drug Claims by Source")
    fig.add_trace(bars_for("New", "New sources"))
    fig.add_trace(bars_for("Updated", "Updated sources"))
    fig.add_trace(bars_for("Existing", "Existing sources"))
    fig.update_layout(xaxis_tickangle=-45, yaxis_title="Count (log scale)")
    fig.update_yaxes(type="log")
    fig.write_image("drugs.png")
    return fig

make_drug_base()
