In [19]:
%load_ext autoreload
%autoreload 2
%aimport theme
import pandas as pd
import urllib
import requests
import json
import os
import pandas as pd
import datetime
import altair as alt
from theme import apply_theme
alt.data_transformers.disable_max_rows(); # Allow using rows more than 5000

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Date on which this notebook is run. `today` must be *called*; without the
# parentheses the constant would hold the method object instead of a date.
DATA_DATE = datetime.date.today()

# Evaluate Biomedical Resources (Draft)

In [21]:
# Load a WAVE API key stored in a local file.
# Surrounding whitespace (typically a trailing newline) is stripped because the
# key is later embedded in a request URL.
with open('../input/api.lab.key', 'r') as f:
    API_KEY = f.read().strip()

In [4]:
# Paths passed positionally to collect_reports_and_save:
# (pages file, metadata file, reports output file).
INPUT = (
    # information of pages
    '../input/portals-pages.json',
    # metadata file
    '../output/portals-metadata.json',
    # output file
    '../output/portals-pages-reports.json',
)

In [5]:
def collect_reports_and_save(
    input_file,
    metadata_file,
    output_file
):
    """Collect WAVE reports for pages not evaluated yet and save all issue rows.

    Every page row (page information joined with its resource metadata) is
    expanded into one output row per reported issue, for each of the
    ``error``, ``contrast`` and ``alert`` categories. A category without any
    issue yields a single explicit zero-count row.

    Args:
        input_file: Path to a JSON file of pages, read with ``pd.read_json``.
            Its ``name`` column is renamed to ``shortName`` and its ``url``
            column is the page that gets evaluated.
        metadata_file: Path to a JSON file of resource metadata joined on
            ``shortName``; its own ``url`` column is dropped before the join.
        output_file: Path of the JSON file (``orient='records'``) holding the
            issue rows. If it already exists, its rows are kept and pages
            whose URL appears in it are not requested again.

    Returns:
        None. The previously saved rows followed by the newly collected rows
        are written to ``output_file``.
    """
    # `import urllib` alone does not guarantee that these submodules are loaded.
    import urllib.parse
    import urllib.request

    metrics = ('error', 'contrast', 'alert')

    # Combine two datasets
    pages_df = pd.read_json(input_file).rename(columns={'name': 'shortName'})
    meta_df = pd.read_json(metadata_file).drop(columns='url')
    df = pages_df.merge(meta_df, on='shortName', how='left').reset_index()

    # Load existing reports so that we do not need to re-run evaluation from the start
    if os.path.isfile(output_file):
        with open(output_file, 'r') as f:
            reports = json.load(f)
    else:
        reports = []
    known_urls = {report['url'] for report in reports}

    # Collect missing reports
    issues = []
    for _, row in df.iterrows():
        url = row.url

        # Check the existence
        if url in known_urls:
            continue

        print('Loading... ', url)

        # Encode the parameters: page URLs may contain '&', '?' or '=' which
        # would otherwise be read as parameters of the API request itself.
        query = urllib.parse.urlencode({
            'key': API_KEY.strip(),
            'reporttype': 2,
            'url': url,
        })
        api_url = f'https://wave.webaim.org/api/request?{query}'

        try:
            with urllib.request.urlopen(api_url) as response:
                report = json.load(response)

            if not report['status']['success']:
                # Nothing is recorded, so the page is requested again next run.
                print('WAVE reported a failure for', url)
                continue

            page_info = row.to_dict()
            page_issues = []
            for metric in metrics:
                items = report['categories'][metric]['items']

                if len(items) == 0:
                    # Add an explicit zero-issue row
                    page_issues.append({
                        **page_info,
                        'issue_type': metric,
                        'issue_id': f'{metric}None',
                        'issue_count': 0,
                        'issue_desc': 'No Issues Found',
                    })
                    continue

                for item in items.values():
                    page_issues.append({
                        **page_info,
                        'issue_type': metric,
                        'issue_id': item['id'],
                        'issue_count': item['count'],
                        'issue_desc': item['description'],
                    })

            # Only record a page once all of its categories were read, so a
            # malformed report never leaves a partially evaluated page behind.
            issues.extend(page_issues)

        except Exception as err:
            # Best effort: report the problem and carry on with the next page.
            print(f"Unexpected {err=}, {type(err)=}")

    # Write the cached rows back together with the new ones; writing only the
    # new rows would erase every previously collected report.
    merged = pd.DataFrame.from_records(reports + issues)
    merged.to_json(output_file, orient='records')
                
# collect_reports_and_save(*INPUT)

Loading...  https://www.cbioportal.org/
Loading...  https://www.cbioportal.org/datasets
Loading...  https://www.encodeproject.org/
Loading...  https://www.encodeproject.org/search/?type=Experiment&control_type!=*&status=released&perturbed=false
Loading...  https://www.encodeproject.org/experiments/ENCSR167IUG/
Loading...  https://www.encodeproject.org/matrix/?type=Experiment&control_type!=*&status=released&perturbed=false
Loading...  https://www.ncbi.nlm.nih.gov/geo/
Loading...  https://www.ncbi.nlm.nih.gov/sites/GDSbrowser/
Loading...  https://genome.ucsc.edu/
Loading...  https://genome.ucsc.edu/cgi-bin/hgTabless
Loading...  https://www.mirbase.org/
Loading...  https://www.mirbase.org/cgi-bin/browse.pl
Loading...  https://www.mirbase.org/summary.shtml?org=smc
Loading...  https://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=smc-mir-12455-1
Loading...  https://www.ebi.ac.uk/interpro/
Loading...  https://www.ebi.ac.uk/interpro/protein/UniProt/entry/InterPro/#table
Loading...  https://www.e

# Visualize

In [22]:
# Read the saved issue rows back from the reports file (the last INPUT entry).
df = pd.read_json(INPUT[-1])
df

Unnamed: 0,index,shortName,url,landing,tables for data,forms for data,visualizations for data,dbId,accession,fullName,...,firstPublicationYear,searchExample,citedDate,ess,response,BA_ID,issue_type,issue_id,issue_count,issue_desc
0,0,cBioPortal,https://www.cbioportal.org/,yes,yes,yes,yes,6569,DBC006569,cBio Cancer Genomics Portal,...,0,,2022-10-30 04:02:20,0,200,P000003,error,label_missing,1,Missing form label
1,0,cBioPortal,https://www.cbioportal.org/,yes,yes,yes,yes,6569,DBC006569,cBio Cancer Genomics Portal,...,0,,2022-10-30 04:02:20,0,200,P000003,error,language_missing,1,Language missing or invalid
2,0,cBioPortal,https://www.cbioportal.org/,yes,yes,yes,yes,6569,DBC006569,cBio Cancer Genomics Portal,...,0,,2022-10-30 04:02:20,0,200,P000003,error,button_empty,1,Empty button
3,0,cBioPortal,https://www.cbioportal.org/,yes,yes,yes,yes,6569,DBC006569,cBio Cancer Genomics Portal,...,0,,2022-10-30 04:02:20,0,200,P000003,error,link_empty,751,Empty link
4,0,cBioPortal,https://www.cbioportal.org/,yes,yes,yes,yes,6569,DBC006569,cBio Cancer Genomics Portal,...,0,,2022-10-30 04:02:20,0,200,P000003,contrast,contrast,501,Very low contrast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,29,DAVID,https://david.ncifcrf.gov/list.jsp,no,yes,yes,no,3061,DBC003061,"Database for Annotation, Visualization and Int...",...,2007,,2022-10-30 04:00:57,0,200,P000001,alert,event_handler,8,Device dependent event handler
257,29,DAVID,https://david.ncifcrf.gov/list.jsp,no,yes,yes,no,3061,DBC003061,"Database for Annotation, Visualization and Int...",...,2007,,2022-10-30 04:00:57,0,200,P000001,alert,javascript_jumpmenu,1,JavaScript jump menu
258,29,DAVID,https://david.ncifcrf.gov/list.jsp,no,yes,yes,no,3061,DBC003061,"Database for Annotation, Visualization and Int...",...,2007,,2022-10-30 04:00:57,0,200,P000001,alert,text_small,22,Very small text
259,29,DAVID,https://david.ncifcrf.gov/list.jsp,no,yes,yes,no,3061,DBC003061,"Database for Annotation, Visualization and Int...",...,2007,,2022-10-30 04:00:57,0,200,P000001,alert,underline,9,Underlined text


In [23]:
# One tick per 'error' issue row, positioned by its count and split by the
# `landing` column. The x axis is limited to [0, 1000]; larger counts are
# clamped to the axis edge.
alt.Chart(df[df.issue_type == 'error']).mark_tick().encode(
    y='landing:N',
    # The x channel is described with alt.X (it was built with alt.Y before).
    x=alt.X('issue_count:Q', scale=alt.Scale(domain=[0, 1000], clamp=True))
)

In [53]:
# Total number of 'error' issues per page: one row per URL together with the
# attributes used to compare pages.
group_keys = [
    'url', 'landing', 'tables for data', 'forms for data',
    'visualizations for data', 'country', 'hostInstitution', 'shortName',
]
grouped = (
    df[df.issue_type == 'error']
    .groupby(by=group_keys)
    # Restrict the sum to numeric columns; the remaining text columns cannot
    # be summed meaningfully. NOTE: the other numeric columns (e.g. dbId) are
    # summed as well, only `issue_count` is a meaningful total.
    .sum(numeric_only=True)
    .reset_index()
)
# More readable names
grouped['hasIssues'] = grouped['issue_count'].ge(1).map({True: 'Has Issues', False: 'No Issues'})

grouped.head()

Unnamed: 0,url,landing,tables for data,forms for data,visualizations for data,country,hostInstitution,shortName,index,dbId,...,isNew,bigsearchId,recordCreated,zindex,firstPublicationYear,searchExample,ess,response,issue_count,hasIssues
0,https://david.ncifcrf.gov/,yes,no,no,yes,United States,National Cancer Institute,DAVID,54,6122,...,0,0.0,3033838336000,3647.26,4014,0.0,0,400,2,Has Issues
1,https://david.ncifcrf.gov/list.jsp,no,yes,yes,no,United States,National Cancer Institute,DAVID,87,9183,...,0,0.0,4550757504000,5470.89,6021,0.0,0,600,164,Has Issues
2,https://david.ncifcrf.gov/summary.jsp,no,yes,yes,yes,United States,National Cancer Institute,DAVID,84,9183,...,0,0.0,4550757504000,5470.89,6021,0.0,0,600,164,Has Issues
3,https://genome.ucsc.edu/,yes,no,no,no,United States,University of California Santa Cruz,UCSC Genome Browser,32,372,...,0,0.0,5739138140000,2760.6,0,0.0,0,800,6,Has Issues
4,https://genome.ucsc.edu/cgi-bin/hgTabless,no,no,yes,no,United States,University of California Santa Cruz,UCSC Genome Browser,18,186,...,0,0.0,2869569070000,1380.3,0,0.0,0,400,4,Has Issues


In [56]:
def mean_ci_panel(data, category):
    """Return one panel: mean issue count per value of `category` with its CI.

    Args:
        data: Data frame with an `issue_count` column and a `category` column.
        category: Name of the nominal column placed on the x axis.

    Returns:
        A layered chart made of a black point at the mean of `issue_count`
        and an error bar (`extent='ci'`) computed from the raw `issue_count`.
    """
    mean_points = alt.Chart(data).mark_point(filled=True, color='black').encode(
        x=f'{category}:N',
        y=alt.Y('mean(issue_count):Q'),
    )
    ci_bars = alt.Chart(data).mark_errorbar(extent='ci').encode(
        x=f'{category}:N',
        y=alt.Y('issue_count:Q'),
    )
    return mean_points + ci_bars


# One panel per page attribute, laid out side by side.
alt.hconcat(*(
    mean_ci_panel(grouped, category)
    for category in [
        'landing',
        'tables for data',
        'forms for data',
        'visualizations for data',
        'country',
        'hostInstitution',
        'shortName',
    ]
))

In [68]:
# Mean count of each 'error' issue (by description), one column per value of `landing`.
base = alt.Chart(df[df.issue_type == 'error']).mark_point(color='black', filled=True).encode(
    y='issue_desc:N',
    x=alt.X('mean(issue_count):Q')
)
# The error bar computes its own centre and confidence interval, so it is
# given the raw field instead of an already aggregated one.
err = base.mark_errorbar(extent='ci').encode(
    x=alt.X('issue_count:Q')
)
(base + err).facet(column=alt.Column('landing:N'))

In [69]:
# Mean count of each 'error' issue (by description), one column per value of `tables for data`.
base = alt.Chart(df[df.issue_type == 'error']).mark_point(color='black', filled=True).encode(
    y='issue_desc:N',
    x=alt.X('mean(issue_count):Q')
)
# The error bar computes its own centre and confidence interval, so it is
# given the raw field instead of an already aggregated one.
err = base.mark_errorbar(extent='ci').encode(
    x=alt.X('issue_count:Q')
)
(base + err).facet(column=alt.Column('tables for data:N'))

In [70]:
# Mean count of each 'error' issue (by description), one column per value of `visualizations for data`.
base = alt.Chart(df[df.issue_type == 'error']).mark_point(color='black', filled=True).encode(
    y='issue_desc:N',
    x=alt.X('mean(issue_count):Q')
)
# The error bar computes its own centre and confidence interval, so it is
# given the raw field instead of an already aggregated one.
err = base.mark_errorbar(extent='ci').encode(
    x=alt.X('issue_count:Q')
)
(base + err).facet(column=alt.Column('visualizations for data:N'))

In [71]:
# Mean count of each 'error' issue (by description), one column per value of `forms for data`.
base = alt.Chart(df[df.issue_type == 'error']).mark_point(color='black', filled=True).encode(
    y='issue_desc:N',
    x=alt.X('mean(issue_count):Q')
)
# The error bar computes its own centre and confidence interval, so it is
# given the raw field instead of an already aggregated one.
err = base.mark_errorbar(extent='ci').encode(
    x=alt.X('issue_count:Q')
)
(base + err).facet(column=alt.Column('forms for data:N'))

In [86]:
# Mean count of each 'error' issue (by description), one 120px-wide column per
# `shortName`. The x axis is limited to [0, 10]; larger values are clamped.
base = alt.Chart(df[df.issue_type == 'error']).mark_point(color='black', filled=True).encode(
    y=alt.Y('issue_desc:N', axis=alt.Axis(grid=True)),
    x=alt.X('mean(issue_count):Q', scale=alt.Scale(domain=[0, 10], clamp=True))
).properties(width=120)
# The error bar computes its own centre and confidence interval, so it is
# given the raw field instead of an already aggregated one.
err = base.mark_errorbar(extent='ci').encode(
    x=alt.X('issue_count:Q')
)
(base + err).facet(column=alt.Column('shortName:N'), spacing=5)