In [3]:
import pandas as pd
import altair as alt
import os
# Lift Altair's row limit so charts accept data with more than 5000 rows.
# The trailing `;` keeps the call's return value out of the cell output.
alt.data_transformers.disable_max_rows();

# Visualize
For previous code, refer to `./deprecated/04-Visualize.ipynb`.

In [4]:
# The dated sub-folder of `../output` that contains the metadata and evaluation results
EVALUATION_DATE_FOLDER = 'Nov-21-2023'

## Journal Portals
### Basic data wrangling

In [5]:
# Merge data for visualization: join the per-page evaluation results with the
# page information and the resource metadata into a single frame, `df`.

# All three input CSVs are read from the same dated output folder.
output_dir = os.path.join('..', 'output', EVALUATION_DATE_FOLDER)

df_eval = pd.read_csv(os.path.join(output_dir, 'journal-portal_evaluation.csv'))

# Can be useful for visualization: every evaluation column except the three
# bookkeeping columns, in their original order.
ERROR_TYPES = [
    col for col in df_eval.columns
    if col not in ('page_id', 'is_success', 'error_count')
]

df_pages = pd.read_csv(os.path.join(output_dir, 'journal-portal_pages.csv'))

# The `url` column is dropped before the merge; `drop` returns a new frame, so
# nothing is mutated in place.
df_meta = pd.read_csv(os.path.join(output_dir, 'journal-portal_metadata.csv')).drop(columns=['url'])

# Left joins keep every evaluation row.
# NOTE(review): if `page_id` / `id` are expected to be unique in the pages and
# metadata files, passing validate='many_to_one' would catch accidental row
# duplication -- confirm against the data before enabling.
df = (
    df_eval
    # Add `id` of resources and `page_type` of pages
    .merge(df_pages[['id', 'page_id', 'page_type']], on='page_id', how='left')
    # Add metadata of resources
    .merge(df_meta, on='id', how='left')
)

df

Unnamed: 0,page_id,is_success,error_count,aria_reference_broken,label_multiple,alt_missing,alt_link_missing,label_empty,button_empty,link_empty,...,total_cites_3years,citable_docs_3years,cites_per_doc_2years,ref_per_doc,country,region,publisher,coverage,categories,areas
0,12_home,True,0.0,,,,,,,,...,168543,3565,2395,2159,United Kingdom,Western Europe,Nature Publishing Group,1869-2022,Multidisciplinary (Q1),Multidisciplinary
1,12_research_article,True,1.0,1.0,,,,,,,...,168543,3565,2395,2159,United Kingdom,Western Europe,Nature Publishing Group,1869-2022,Multidisciplinary (Q1),Multidisciplinary
2,12_none_research_article,True,2.0,,2.0,,,,,,...,168543,3565,2395,2159,United Kingdom,Western Europe,Nature Publishing Group,1869-2022,Multidisciplinary (Q1),Multidisciplinary
3,12_article_search_result,True,0.0,,,,,,,,...,168543,3565,2395,2159,United Kingdom,Western Europe,Nature Publishing Group,1869-2022,Multidisciplinary (Q1),Multidisciplinary
4,12_latest_issue,True,0.0,,,,,,,,...,168543,3565,2395,2159,United Kingdom,Western Europe,Nature Publishing Group,1869-2022,Multidisciplinary (Q1),Multidisciplinary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,179_home,True,3.0,,,,,,3.0,,...,10764,828,1231,6608,United States,Northern America,Rockefeller University Press,"1909-1910, 1929, 1938, 1945-2022",Immunology (Q1); Immunology and Allergy (Q1); ...,Immunology and Microbiology; Medicine
96,179_research_article,False,,,,,,,3.0,,...,10764,828,1231,6608,United States,Northern America,Rockefeller University Press,"1909-1910, 1929, 1938, 1945-2022",Immunology (Q1); Immunology and Allergy (Q1); ...,Immunology and Microbiology; Medicine
97,179_none_research_article,True,0.0,,,,,,,,...,10764,828,1231,6608,United States,Northern America,Rockefeller University Press,"1909-1910, 1929, 1938, 1945-2022",Immunology (Q1); Immunology and Allergy (Q1); ...,Immunology and Microbiology; Medicine
98,179_article_search_result,True,54.0,,,,5.0,,,5.0,...,10764,828,1231,6608,United States,Northern America,Rockefeller University Press,"1909-1910, 1929, 1938, 1945-2022",Immunology (Q1); Immunology and Allergy (Q1); ...,Immunology and Microbiology; Medicine


In [6]:
# Let's add some useful columns for visualization.
# `has_error` flags pages with at least one counted error. The comparison is
# vectorized; rows with a missing `error_count` compare as False.
df['has_error'] = df['error_count'] > 0

### Create Altair Plots!

In [7]:
# Hex color used for the chart marks.
THEME_COLOR = '#CC7DAA'

In [8]:
# Mean error count (x) for each page type (y), drawn as filled points.
(
    alt.Chart(df)
    .mark_point(filled=True, color=THEME_COLOR, size=100)
    .encode(
        x=alt.X('mean(error_count):Q', title='Mean of Error Count'),
        y=alt.Y('page_type:N', title='Page Type'),
    )
)