# Cooking RISOTTO artifacts

The purpose of this notebook is to build the binary files used by RISOTTO's GUI.
We'll build three Pandas DataFrames:

- `papers`:
contains the data of the papers, including the `cord_uid` identifier and the PageRank scores.

- `papers_topics`:
contains the association between papers, topics, and subtopics.
The papers are indexed by their `cord_uid` identifier.

- `topics`:
contains the token pseudocounts of the different topics and subtopics.

All three DataFrames are stored in a single HDF file, `artifacts/artifacts.hdf`, each under its own key (`papers`, `papers_topics`, and `topics`).

In [1]:
# default_exp artifacts

In [2]:
# Install dependencies
%load_ext autoreload
%autoreload 2
!pip install -q -r requirements.txt

[33mYou are using pip version 19.0.3, however version 20.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
# export
from pathlib import Path

from fastprogress.fastprogress import progress_bar
import pandas as pd
import networkx as nx

from risotto.references import load_papers_from_metadata_file, process_references, build_papers_reference_graph
from risotto.lda import process_papers_file_contents
from risotto.hierarchical_lda import fit_lda_model

In [2]:
# Folder holding the CORD-19 dataset files (relative to the working directory);
# it is passed as `dataset_folder` to the artifact builders below.
CORD19_DATASET_FOLDER = Path("./datasets/CORD-19-research-challenge")

In [3]:
# export
# HDF file where the artifacts are written and read back, each one under its own key
# (`papers`, `papers_topics` and `topics`).
ARTIFACTS_PATH = "artifacts/artifacts.hdf"

def remove_duplicated_idxs(df):
    """Return `df` without the rows whose index label already appeared earlier.

    Only the first occurrence of each index label is kept.
    """
    is_repeated = df.index.duplicated(keep="first")
    return df.loc[~is_repeated]


def load_papers_metadata(dataset_folder):
    """Read `metadata.csv` from `dataset_folder` into a DataFrame indexed by `cord_uid`."""
    metadata_path = f"{dataset_folder}/metadata.csv"
    return pd.read_csv(metadata_path, index_col="cord_uid")

def _get_affiliation_country(author):
    affiliation = author["affiliation"]
    if len(affiliation) == 0:
        return None, None
    # Country
    country = affiliation["location"]["country"] if "country" in affiliation["location"] else None
    # Affiliation
    laboratory = affiliation["laboratory"] 
    institution = affiliation["institution"] if len(affiliation["institution"]) > 0 else None
    if institution is not None and len(laboratory) > 0:
        institution += f" ({laboratory})"
    return institution, country


def build_papers_artifact(dataset_folder, should_dump=True):
    """
    Build the `papers` artifact: the dataset metadata joined with the PageRank score,
    affiliations and countries of each paper.

    Args:
        dataset_folder: folder of the dataset; `metadata.csv` is read from it.
        should_dump: when True, `joined_df` is written to `ARTIFACTS_PATH` under
            the `papers` key.

    Returns:
        A `(joined_df, pagerank_df, metadata_df)` tuple:
        - `joined_df`: `pagerank_df` left-joined with the dataset metadata columns.
            The rows are indexed by the `cord_uid` identifier.
        - `pagerank_df`: one row per paper of the reference graph, with the
            `pagerank`, `affiliation` and `country` columns, indexed by `cord_uid`.
        - `metadata_df`: the dataset metadata, indexed by `cord_uid`, without
            duplicated identifiers.
    """
    metadata_df = remove_duplicated_idxs(load_papers_metadata(dataset_folder))
    papers = load_papers_from_metadata_file(dataset_folder)
    process_references(papers)
    references_graph = build_papers_reference_graph(papers)
    pagerank = nx.pagerank(references_graph)

    # One record per node of the reference graph.
    columns = ["cord_uid", "pagerank", "affiliation", "country"]
    records = []
    for paper, score in pagerank.items():
        # Sets collapse the affiliations/countries shared by several authors.
        affiliations = set()
        countries = set()
        for author in paper._file_contents["metadata"]["authors"]:
            affiliation, country = _get_affiliation_country(author)
            if affiliation is not None:
                affiliations.add(affiliation)
            if country is not None:
                countries.add(country)
        records.append({
            "cord_uid": paper._metadata_row.name,
            "pagerank": score,
            # Sorted so the joined strings do not depend on set iteration order,
            # which may change from one run to the next.
            "affiliation": ", ".join(sorted(affiliations)),
            "country": ", ".join(sorted(countries)),
        })

    pagerank_df = pd.DataFrame(records, columns=columns)
    pagerank_df = remove_duplicated_idxs(pagerank_df.set_index("cord_uid"))

    # Left join: only the papers that received a PageRank score are kept.
    joined_df = pagerank_df.join(metadata_df)

    if should_dump:
        # Make sure the target folder exists before writing the HDF file.
        Path(ARTIFACTS_PATH).parent.mkdir(parents=True, exist_ok=True)
        joined_df.to_hdf(ARTIFACTS_PATH, key="papers")

    return joined_df, pagerank_df, metadata_df


def load_papers_artifact():
    """Load the object stored under the `papers` key of the artifacts HDF file."""
    papers_artifact = pd.read_hdf(ARTIFACTS_PATH, key="papers")
    return papers_artifact


def _get_vectorizer_tokens(vectorizer):
    """Return the vocabulary of `vectorizer` as an index named `token`."""
    # NOTE(review): presumably a scikit-learn vectorizer — confirm against `fit_lda_model`.
    # scikit-learn 1.0 added `get_feature_names_out` and 1.2 removed
    # `get_feature_names`; support both so the notebook runs on either version.
    get_tokens = getattr(vectorizer, "get_feature_names_out", None)
    if get_tokens is None:
        get_tokens = vectorizer.get_feature_names
    return pd.Index(get_tokens(), name="token")


def build_papers_topics_artifact(dataset_folder, should_dump=True, n_topics=8, n_subtopics=4, n_jobs=4):
    """
    Build the `papers_topics` and `topics` artifacts with a two-level LDA model.

    Args:
        dataset_folder: folder of the dataset the papers are loaded from.
        should_dump: when True, both DataFrames are written to `ARTIFACTS_PATH`
            under the `papers_topics` and `topics` keys.
        n_topics: number of first-level topics.
        n_subtopics: number of second-level topics fitted inside each first-level topic.
        n_jobs: `n_jobs` value forwarded to `fit_lda_model`.

    Returns:
        - `papers_df`: a DataFrame. Each row represents a paper and has the following columns:
            - `cord_uid`: index
            - `text`: paper text
            - `topic`: first-level topic identifier
            - `subtopic`: second-level topic identifier
        - `topics_df`: a DataFrame. Each row represents a token and has the following columns:
            - `token`: index
            - `{topic_id}`: token pseudocount in first-level topic {topic_id}
            - `{topic_id}-{subtopic_id}`: token pseudocount in second-level {subtopic_id} of first-level {topic_id}.
        NaN values mean that the token wasn't considered in the topic modelling step.
    """
    # Load the papers contents
    papers = load_papers_from_metadata_file(dataset_folder)
    docs = process_papers_file_contents(papers)

    # One row per loaded paper. The default positional index is kept while modelling
    # because `cord_uid` values may repeat: aligning the subtopics on `cord_uid`
    # could pair a row's topic with the subtopic computed for another row.
    papers_df = pd.DataFrame({
        "cord_uid": [paper._metadata_row.name for paper in papers],
        "text": docs,
    })

    # Fit the first level LDA model and classify the papers
    lda, docs_vectorized, vectorizer = fit_lda_model(
        papers_df["text"].tolist(),
        n_components=n_topics,
        n_jobs=n_jobs,
    )
    papers_df["topic"] = lda.transform(docs_vectorized).argmax(1)

    # Fit the second level LDA models and classify the papers
    sub_ldas = {}
    sub_vectorizers = {}
    sub_docs_topics = []
    for topic_id, group_df in progress_bar(papers_df.groupby(by="topic")):
        sub_lda, group_vectorized, sub_vectorizer = fit_lda_model(
            group_df["text"].tolist(),
            n_components=n_subtopics,
            n_jobs=n_jobs,
        )
        sub_ldas[topic_id] = sub_lda
        sub_vectorizers[topic_id] = sub_vectorizer
        group_topics = sub_lda.transform(group_vectorized).argmax(1)
        sub_docs_topics.append(pd.Series(group_topics, index=group_df.index))

    # Every row gets the subtopic computed inside its own first-level topic;
    # only then the rows are indexed by `cord_uid` and the duplicates dropped.
    papers_df["subtopic"] = pd.concat(sub_docs_topics)
    papers_df = remove_duplicated_idxs(papers_df.set_index("cord_uid"))

    # Next step is to build a DataFrame with the pseudocounts of the tokens
    # in each topic.
    topics_df = pd.DataFrame(index=_get_vectorizer_tokens(vectorizer))
    for topic_idx, topic in enumerate(lda.components_):
        topics_df[f"{topic_idx}"] = topic

    for topic_idx, sub_lda in sub_ldas.items():
        subtopic_df = pd.DataFrame(index=_get_vectorizer_tokens(sub_vectorizers[topic_idx]))
        for subtopic_idx, subtopic in enumerate(sub_lda.components_):
            subtopic_df[f"{topic_idx}-{subtopic_idx}"] = subtopic
        # Outer join: each sub-model has its own vocabulary, so tokens missing
        # from a model end up as NaN in its columns.
        topics_df = topics_df.join(subtopic_df, how="outer")

    if should_dump:
        # Make sure the target folder exists before writing the HDF file.
        Path(ARTIFACTS_PATH).parent.mkdir(parents=True, exist_ok=True)
        papers_df.to_hdf(ARTIFACTS_PATH, key="papers_topics")
        topics_df.to_hdf(ARTIFACTS_PATH, key="topics")

    return papers_df, topics_df


def load_papers_topics_artifacts():
    """Load the object stored under the `papers_topics` key of the artifacts HDF file."""
    papers_topics_artifact = pd.read_hdf(ARTIFACTS_PATH, key="papers_topics")
    return papers_topics_artifact


def load_topics_artifacts():
    """Load the object stored under the `topics` key of the artifacts HDF file."""
    topics_artifact = pd.read_hdf(ARTIFACTS_PATH, key="topics")
    return topics_artifact


def build_artifacts(dataset_folder, should_dump=True):
    """Build every artifact from `dataset_folder`.

    Returns the joined papers DataFrame of `build_papers_artifact` followed by the
    two DataFrames of `build_papers_topics_artifact`.
    """
    # Only the joined DataFrame of the papers artifact is handed back to the caller.
    metapagerank_df, _pagerank_only, _metadata_only = build_papers_artifact(
        dataset_folder, should_dump
    )
    topic_artifacts = build_papers_topics_artifact(dataset_folder, should_dump)
    papers_df, topics_df = topic_artifacts
    return metapagerank_df, papers_df, topics_df
    

In [None]:
# Build all the artifacts. `should_dump` keeps its default (True), so the
# DataFrames are also written to ARTIFACTS_PATH.
metapagerank_df, papers_df, topics_df = build_artifacts(CORD19_DATASET_FOLDER)

  """Entry point for launching an IPython kernel.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['affiliation', 'country', 'sha', 'source_x', 'title', 'doi', 'pmcid',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url'],
      dtype='object')]

  encoding=encoding,
  """Entry point for launching an IPython kernel.


In [4]:
# Build only the `papers` artifact, keeping the PageRank and metadata DataFrames
# it returns next to the joined one.
joined_df, pagerank_df, metadata_df = build_papers_artifact(CORD19_DATASET_FOLDER)

  exec(code_obj, self.user_global_ns, self.user_ns)


KeyboardInterrupt: 

In [7]:
# tell nbdev to generate library from notebooks
# Import only the name that is used instead of pulling the whole module
# into the notebook namespace with a star import.
from nbdev.export import notebook2script
notebook2script()

Converted 00_downloader.ipynb.
Converted 01_references.ipynb.
Converted 02_representations_and_lda.ipynb.
Converted 03_hierarchical_topic_modelling.ipynb.
Converted 04_lda2vec.ipynb.
Converted 05_cook_artifacts.ipynb.
Converted 06_GUI.ipynb.
Converted 98_risotto_precook.ipynb.
Converted 99_risotto_gui.ipynb.
Converted index.ipynb.
