# Semantic Scholar API

In [None]:
import httpx
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
doi = "10.1101/444398"

In [None]:
# Look up the seed paper by DOI through the Semantic Scholar Graph API.
# The field list goes through `params` so httpx builds and encodes the query
# string, and HTTP errors are raised here rather than surfacing later as a
# confusing KeyError when the error payload is indexed like a paper record.
r1 = httpx.get(
    f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}",
    params={"fields": "title,url,abstract,authors"},
)
r1.raise_for_status()

In [None]:
def print_paper(paper: dict):
    """Print the title, URL, abstract and authors of a paper record.

    Each value is printed on its own line, in that order. A missing key
    raises ``KeyError``.
    """
    for field in ("title", "url", "abstract", "authors"):
        print(paper[field])

In [None]:
j1 = r1.json()
print_paper(j1)

In [None]:
paper_id = j1["paperId"]

In [None]:
num_papers = 30

In [None]:
# Ask the recommendations endpoint for papers similar to the seed paper.
# Query parameters are passed via `params` (httpx encodes them) and a failed
# request raises immediately instead of yielding an error payload that lacks
# the "recommendedPapers" key.
r2 = httpx.get(
    f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{paper_id}",
    params={"limit": num_papers, "fields": "title,url,abstract,authors"},
)
r2.raise_for_status()

In [None]:
j2 = r2.json()

In [None]:
def get_paper_batch_info(
    ids, fields="title,abstract,authors,year,venue,embedding,tldr"
):
    """Fetch details for several papers in one Graph API batch request.

    Parameters
    ----------
    ids : iterable of str
        Paper identifiers to look up.
    fields : str, optional
        Comma-separated list of fields to request. The default matches the
        fields consumed by ``parse_paper_batch_info``.

    Returns
    -------
    The decoded JSON body of the response. An empty ``ids`` returns ``[]``
    without contacting the API.

    Raises
    ------
    httpx.HTTPStatusError
        If the API answers with a 4xx/5xx status.
    """
    ids = list(ids)
    if not ids:
        # Nothing to look up; avoid a pointless (and likely rejected) request.
        return []
    r = httpx.post(
        "https://api.semanticscholar.org/graph/v1/paper/batch",
        params={"fields": fields},
        json={"ids": ids},
        timeout=50.0,
    )
    # Fail loudly here instead of handing an error payload to the parser.
    r.raise_for_status()
    return r.json()

In [None]:
ids = [p["paperId"] for p in j2["recommendedPapers"]] + [paper_id]

In [None]:
j3 = get_paper_batch_info(ids)

In [None]:
def parse_paper_batch_info(json_response, expected_model="specter@v0.1.1"):
    """Split a paper batch response into a metadata table and an embedding map.

    Parameters
    ----------
    json_response : list
        Decoded batch response: one dict per paper with the keys ``paperId``,
        ``title``, ``year``, ``venue``, ``abstract``, ``authors``, ``tldr``
        and ``embedding``. Entries that are ``None`` or that carry no
        embedding are skipped (with a warning), because they cannot be placed
        in the embedding space.
    expected_model : str or None, optional
        Embedding model name every paper must report. Pass ``None`` to
        disable the check.

    Returns
    -------
    (pandas.DataFrame, dict)
        A frame with columns ``id_, title, year, venue, abstract, tldr,
        authors`` (``tldr`` is ``""`` when absent, ``authors`` is a list of
        names) and a dict mapping paper id to its embedding vector.

    Raises
    ------
    ValueError
        If ``json_response`` is a dict (e.g. an API error payload) or a paper
        reports an embedding model other than ``expected_model``.
    """
    import warnings

    columns = ["id_", "title", "year", "venue", "abstract", "tldr", "authors"]

    if isinstance(json_response, dict):
        # A dict here is not a list of papers; iterating it would only yield
        # its keys and fail with an unrelated TypeError further down.
        raise ValueError(f"expected a list of papers, got: {json_response!r}")

    id_to_vector = {}
    id_info = []
    skipped = 0
    for p in json_response:
        embedding = p.get("embedding") if p is not None else None
        if not embedding:
            skipped += 1
            continue
        # Explicit raise instead of `assert`, which is stripped under -O.
        if expected_model is not None and embedding["model"] != expected_model:
            raise ValueError(
                f"paper {p['paperId']} uses embedding model "
                f"{embedding['model']!r}, expected {expected_model!r}"
            )
        id_ = p["paperId"]
        id_to_vector[id_] = embedding["vector"]
        tldr_info = p.get("tldr")
        id_info.append(
            [
                id_,
                p["title"],
                p["year"],
                p["venue"],
                p["abstract"],
                tldr_info["text"] if tldr_info is not None else "",
                [a["name"] for a in p["authors"]],
            ]
        )
    if skipped:
        warnings.warn(
            f"skipped {skipped} paper(s) without data or embedding",
            stacklevel=2,
        )
    id_info_df = pd.DataFrame(id_info, columns=columns)
    return id_info_df, id_to_vector

In [None]:
id_info_df, id_to_vector = parse_paper_batch_info(j3)

### Finetuning recommendations

In [None]:
pd.set_option("display.max_colwidth", None)

In [None]:
id_info_df[["id_", "title"]]

In [None]:
liked_ids = [
    "0824e6f75e18325a79b11e3e4a118409e3297f97",
    "d0c5f901868f6e2cb126fd51b155f631372a9669",
]
disliked_ids = [
    "13d51ef5fbf4447bd9d58283387f1610a4fcfce4",
    "b6407952a59dd1664e44e3a6336f91e8599aa30f",
    "b9c8fed348084c5b31722f5433ea805299d006aa",
    "48de1a31cca6631bd73a5d0854acfda5e5195d66",
    "0e908cfd65ebd60c690e2aabcb9e0d67bdcbfb81",
]

In [None]:
# Tag every paper with its role in the plot/ranking. Precedence is
# liked > disliked > query > "value" (everything else).
liked_lookup = set(liked_ids)
disliked_lookup = set(disliked_ids)
id_types = []
for i in id_info_df.loc[:, "id_"]:
    if i in liked_lookup:
        id_types.append("liked")
    elif i in disliked_lookup:
        id_types.append("disliked")
    elif i == paper_id:
        id_types.append("query")
    else:
        id_types.append("value")
# DataFrame.insert raises ValueError if the column already exists, which made
# this cell fail when re-run; overwrite the column in that case instead.
if "type" in id_info_df.columns:
    id_info_df["type"] = id_types
else:
    id_info_df.insert(0, "type", id_types)

In [None]:
attractor_ids = liked_ids + [paper_id]
detractor_ids = disliked_ids.copy()

In [None]:
def print_paper_info(paper_ids, paper_info_df):
    """Print each id followed by the matching ``title`` entries.

    For every id in ``paper_ids`` this prints the id, then the ``title``
    Series of the rows in ``paper_info_df`` whose ``id_`` equals it (an empty
    Series when nothing matches), then a blank line.
    """
    for pid in paper_ids:
        is_match = paper_info_df["id_"] == pid
        matching_titles = paper_info_df.loc[is_match, "title"]
        # Trailing "" reproduces the blank separator line after each entry.
        print(pid, matching_titles, "", sep="\n")

In [None]:
print_paper_info(attractor_ids, id_info_df)

In [None]:
print_paper_info(detractor_ids, id_info_df)

In [None]:
assert all((i in id_to_vector for i in attractor_ids))

In [None]:
assert all((i in id_to_vector for i in detractor_ids))

In [None]:
embed_dim = 768

In [None]:
def embed_matrix(ids, id_to_vector, embed_dim):
    """Stack the vectors of ``ids`` into a 2-D float array.

    Row ``k`` holds ``id_to_vector[ids[k]]``; every vector must have
    ``embed_dim`` entries. An id missing from ``id_to_vector`` raises
    ``KeyError``.
    """
    # A sub-array dtype makes each iterated vector one row of the result.
    row_dtype = np.dtype((float, embed_dim))
    rows = map(id_to_vector.__getitem__, ids)
    return np.fromiter(rows, dtype=row_dtype)

In [None]:
attractor_ids_mat = embed_matrix(attractor_ids, id_to_vector, embed_dim)
detractor_ids_mat = embed_matrix(detractor_ids, id_to_vector, embed_dim)

In [None]:
assert attractor_ids_mat.shape == (len(attractor_ids), embed_dim)
assert detractor_ids_mat.shape == (len(detractor_ids), embed_dim)

In [None]:
query_term = "distant supervision biomedical text mining"

In [None]:
# Keyword search for candidate papers. The free-text query contains spaces,
# so it is passed through `params` and URL-encoded by httpx instead of being
# interpolated raw into the URL. HTTP errors raise here rather than showing
# up later as a missing "data" key.
r4 = httpx.get(
    "https://api.semanticscholar.org/graph/v1/paper/search",
    params={
        "query": query_term,
        "fields": "title,url,abstract,authors",
        "limit": 50,
    },
)
r4.raise_for_status()

In [None]:
j4 = r4.json()

In [None]:
new_query_ids = [p["paperId"] for p in j4["data"]]

In [None]:
j5 = get_paper_batch_info(new_query_ids)

In [None]:
new_query_id_df, new_query_id_to_vector = parse_paper_batch_info(j5)

In [None]:
# A search hit without an entry in new_query_id_to_vector cannot be embedded.
# Drop such ids first so the lookup below cannot raise KeyError and row k of
# the matrix always corresponds to new_query_ids[k].
new_query_ids = [i for i in new_query_ids if i in new_query_id_to_vector]
new_query_ids_mat = embed_matrix(new_query_ids, new_query_id_to_vector, embed_dim)
new_query_ids_mat.shape

In [None]:
assert new_query_ids_mat.shape == (len(new_query_ids), embed_dim)

In [None]:
def d(a, m):
    """Return ``1 - cosine_similarity(a, m)`` for the given inputs."""
    similarity = cosine_similarity(a, m)
    return 1 - similarity

In [None]:
# Distances (1 - cosine similarity) from the search-result embeddings to the
# attractor embeddings and to the detractor embeddings.
attractor_d = d(new_query_ids_mat, attractor_ids_mat)
detractor_d = d(new_query_ids_mat, detractor_ids_mat)

In [None]:
# Per-row score: smallest attractor distance minus smallest detractor
# distance. Lower values mean the row is closer to some attractor than to any
# detractor.
loss = attractor_d.min(axis=1) - detractor_d.min(axis=1)

In [None]:
n = 5

In [None]:
# Positions of the n smallest loss values, best first.
keep_query_id = np.argsort(loss)[:n]
keep_query_id

In [None]:
print_paper_info([new_query_ids[i] for i in keep_query_id], new_query_id_df)

### UMAP embedding

In [None]:
import bokeh
import umap
from bokeh.models import CategoricalColorMapper, ColumnDataSource, HoverTool
from bokeh.palettes import Spectral10
from bokeh.plotting import figure, output_notebook, show

output_notebook()

In [None]:
reducer = umap.UMAP(random_state=42)

In [None]:
# One row per paper in id_info_df order, indexed by paper id, with the
# embedding components as columns.
ids = id_info_df["id_"].tolist()
embeddings = list(map(id_to_vector.__getitem__, ids))
embed_df = pd.DataFrame(data=embeddings, index=ids)
embed_df

In [None]:
reducer.fit(embed_df)

In [None]:
# Project the embeddings with the fitted reducer and label the two output
# columns; the id column lets the result be joined back to the metadata.
embed_df_t = pd.DataFrame(reducer.transform(embed_df), columns=["x", "y"])
embed_df_t["id_"] = ids

In [None]:
df_t = embed_df_t.merge(id_info_df, on="id_", validate="one_to_one")

In [None]:
# Build the Bokeh data source from df_t: one entry per column, keyed by the
# column name converted to str, holding that column's underlying values.
datasource = ColumnDataSource({str(c): v.values for c, v in df_t.items()})

In [None]:
tooltips = [
    ("(x,y)", "($x, $y)"),
    ("id", "@id_"),
    ("title", "@title"),
]

In [None]:
plot_figure = figure(
    title="UMAP projection of papers",
    width=800,
    height=800,
    tooltips=tooltips,
)

In [None]:
color_map = CategoricalColorMapper(
    palette=["black", "orange", "blue", "grey"],
    factors=["query", "liked", "disliked", "value"],
)

In [None]:
# Draw one marker per paper, coloured by its "type" column through color_map.
# `scatter` (default marker: circle) is used because calling `circle` with a
# `size` argument is deprecated in recent Bokeh releases.
plot_figure.scatter(
    "x",
    "y",
    source=datasource,
    size=20,
    color={"field": "type", "transform": color_map},
    alpha=0.5,
)
show(plot_figure)

## BioGPT

In [None]:
from transformers import BioGptForCausalLM, BioGptTokenizer, pipeline, set_seed

In [None]:
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt-large")
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt-large")

In [None]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
set_seed(42)
generator("COVID-19 is", max_length=20, num_return_sequences=5, do_sample=True)

In [None]:
# Summarisation prompt for the seed paper: instruction + abstract, followed
# by the "Summary: " cue the model is expected to continue from.
article_prompt = f"Task: please summarize the following article. Article: {j1['abstract']}"
input_text = article_prompt + " Summary: "
input_text

In [None]:
generator(input_text, max_new_tokens=200, num_return_sequences=5, do_sample=True)