In [1]:
import pandas as pd
import numpy as np
import time
from utilities import get_random_articles

In [2]:
# Load the two input CSV files.
all_df = pd.read_csv("data/raw_data.csv")
cited_df = pd.read_csv("data/papers_cited.csv")

# Keep only the journal articles and the three columns used downstream.
# Rows and columns are selected in a single .loc call and the result is copied
# explicitly, so the column assignments below act on an independent frame
# instead of a slice of all_df (chained indexing would raise pandas'
# SettingWithCopyWarning here).
is_journal_article = all_df["Item Type"] == "journalArticle"
all_only_articles_df = all_df.loc[
    is_journal_article, ["Author", "Title", "Publication Year"]
].copy()

# Flag the articles whose title also appears in the cited-papers file.
cited_titles = list(cited_df["Title"])
all_only_articles_df["is_cited_own_paper"] = all_only_articles_df["Title"].isin(
    cited_titles
)

# Every article loaded from these files is labelled as being of interest.
all_only_articles_df["is_of_interest"] = True

In [3]:
# Sanity check: number of missing values in each of the three columns.
# Series.count() ignores nulls, so counting "Author" inside the rows where
# "Author" is null can only ever give 0; summing the null mask of each column
# reports the actual number of missing entries.
display(int(all_only_articles_df["Publication Year"].isnull().sum()))
display(int(all_only_articles_df["Author"].isnull().sum()))
display(int(all_only_articles_df["Title"].isnull().sum()))

0

0

0

In [4]:
# Replace the column labels, in positional order, with short lowercase names.
new_column_names = ["authors", "title", "year", "is_cited_own_paper", "is_of_interest"]
all_only_articles_df.columns = new_column_names

# Convert the year column to numbers, downcast to a small integer dtype
# where pandas can do so.
year_as_number = pd.to_numeric(all_only_articles_df["year"], downcast="integer")
all_only_articles_df["year"] = year_as_number

In [5]:
def format_authors(authors):
    """Reformat an author string so it matches the arXiv API output.

    The input lists authors as "Last, First" separated by semicolons; the
    result lists them as "First Last" separated by "; ".

    Parameters
    ----------
    authors : str
        Semicolon-separated authors, each written with comma-separated name
        parts (e.g. "Doe, John; Smith, Jane").  Non-string values, such as
        the NaN pandas uses for missing data, are returned unchanged.

    Returns
    -------
    str
        The authors with the comma-separated parts of every name reversed
        and joined by single spaces (e.g. "John Doe; Jane Smith").
    """
    if not isinstance(authors, str):
        # Nothing to reformat; let missing values flow through Series.map.
        return authors

    formatted = []
    for author in authors.split(";"):
        # Strip each part on its own so that names with more than one comma
        # ("Doe, Jr., John") do not end up with doubled spaces.
        name_parts = [part.strip() for part in author.split(",")]
        full_name = " ".join(part for part in reversed(name_parts) if part)
        if full_name:
            formatted.append(full_name)
    return "; ".join(formatted)


# Rewrite every entry of the "authors" column with format_authors.
all_only_articles_df["authors"] = all_only_articles_df["authors"].map(format_authors)

I have extracted from my Zotero library various papers I was interested in (papers I cited, papers on TODO lists, etc.). Now I want a generic sample of papers I would not be interested in. While not a perfect method, I take a random sample of arXiv papers from my usual categories (hep-th, gr-qc, cond-mat.str-el), drawn 20 at a time from randomly chosen years, while checking that they are not already in my collection.

In [6]:
def _title_key(title):
    """Return *title* with runs of whitespace collapsed; used only for comparisons."""
    return " ".join(title.split()) if isinstance(title, str) else title


list_all_titles = list(all_only_articles_df["title"])
# Comparison keys of every title that must not be sampled: the titles already
# in the collection plus, as the loop runs, the titles already sampled.  A set
# gives constant-time membership tests, and the whitespace normalisation keeps
# a title containing line breaks from slipping past the check.
seen_title_keys = {_title_key(title) for title in list_all_titles}
nextra = all_only_articles_df["title"].count()
list_random_articles = []
years_sampled = []
# Whole batches are appended, so the final list may be somewhat longer than nextra.
while len(list_random_articles) < nextra:
    ret_rnd = get_random_articles()
    if ret_rnd is not None:
        year, rnd_articles = ret_rnd
        years_sampled.append(year)
        for rnd_article in rnd_articles:
            title_key = _title_key(rnd_article["title"])
            if title_key in seen_title_keys:
                # Already in the collection or already sampled in an earlier batch.
                continue
            seen_title_keys.add(title_key)
            list_random_articles.append(
                {
                    "title": rnd_article["title"],
                    "authors": "; ".join(
                        [author["name"] for author in rnd_article["authors"]]
                    ),
                    "year": year,
                    "is_cited_own_paper": False,
                    "is_of_interest": False,
                }
            )
    # Pause after every call, including the ones that returned None, so that
    # repeated failures do not turn into a tight retry loop.
    # NOTE(review): get_random_articles presumably queries the arXiv API - confirm.
    time.sleep(1)

In [7]:
# Stack the articles of interest and the randomly sampled ones into one frame.
full_df = pd.DataFrame(all_only_articles_df.to_dict("records") + list_random_articles)

# Number the rows from 1 and label the index column "Ind" in the exported file.
full_df.index = pd.Index(np.arange(1, len(full_df) + 1), name="Ind")

full_df["year"] = pd.to_numeric(full_df["year"], downcast="integer")

# Collapse every run of whitespace (including line breaks) into one space.
# Deleting "\n" outright would glue the neighbouring words together, or leave
# doubled spaces when the line break is followed by indentation.
full_df["title"] = full_df["title"].str.split().str.join(" ")

full_df.to_csv("data/zotero_data.csv")