In [309]:
# Load medrxiv_abstracts_processed.csv, which sits one directory above `data_dir`.
abstract_file = data_dir / ".." / "medrxiv_abstracts_processed.csv"
# Fail fast, and say which path was expected, if the input file is missing.
assert abstract_file.exists(), f"Missing input file: {abstract_file}"
abstracts = pd.read_csv(abstract_file)
# `DataFrame.info()` writes its summary to stdout itself and returns None, so
# wrapping it in `print()` only appends a stray "None" line to the output.
abstracts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26846 entries, 0 to 26845
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   doi                               26846 non-null  object
 1   title                             26846 non-null  object
 2   authors                           26846 non-null  object
 3   author_corresponding              26846 non-null  object
 4   author_corresponding_institution  26837 non-null  object
 5   date                              26846 non-null  object
 6   version                           26846 non-null  int64 
 7   type                              26846 non-null  object
 8   license                           26833 non-null  object
 9   category                          26846 non-null  object
 10  jatsxml                           26846 non-null  object
 11  abstract                          26846 non-null  object
 12  published         

In [310]:
# Keep only the `doi` and `date` columns of the abstracts table.
doi_details = abstracts.loc[:, ["doi", "date"]]
doi_details.head()

Unnamed: 0,doi,date
0,10.1101/19000109,2020-03-16
1,10.1101/19000653,2020-01-03
2,10.1101/19001693,2020-01-08
3,10.1101/19001719,2020-01-23
4,10.1101/19001495,2020-03-27


In [311]:
# Flatten the per-document triples into one row per (doi, triple) pair.
# The columns are named explicitly so the frame stays mergeable even when no
# document has any triples (an empty record list would otherwise produce a
# frame without a "doi" column and the merge below would fail).
triple_rows = (
    py_.chain(triples)
    .filter(lambda entry: len(entry["triples"]) > 0)
    .map(
        lambda entry: [
            {
                "doi": entry["doi"],
                "triple": triple["triple_text"],
            }
            for triple in entry["triples"]
        ]
    )
    .flatten()
    .thru(lambda records: pd.DataFrame(records, columns=["doi", "triple"]))
    .value()
)
# Merge from `abstracts` rather than from `doi_details` itself, so re-running
# this cell does not merge the triples a second time (which would multiply the
# rows and produce `triple_x` / `triple_y` columns).
doi_details = abstracts[["doi", "date"]].merge(triple_rows, on=["doi"])
doi_details.head()

Unnamed: 0,doi,date,triple
0,10.1101/19000653,2020-01-03,Behavior Therapy:TREATS:Attention Deficit Diso...
1,10.1101/19000653,2020-01-03,Attention Deficit Disorder:COEXISTS_WITH:Hemis...
2,10.1101/19000653,2020-01-03,Evaluation:TREATS:Complex Regional Pain Syndromes
3,10.1101/19001693,2020-01-08,Angioscopy:TREATS:Patients
4,10.1101/19001719,2020-01-23,Antihypertensive Agents:TREATS:Mood Disorders


In [315]:
# import re
def _contain_terms(text, terms):
    res = False
    for term in terms:
        exists = term.lower().strip() in text.lower().strip()
        if exists:
            return exists
    return res


# Smoke-check the matcher on one example triple, printing one result per line
# for the probes "obesity", "Asthma" and "apple" in that order.
TEXT = "Obesity:CAUSES:Asthma"
print(
    *(_contain_terms(TEXT, [probe]) for probe in ("obesity", "Asthma", "apple")),
    sep="\n",
)

True
True
False


In [319]:
from functools import partial

# Theme predicates: each one takes a text and reports whether it mentions any
# keyword of the theme. Matching is delegated to `_contain_terms`, so stems
# such as "hypertens" and "depress" are matched as substrings.
theme_covid_p = partial(
    # "sars-cov-2" is already covered by the shorter "sars" keyword; it is
    # listed only to make the intent explicit.
    _contain_terms, terms=["coronavirus", "covid", "sars", "sars-cov-2"]
)
theme_obesity_p = partial(
    _contain_terms,
    terms=[
        "obesity",
        "diabetes",
        "hypertens",
        "overweight",
        "body mass index",
        "blood pressure",
    ],
)
theme_mental_p = partial(
    _contain_terms,
    terms=["mental", "schizophrenia", "depress", "alzheimer", "parkinson"],
)


def theme_others_p(x):
    """Return True when `x` matches none of the three named themes."""
    return not any(
        predicate(x) for predicate in (theme_covid_p, theme_obesity_p, theme_mental_p)
    )

# Add one boolean flag column per theme, computed from the `triple` text, and
# parse the `date` strings (format YYYY-MM-DD) into datetimes.
doi_details = doi_details.assign(
    **{
        column: doi_details["triple"].apply(predicate)
        for column, predicate in (
            ("theme_covid", theme_covid_p),
            ("theme_obesity", theme_obesity_p),
            ("theme_mental", theme_mental_p),
            ("theme_others", theme_others_p),
        )
    },
    date=pd.to_datetime(doi_details["date"].to_list(), format="%Y-%m-%d"),
)
doi_details

Unnamed: 0,doi,date,triple,theme_covid,theme_obesity,theme_mental,theme_others
0,10.1101/19000653,2020-01-03,Behavior Therapy:TREATS:Attention Deficit Diso...,False,False,False,True
1,10.1101/19000653,2020-01-03,Attention Deficit Disorder:COEXISTS_WITH:Hemis...,False,False,False,True
2,10.1101/19000653,2020-01-03,Evaluation:TREATS:Complex Regional Pain Syndromes,False,False,False,True
3,10.1101/19001693,2020-01-08,Angioscopy:TREATS:Patients,False,False,False,True
4,10.1101/19001719,2020-01-23,Antihypertensive Agents:TREATS:Mood Disorders,False,True,False,False
...,...,...,...,...,...,...,...
14431,10.1101/2021.12.27.21268432,2021-12-30,Copy Number Polymorphism:ASSOCIATED_WITH:Disease,False,False,False,True
14432,10.1101/2021.12.27.21268432,2021-12-30,Clinical Significance:COEXISTS_WITH:Hereditary...,False,False,False,True
14433,10.1101/2021.12.28.21268379,2021-12-30,HIV 2 p26:ASSOCIATED_WITH:Disease Progression,False,False,False,True
14434,10.1101/2021.12.29.21268487,2021-12-31,ACE2 gene:AFFECTS:Uptake,False,False,False,True


In [321]:
# Inspect the `date` column; its dtype is reported at the end of the output.
doi_details.loc[:, "date"]

0       2020-01-03
1       2020-01-03
2       2020-01-03
3       2020-01-08
4       2020-01-23
           ...    
14431   2021-12-30
14432   2021-12-30
14433   2021-12-30
14434   2021-12-31
14435   2021-12-31
Name: date, Length: 14436, dtype: datetime64[ns]

In [326]:
# Monthly totals: each boolean theme flag sums to the number of flagged rows,
# and the constant `doi` column sums to the number of rows in that month.
# NOTE(review): newer pandas releases prefer the "ME" alias over "M" for
# month-end resampling; confirm against the installed version before changing.
month_summary = (
    doi_details.assign(doi=1)  # constant 1 per row so that summing counts rows
    .set_index("date")
    .loc[:, ["theme_covid", "theme_obesity", "theme_mental", "theme_others", "doi"]]
    .resample("M")
    .sum()
)
month_summary

Unnamed: 0_level_0,theme_covid,theme_obesity,theme_mental,theme_others,doi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-31,0,8,11,184,203
2020-02-29,10,4,15,244,273
2020-03-31,17,22,15,391,442
2020-04-30,18,30,14,574,636
2020-05-31,15,35,31,832,913
2020-06-30,18,28,33,779,858
2020-07-31,14,17,34,602,665
2020-08-31,12,24,22,547,605
2020-09-30,6,34,48,596,682
2020-10-31,4,18,24,490,536
