In [None]:
import os
import sys

import pandas as pd

# the pubmap module lives in ../code, so that folder must be on sys.path first
sys.path.append('../code')
from pubmap import retrieve_pubmed, analyse_pubmed
from pubmap import get_coauthors, get_nodes, get_data, save_by_year, get_info

In [None]:
# Folder that holds the cached tables and receives all exported files.
# Set the PUBMAP_DATA_PATH environment variable to run the notebook on another
# machine; without it the original location is used unchanged.
data_path = os.environ.get(
    "PUBMAP_DATA_PATH",
    "/Users/martinszyska/Sites/node/pubmap/public/data",
)

## do the pubmed query and clean

In [None]:
# PubMed author search: everything by "Reinke P" or "Volk HD".
pubmed_query = "Reinke P[au] OR Volk HD[au]"
pubmed_df = retrieve_pubmed(pubmed_query)
# peek at the first two hits
pubmed_df.head(2)

### clear mistakes

In [None]:
# Get rid of the last two entries and of one paper that is excluded by its title.
pubmed_clean_df = (
    pubmed_df
    .iloc[:-2]
    .query('title != "The role of arousal in memory for conversation."')
)

In [None]:
def check_names(pub):
    """Keep a publication only if one of the two target authors is listed.

    A publication qualifies when its ``authors`` entry contains the exact
    name ``"Reinke,P"`` or any name starting with ``"Volk,"`` (matching is
    case-sensitive).

    Parameters
    ----------
    pub : mapping (e.g. a DataFrame row)
        Must provide an iterable of author name strings under ``'authors'``.

    Returns
    -------
    The unchanged ``pub`` when it qualifies, otherwise ``None``.
    """
    has_target_author = any(
        name == "Reinke,P" or name.startswith("Volk,")
        for name in pub['authors']
    )
    return pub if has_target_author else None

In [None]:
# Rows rejected by check_names come back empty (NaN); "title == title" is False
# for NaN, so the query drops exactly those rows.
pubmed_clean2_df = (
    pubmed_clean_df
    .apply(check_names, axis=1)
    .query("title == title")
    .reset_index(drop=True)
)
# Change the lowercase "volk,HD" to "Volk,HD" by content rather than by a fixed
# row/column position, which would hit the wrong paper as soon as the query
# result changes.
# NOTE(review): assumes the patched entry differed from the PubMed data only in
# the case of "volk,HD" - confirm.
pubmed_clean2_df["authors"] = pubmed_clean2_df["authors"].map(
    lambda names: ["Volk,HD" if name == "volk,HD" else name for name in names]
)
pubmed_clean2_df.head(3)

In [None]:
# NOTE(review): get_author_pos was never imported in this notebook, so this cell
# raised NameError on a fresh kernel. It is presumably exported by pubmap like
# the other helpers - confirm.
from pubmap import get_author_pos

# Apply get_author_pos to every publication row, then drop the affiliations column.
result_df = (
    pubmed_clean2_df
    .apply(get_author_pos, axis=1)
    .drop(columns="affiliations")
)
result_df.head(3)

### remove the huge publications!!
+ these would inflate nodes immensely (20K!)

In [None]:
# Show exactly the publications the filter applied afterwards removes: that
# filter keeps n_authors < 200, so a paper with 200 authors must be listed here too.
result_df.query("n_authors >= 200")

In [None]:
# keep only publications with fewer than 200 authors
result_df = result_df.loc[result_df["n_authors"] < 200]

### save/load results

In [None]:
# Saving is disabled; the publication table is read back from the cached file.
# result_df.to_csv(os.path.join(data_path, 'rv_pubs.csv'), sep="\t", index=False)
rv_pubs_file = os.path.join(data_path, 'rv_pubs.csv')
result_df = pd.read_csv(rv_pubs_file, sep="\t")
result_df.head(3)

## RUN the analysis

In [None]:
# Run the analysis in one call; output goes to the data folder.
analyse_pubmed(
    result_df,
    outfolder=data_path,
    max_nodes=250,
    min_power=1,
    min_weight=1,
)

## STEP BY STEP

### get the coauthor linkage

In [None]:
# Derive the co-author table from the publication frame with pubmap's get_coauthors.
# Later cells read the columns 'A' and 'date' from this table.
coauthors = get_coauthors(result_df)
coauthors

### normalise the spelling of von Baehr

In [None]:
# list every row whose A entry contains "Baehr"
coauthors[coauthors['A'].str.contains("Baehr")]

In [None]:
# Collapse every spelling that contains "Baehr" onto the canonical "von Baehr,R".
# na=False keeps missing names from breaking the boolean mask.
# NOTE(review): column B is normalised as well (when present), on the assumption
# that it holds co-author names like A; otherwise the variant spellings would
# survive as separate nodes - confirm.
for name_col in ("A", "B"):
    if name_col in coauthors.columns:
        is_baehr = coauthors[name_col].str.contains("Baehr", na=False)
        coauthors.loc[is_baehr, name_col] = "von Baehr,R"
# show the affected rows after the change
coauthors.loc[coauthors['A'].str.contains("Baehr", na=False), :]

In [None]:
# Saving is disabled; the co-author table is read back from the cached file.
# coauthors.to_csv(os.path.join(data_path, 'coauthors.csv'), sep="\t", index=False)
coauthors_file = os.path.join(data_path, 'coauthors.csv')
coauthors = pd.read_csv(coauthors_file, sep="\t")

### get the nodes from the unique names in A and B

In [None]:
# The pubmap helpers are imported once in the notebook's first cell, so no
# import is repeated here (it used to sit after the first use of get_coauthors).
nodes = get_nodes(coauthors)
nodes

### store the ids of the global nodes list for unique ids

In [None]:
# Turn the index of `nodes` into an explicit `id` column and keep only id + name.
node_ids = (
    nodes
    .reset_index()
    .rename(columns={'index': 'id'})
    .loc[:, ['id', 'name']]
)
node_ids

In [None]:
# Only the third element of get_data's return triple (the edges) is kept here.
# No filter keywords are passed, so get_data runs with its defaults.
_, _, edges = get_data(coauthors, node_ids)
edges

### get the edges for the linkage
+ here the nodes will be reduced according to year of occurrence

In [None]:
# Same call with explicit limits; this rebinds both `nodes` and `edges`.
_, nodes, edges = get_data(
    coauthors,
    node_ids,
    min_weight=1,
    after=1980,
    year=2010,
    max_nodes=250,
)
nodes

In [None]:
# write both tables as tab-separated files into the data folder
for frame, filename in ((nodes, "pubmap_nodes.csv"), (edges, "pubmap_edges.csv")):
    frame.to_csv(os.path.join(data_path, filename), sep="\t", index=False)

In [None]:
json_folder = os.path.join(data_path, "pubmap")
# create the output folder up front so the export cannot fail on a missing directory
os.makedirs(json_folder, exist_ok=True)

# one export per distinct value of the date column, in ascending order
for year in coauthors['date'].sort_values().unique():
    print(year)
    # The return value is not needed; leaving it unassigned avoids overwriting
    # IPython's `_` (last output) variable.
    save_by_year(
        coauthors,
        node_ids,
        year=year,
        past_years=25,
        save_folder=json_folder,
        max_nodes=250,
        min_power=2,
        min_weight=1,
    )