In [None]:
import os
import sys

import pandas as pd

sys.path.append('../code')
from pubmap import retrieve_pubmed, analyse_pubmed

# Run the Analysis

In [None]:
# Machine-specific absolute path of the data folder; it is also passed on as `outfolder`.
data_path = "/Users/martinszyska/Sites/node/pubmap/public/data"

# Read the filtered, tab-separated publication table and hand it to the one-call analysis.
result_df = pd.read_csv(
    os.path.join(data_path, 'rv_pubs_filtered.csv'),
    sep="\t",
)
analyse_pubmed(
    result_df,
    outfolder=data_path,
    max_nodes=250,
    min_power=1,
    min_weight=1,
    past_years=100,
)

# Step by step
### run the PubMed query and clean the results

In [None]:
# Helpers used by the step-by-step cells below.
from pubmap import retrieve_pubmed, get_author_pos
# Machine-specific absolute path of the data folder — adjust when running on another machine.
data_path = "/Users/martinszyska/Sites/node/pubmap/public/data"

In [None]:
# PubMed author search: publications by either of the two authors.
pubmed_query = "Reinke P[au] OR Volk HD[au]"
pubmed_df = retrieve_pubmed(pubmed_query)
# Peek at the first two retrieved records.
pubmed_df.head(2)

### clear mistakes

In [None]:
# Get rid of the last two entries and of one unrelated publication (matched by its title).
# NOTE(review): the positional "last two" cut depends on the current query result — re-check after re-querying.
pubmed_clean_df = (
    pubmed_df
    .iloc[:-2, :]
    .query('title != "The role of arousal in memory for conversation."')
)

In [None]:
def check_names(pub):
    """Return ``pub`` when its author list names one of the two target authors.

    A publication qualifies if any entry of ``pub['authors']`` equals
    "Reinke,P" exactly or starts with "Volk,". Publications that do not
    qualify yield ``None``.
    """
    def is_target(author):
        return author == "Reinke,P" or author.startswith("Volk,")

    matching = list(filter(is_target, pub['authors']))
    return pub if matching else None

In [None]:
# Keep only the publications accepted by check_names. Rows for which it returned
# None are expected to surface with a missing title and are removed by the
# `title == title` self-comparison (NaN is never equal to itself).
pubmed_clean2_df = (
    pubmed_clean_df
    .apply(check_names, axis=1)
    .query("title == title")
    .reset_index(drop=True)
)
# Change the lower-case spelling "volk,HD" to "Volk,HD".
# The entry is located by its content rather than by a fixed row/column position
# (formerly iloc[752, 5]), which pointed at a different record as soon as the
# PubMed query returned a different number of publications.
# NOTE(review): assumes positional column 5 was `authors` — confirm.
pubmed_clean2_df['authors'] = pubmed_clean2_df['authors'].apply(
    lambda authors: ['Volk,HD' if author == 'volk,HD' else author for author in authors]
)
pubmed_clean2_df[:3]

In [None]:
# Run get_author_pos on every publication row, then discard the affiliations column.
result_df = (
    pubmed_clean2_df
    .apply(get_author_pos, axis=1)
    .drop(columns="affiliations")
)
# Persist the (still unfiltered) table as a tab-separated file without the index column.
result_df.to_csv(
    os.path.join(data_path, 'rv_pubs.csv'),
    sep="\t",
    index=False,
)
result_df.head(3)

### remove the huge publications!!
+ these would inflate nodes immensely (20K!)

In [None]:
# Preview the publications with more than 50 authors before the filtering step.
result_df.query("n_authors > 50")

In [None]:
# Keep publications with at most 50 authors — the exact complement of the
# "n_authors > 50" preview. The previous "< 50" also dropped papers with exactly
# 50 authors, which were never shown as "huge".
result_df = result_df.query("n_authors <= 50")
# Show the largest author lists that remain after filtering.
result_df.query("n_authors > 30")

### save/load results

In [None]:
# Uncomment the next line to persist the filtered table before reloading it:
# result_df.to_csv(os.path.join(data_path, 'rv_pubs_filtered.csv'), sep="\t", index=False)
# Reload the filtered table from disk; this rebinds `result_df` to the file's content.
result_df = pd.read_csv(
    os.path.join(data_path, 'rv_pubs_filtered.csv'),
    sep="\t",
)
result_df.head(3)

## get the coauthor linkage

In [None]:
from pubmap import get_coauthors
# Build the co-author table from the publication table and display it for inspection.
coauthors = get_coauthors(result_df)
coauthors

### normalize the spelling of von Baehr

In [None]:
# Show every row whose 'A' name contains "Baehr", to inspect them before the renaming step.
coauthors.loc[coauthors['A'].str.contains("Baehr"), :]

In [None]:
# Every 'A' entry containing "Baehr" is overwritten with the canonical "von Baehr,R".
# NOTE(review): only column 'A' is normalised here — check whether the partner column needs the same fix.
baehr_rows = coauthors['A'].str.contains("Baehr")
coauthors.loc[baehr_rows, "A"] = "von Baehr,R"
# The same rows still match after renaming, because the canonical name contains "Baehr" too.
coauthors.loc[baehr_rows, :]

### save/load

In [None]:
from pubmap import get_nodes, get_edges, get_info, retrieve_data, to_json_dict
# Machine-specific absolute path of the data folder — adjust when running on another machine.
data_path = "/Users/martinszyska/Sites/node/pubmap/public/data"
# Uncomment the next line to persist the current in-memory table before it is reloaded:
# coauthors.to_csv(os.path.join(data_path, 'coauthors.csv'), sep="\t", index=False)
# Reload from disk; this rebinds `coauthors`, so in-memory edits survive only if they were saved first.
coauthors = pd.read_csv(os.path.join(data_path, 'coauthors.csv'), sep="\t")

### get the nodes from the unique names in A and B

In [None]:
# Build the node table from the co-author table and display it.
nodes = get_nodes(coauthors)
nodes

### store the ids of the global nodes list for unique ids

In [None]:
# Turn the current index into an explicit `id` column and fix the column order.
# NOTE: this rebinds `nodes`, so the cell is meant to run exactly once after get_nodes.
nodes = (
    nodes
    .reset_index()
    .rename(columns={'index': 'id'})
    .loc[:, ["id", "name", "power", "last", "group"]]
)
# Global id <-> name lookup, kept separately from the node table.
node_ids = nodes[['id', 'name']]
nodes

### get the global edges with unique ids

In [None]:
# get_edges returns a (nodes, edges) pair; `nodes` is rebound to the returned node table.
nodes, edges = get_edges(coauthors, nodes)
edges

### get the edge_ids for unique link ids

In [None]:
# Turn the current index into an explicit `id` column and fix the column order.
# NOTE: this rebinds `edges`, so the cell is meant to run exactly once after get_edges.
edges = (
    edges
    .reset_index()
    .rename(columns={"index": "id"})
    .loc[:, ['id', 'source', 'target', 'weight']]
)
# Global edge-id lookup: the edge table without the weight column.
edge_ids = edges.drop(columns="weight")
edge_ids

### test one year

In [None]:
# Parameters for a single-year trial run (there is no loop over years in this cell).
year = 2021
past_years = 50  # length of the look-back window in years, not a calendar year

json_folder = os.path.join(data_path, "pubmap")
max_nodes = 250
min_power = 0
max_edges = 0  # not used in this cell
min_weight = 1
remove_stumps = True

print(year)
# Restrict the co-author links to the window [year - past_years, year].
# The lower bound used to be `@past_years` itself (i.e. 50 <= date), which every
# publication year satisfies, so the window was never actually applied.
ca_df = coauthors.query('@year - @past_years <= date <= @year')
nodes, edges, info = retrieve_data(
    ca_df,
    node_ids,
    edge_ids,
    max_nodes=max_nodes,
    min_power=min_power,
    min_weight=min_weight,
    remove_stumps=remove_stumps,
)
nodes

In [None]:
import json

# Save the graph of the selected year below the data path.
# Create the output folder first: open() does not create missing directories
# and would otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs(json_folder, exist_ok=True)
json_file = os.path.join(json_folder, f"pubmap{year}.json")
with open(json_file, "w") as file:
    json.dump(to_json_dict(nodes, edges, info), file)

## save isolated data frame

In [None]:
def isolate_VH(nodes, edges, info):
    """Reduce a graph to the two main authors and the edge with id 0.

    Returns a tuple of: the rows of ``nodes`` whose name is "Volk,HD" or
    "Reinke,P", the rows of ``edges`` whose id equals 0, and ``info``
    passed through unchanged.
    """
    main_author_nodes = nodes.query('name in ["Volk,HD", "Reinke,P"]')
    first_edge = edges.query('id == 0')
    return main_author_nodes, first_edge, info

inodes, iedges, info = isolate_VH(nodes, edges, info)

# Write the same isolated two-author snapshot as a placeholder file for every year 2022..2049.
# open() does not create missing directories, so make sure the folder exists.
os.makedirs(json_folder, exist_ok=True)
# A dedicated loop variable keeps the notebook-global `year` intact; reusing `year`
# left it at 2049, so re-running the save cell wrote the full graph into pubmap2049.json.
for future_year in range(2022, 2050):
    json_file = os.path.join(json_folder, f"pubmap{future_year}.json")
    with open(json_file, "w") as file:
        json.dump(to_json_dict(inodes, iedges, info), file)

In [None]:
# Display the current edge table for inspection.
edges

In [None]:
# A cell renders only its last bare expression, so the node selection used to be
# computed and silently discarded. display() shows both selections.
display(
    nodes.query('name in ["Volk,HD", "Reinke,P"]'),
    edges.query('id == 0'),
)