In [1]:
import sys
sys.path.append('../code')
import os
from pubmap import retrieve_pubmed, analyse_pubmed, get_author_pos

In [2]:
# Folder holding the tab-separated tables this notebook writes and re-reads.
# Set the PUBMAP_DATA_PATH environment variable to run the notebook on another
# machine; when it is unset the original local folder is used, exactly as before.
data_path = os.environ.get(
    "PUBMAP_DATA_PATH",
    "/Users/martinszyska/Sites/node/pubmap/pubmap_api/data",
)

In [None]:
# Query string handed to retrieve_pubmed: two author-tagged terms joined by OR.
pubmed_query = "Reinke P[au] OR Volk HD[au]"
pubmed_df = retrieve_pubmed(pubmed_query)

# Preview the first two retrieved rows.
pubmed_df.head(2)

### Remove obvious mistakes

In [None]:
# Drop the last two entries, then the one publication excluded by its title.
# NOTE(review): both removals are tied to the result set as it was retrieved;
# re-check them whenever the query is re-run.
without_tail_df = pubmed_df.iloc[:-2]
pubmed_clean_df = without_tail_df.query('title != "The role of arousal in memory for conversation."')

In [None]:
def check_names(pub):
    """Return `pub` if it lists one of the wanted authors, otherwise None.

    A publication qualifies when at least one entry of pub['authors'] is
    exactly "Reinke,P" or starts with "Volk,".
    """
    # Evaluate every author (not just up to the first hit) before deciding.
    hits = [name == "Reinke,P" or name.startswith("Volk,") for name in pub['authors']]
    return pub if any(hits) else None

In [None]:
# Keep only the publications check_names() accepts. Rejected rows (check_names
# returns None) presumably come back from apply() as NaN rows; the
# self-comparison `title == title` is False for NaN, so the query drops them.
pubmed_clean2_df = pubmed_clean_df.apply(check_names, axis=1).query("title == title").reset_index(drop=True)

# Normalise the lower-case "volk,HD" spelling to "Volk,HD" wherever it occurs.
# This replaces a fix pinned to positional cell [752, 5], which only pointed at
# the right record for one particular snapshot of the PubMed result.
pubmed_clean2_df['authors'] = pubmed_clean2_df['authors'].map(
    lambda authors: ['Volk,HD' if name == 'volk,HD' else name for name in authors]
)
pubmed_clean2_df.head(3)

In [None]:
# Run get_author_pos on every publication row, then discard the
# 'affiliations' column from the result.
with_positions_df = pubmed_clean2_df.apply(get_author_pos, axis=1)
result_df = with_positions_df.drop(columns="affiliations")

# Preview the first three rows.
result_df.head(3)

### save results

In [None]:
# Persist the cleaned publication table as a tab-separated file without the index.
rv_pubs_file = os.path.join(data_path, 'rv_pubs.csv')
result_df.to_csv(rv_pubs_file, sep="\t", index=False)

## RUN the analysis

In [None]:
# One-shot run of the whole analysis on the cleaned publication table.
# NOTE(review): outfolder is an absolute path on the author's machine.
analyse_pubmed(
    result_df,
    outfolder="/Users/martinszyska/Sites/node/pubmap/public/data",
    max_nodes=250,
    min_power=1,
    min_weight=1,
)

## STEP BY STEP

### get the coauthor linkage

In [None]:
# NOTE(review): this import sits mid-notebook; the import cell at the top is the usual place.
from pubmap import get_coauthors
# Derive the co-author table from the cleaned publication table.
coauthors = get_coauthors(result_df)
# Bare last expression: display the resulting table.
coauthors

### Unify the spelling of "von Baehr"

In [None]:
# Inspect every row whose 'A' entry contains "Baehr" before renaming.
baehr_rows = coauthors['A'].str.contains("Baehr")
coauthors[baehr_rows]

In [None]:
# Collapse all 'A' entries containing "Baehr" into the single spelling "von Baehr,R".
# NOTE(review): only column 'A' is rewritten here; confirm whether the same
# spelling variants also occur in the other name column and need the same fix.
is_baehr = coauthors['A'].str.contains("Baehr")
coauthors.loc[is_baehr, "A"] = "von Baehr,R"

# Show the renamed rows (the new spelling still contains "Baehr").
coauthors[is_baehr]

In [3]:
# pandas is needed for read_csv but is not imported anywhere above, so this
# cell raised NameError on a fresh kernel.
import pandas as pd

# Reload the curated co-author table from its tab-separated cache, so the
# steps below can run without repeating the retrieval and clean-up cells.
# To refresh the cache after re-running those cells, execute once:
# coauthors.to_csv(os.path.join(data_path, 'coauthors.csv'), sep="\t", index=False)
coauthors = pd.read_csv(os.path.join(data_path, 'coauthors.csv'), sep="\t")

### get the nodes from the unique names in A and B

In [4]:
# Project helpers used by the step-by-step cells below.
from pubmap import (
    get_coauthors,
    get_nodes,
    get_data,
    save_by_year,
    get_info,
)

# Global node table derived from the co-author table; displayed below.
nodes = get_nodes(coauthors)
nodes

Unnamed: 0,name,power,last,group
0,"Volk,HD",6392,2021,1
1,"Reinke,P",2858,2021,1
2,"Sawitzki,B",1320,2021,1
3,"Kunkel,D",792,2021,1
4,"Radbruch,A",761,2021,1
...,...,...,...,...
3308,"Bimmler,M",2,1986,1
3309,"Ittenson,A",2,1985,1
3310,"Kabat,NW",2,2020,1
3311,"Blades,WH",1,2018,1


### store the ids of the global nodes list for unique ids

In [5]:
# Turn the row index of the global node table into an explicit 'id' column and
# keep only the id/name pairs.
# Note: a later cell rebinds `nodes`, so run this cell before that one.
node_ids = (
    nodes
    .reset_index()
    .rename(columns={'index': 'id'})
    [['id', 'name']]
)
node_ids

Unnamed: 0,id,name
0,0,"Volk,HD"
1,1,"Reinke,P"
2,2,"Sawitzki,B"
3,3,"Kunkel,D"
4,4,"Radbruch,A"
...,...,...
3308,3308,"Bimmler,M"
3309,3309,"Ittenson,A"
3310,3310,"Kabat,NW"
3311,3311,"Blades,WH"


In [6]:
# get_data returns three values; only the third (the edge table) is kept here.
# Called with its default filtering arguments.
_, _, edges = get_data(coauthors, node_ids)
# Bare last expression: display the edge table.
edges

Unnamed: 0,source,target,weight
0,1,0,151
1,194,0,76
2,5,0,36
3,193,0,54
4,445,0,54
...,...,...,...
137359,2647,2557,1
137360,2638,2666,1
137361,3163,3179,1
137362,2695,2694,1


### get the edges for the linkage
+ here the nodes will be reduced according to year of occurrence

In [7]:
# Same call as before, now with explicit filter arguments.
# Note: this rebinds `nodes` (previously the global node table) and `edges`.
_, nodes, edges = get_data(
    coauthors,
    node_ids,
    min_weight=1,
    after=1980,
    year=2010,
    max_nodes=250,
)
nodes

Unnamed: 0,id,name,power,last,group
0,0,"Volk,HD",3925,0,1
1,1,"Reinke,P",1191,0,1
2,194,"Döcke,WD",461,1,1
3,198,"Pratschke,J",430,0,1
4,442,"Tullius,SG",378,1,1
...,...,...,...,...,...
245,1141,"Löhler,J",33,2,1
246,1159,"Chapman,S",33,0,1
247,1167,"Sharif,A",33,0,1
248,1144,"Seyfert-Margolis,V",33,0,1


In [9]:
# Write the reduced node and edge tables as tab-separated files without the index.
for table, filename in ((nodes, "pubmap_nodes.csv"), (edges, "pubmap_edges.csv")):
    table.to_csv(os.path.join(data_path, filename), sep="\t", index=False)

In [10]:
# Export one data set per distinct publication year, in ascending order.
# NOTE(review): json_folder is an absolute path on the author's machine.
json_folder = "/Users/martinszyska/Sites/node/pubmap/public/data/pubmap"
years = coauthors['date'].sort_values().unique()
for year in years:
    print(year)  # progress marker
    # Both return values are discarded; the call is made for the files it saves.
    _, _ = save_by_year(
        coauthors, node_ids,
        year=year, past_years=25, save_folder=json_folder,
        max_nodes=250, min_power=2, min_weight=1,
    )

1977
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
