-
Notifications
You must be signed in to change notification settings - Fork 0
/
_770_compute_neighbor_metrics.py
executable file
·144 lines (121 loc) · 5.42 KB
/
_770_compute_neighbor_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
# Author: Michael E. Rose <michael.ernst.rose@gmail.com>
"""Computes measures of productivity for network neighbors
based on journal impact factors.
"""
from glob import glob
import networkx as nx
import pandas as pd
from _313_compute_author_metrics import explode, read_jif
# Path to the journal impact factor table.
# NOTE(review): not referenced in the visible part of this file; main() obtains
# the journal metrics through read_jif() instead - confirm whether still needed.
JIF_FILE = "./751_Journal_Impact_Factors/JIFs.csv"
# CSV with one row per author, read by main() (columns "scopus_id", "years",
# "sources" and "coauthors" are used)
PUBLICATION_LIST = "./312_author_data/pub_list.csv"
# Folder searched by main() for "*auth.gexf" and "*com.gexf" network files
NETWORK_FOLDER = "./200_yearly_networks/"
# CSV written by main() with the neighbor productivity measures
TARGET_FILE = "./770_network_neighbor_productivity/both.csv"
def compute_first_neigh_prod(neigh, wpubs, window=5):
    """Compute the productivity of first neighbors excluding joint
    publications.

    Parameters
    ----------
    neigh : pd.DataFrame
        Neighbor list with the columns "index" (focal node), "t" (year)
        and "scopus_id" (neighbor).
    wpubs : pd.DataFrame
        Weighted publications; must provide the columns "scopus_id", "t",
        "authors" and "SJR".
    window : int (optional, default=5)
        Number of years covered by each neighbor observation: the
        observation year itself plus the `window`-1 preceding years.

    Returns
    -------
    pd.DataFrame
        Sum of "SJR" by "index" and "t".
    """
    neigh = neigh.sort_values(["index", "t", "scopus_id"])
    # Add years prior to actual observation. All frames are gathered first
    # and concatenated once, because DataFrame.append() no longer exists in
    # pandas >= 2.0; this also works when window <= 1 (no lagged copies).
    frames = [neigh]
    for lag in range(1, window):
        shifted = neigh.copy()
        shifted["t"] = shifted["t"] - lag
        frames.append(shifted)
    neigh = pd.concat(frames).drop_duplicates()
    # Add neighbors' publications' JIF and exclude joint publications
    neigh = neigh.merge(wpubs, "left", on=["scopus_id", "t"])
    neigh = neigh[neigh.apply(find_coauthor, axis=1)].drop("authors", axis=1)
    return neigh.groupby(["index", "t"])["SJR"].sum().reset_index()
def find_coauthor(s):
    """Return True unless `s["index"]` is listed in `s["authors"]`.

    Rows whose "authors" entry does not support membership tests (e.g. a
    missing value) are kept, i.e. True is returned for them.
    """
    try:
        is_joint = s["index"] in s["authors"]
    except TypeError:  # Coauthors missing for some strange reason
        return True
    return not is_joint
def get_neighbors(files):
    """Return DataFrames with yearly direct and indirect neighbors.

    Each file in `files` is read as a GEXF graph; the first four characters
    of its basename are used as the year.  Only nodes whose label consists
    of digits are considered.  Indirect neighbors are the neighbors of a
    node's direct neighbors that are not direct neighbors themselves.
    """
    from os.path import basename

    direct_by_year = {}
    indirect_by_year = {}
    for path in files:
        graph = nx.read_gexf(path)
        year = basename(path)[:4]
        # Direct neighbors of every numeric node
        direct = {}
        for node in graph.nodes():
            if not node.isdigit():
                continue
            direct[node] = list({k for k in graph[node].keys() if k.isdigit()})
        direct_by_year[year] = direct
        # Neighbors of neighbors, without the direct neighbors
        indirect = {}
        for node, adjacent in direct.items():
            reachable = set()
            for neighbor in adjacent:
                candidates = set(graph[neighbor].keys())
                reachable.update(c for c in candidates if c.isdigit())
            indirect[node] = list(reachable - set(adjacent))
        indirect_by_year[year] = indirect
    first = transform(pd.DataFrame.from_dict(direct_by_year).reset_index(),
                      "scopus_id")
    second = transform(pd.DataFrame.from_dict(indirect_by_year).reset_index(),
                       "scopus_id")
    return first, second
def cumulate_productivity(df, window=5):
    """Compute productivity over the current and the past `window`-1 years.

    Parameters
    ----------
    df : pd.DataFrame
        Long table with the columns "scopus_id", "t" (year) and "SJR";
        each (scopus_id, t) pair may occur only once.
    window : int (optional, default=5)
        Number of calendar years to sum over, including the current one.

    Returns
    -------
    pd.DataFrame
        Long table with the columns "scopus_id", "t" and "SJR", where "SJR"
        is the rolling sum.  Every year between the first and the last
        year in `df` is present for every scopus_id.
    """
    wide = df.pivot(index="scopus_id", columns="t", values="SJR").fillna(0)
    years = wide.columns
    # The rolling window counts columns, not calendar years.  Insert
    # zero-filled columns for years absent from the data so that a window of
    # `window` columns always spans exactly `window` consecutive years.
    if len(years) and pd.api.types.is_integer_dtype(years.dtype):
        full_range = pd.Index(range(int(years.min()), int(years.max()) + 1),
                              dtype=years.dtype, name=years.name)
        wide = wide.reindex(columns=full_range, fill_value=0.0)
    # Roll over the transposed frame: rolling(axis=1) is deprecated
    rolled = wide.T.rolling(window, min_periods=1).sum().T
    return rolled.reset_index().melt(id_vars="scopus_id", value_name="SJR")
def transform(df, value_name):
    """Melt and explode DataFrame of neighbors.

    `df` needs a column "index"; all other columns are melted into the
    variable column "t" and the value column `value_name`.  Rows without a
    value are dropped before the values are exploded.
    """
    long = df.melt(id_vars="index", var_name="t", value_name=value_name)
    long = long.dropna()
    long = long.set_index(["index", "t"])
    # presumably explode() returns "index" and "t" as regular columns again
    # (the cast of "t" below relies on it) - verify against its definition
    out = explode(long, value_name, value_name)
    for column, dtype in ((value_name, int), ("t", "uint")):
        out[column] = out[column].astype(dtype)
    return out
def main():
    """Compute the productivity of first and second network neighbors
    in the author and commenter networks and write them to TARGET_FILE.

    The output has one row per "scopus_id" and "t" with the columns
    "qit1_a", "qit2_a", "qit1_c" and "qit2_c".
    """
    # Read in; every non-index column holds "|"-separated values
    cols = ["scopus_id", "years", "sources", "coauthors"]
    pubs = pd.read_csv(PUBLICATION_LIST, index_col=0, usecols=cols)
    for c in pubs.columns:
        pubs[c] = pubs[c].str.split("|")

    # Weigh publications
    print(">>> Weighting publications...")
    dfs = [explode(pubs, "years", "t"),
           explode(pubs, "sources", "source").drop("scopus_id", axis=1),
           explode(pubs, "coauthors", "authors").drop("scopus_id", axis=1)]
    wpubs = pd.concat(dfs, axis=1)
    wpubs["t"] = wpubs["t"].astype("uint")
    jif = read_jif().drop_duplicates(subset="source").drop("year", axis=1)
    wpubs = wpubs.merge(jif, "left", on="source").drop("source", axis=1)

    # Read neighbors
    print(">>> Reading network files...")
    auth_1st, auth_2nd = get_neighbors(glob(NETWORK_FOLDER + "*auth.gexf"))
    com_1st, com_2nd = get_neighbors(glob(NETWORK_FOLDER + "*com.gexf"))

    # Compute first neighbors' productivity (account for joint publications)
    print(">>> Computing first neighbors' productivity...")
    wpubs["authors"] = wpubs["authors"].str.split(";")
    auth1 = compute_first_neigh_prod(auth_1st, wpubs)
    auth1 = auth1.rename(columns={'SJR': 'qit1_a', 'index': 'scopus_id'})
    auth1["scopus_id"] = auth1["scopus_id"].astype(int)
    com1 = compute_first_neigh_prod(com_1st, wpubs)
    com1 = com1.rename(columns={'SJR': 'qit1_c', 'index': 'scopus_id'})
    com1["scopus_id"] = com1["scopus_id"].astype(int)

    # Compute second neighbors' productivity
    print(">>> Computing second neighbors' productivity...")
    wpubs = wpubs.groupby(["scopus_id", "t"])["SJR"].sum().reset_index()
    wpubs = cumulate_productivity(wpubs)
    params = {"right": wpubs, "how": "left", "on": ["scopus_id", "t"]}
    # In the neighbor frames "scopus_id" is the neighbor and "index" the
    # focal node.  Aggregate by the focal node, as done for the first
    # neighbors, so that all four measures refer to the same researcher.
    # NOTE(review): relies on the neighbor frames keeping the "index" column
    # (as compute_first_neigh_prod() does) - confirm against explode().
    auth2 = (auth_2nd.merge(**params)
                     .groupby(["index", "t"])["SJR"].sum()
                     .reset_index()
                     .rename(columns={"SJR": 'qit2_a', "index": "scopus_id"}))
    auth2["scopus_id"] = auth2["scopus_id"].astype(int)
    com2 = (com_2nd.merge(**params)
                   .groupby(["index", "t"])["SJR"].sum()
                   .reset_index()
                   .rename(columns={"SJR": 'qit2_c', "index": "scopus_id"}))
    com2["scopus_id"] = com2["scopus_id"].astype(int)

    # Write out
    print(">>> Writing out...")
    out = (auth1.merge(auth2, "outer", on=['scopus_id', 't'])
                .merge(com1, "outer", on=['scopus_id', 't'])
                .merge(com2, "outer", on=['scopus_id', 't'])
                .sort_values(['scopus_id', 't']))
    out.to_csv(TARGET_FILE, index=False)
# Run the computation only when the file is executed as a script
if __name__ == "__main__":
    main()