-
Notifications
You must be signed in to change notification settings - Fork 0
/
_615_compile_student_data.py
executable file
·333 lines (289 loc) · 12.9 KB
/
_615_compile_student_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#!/usr/bin/env python3
# Author: Michael E. Rose <michael.ernst.rose@gmail.com>
"""Compiles all available data on students and adds the following:
1. Placement type
2. Placement retainment
3. Citation count immediately after placement
4. Rank and year of first affiliaton change
"""
from collections import defaultdict
from pathlib import Path
import pandas as pd
from pybliometrics.scopus import CitationOverview, ScopusSearch
from tqdm import tqdm
from _005_parse_students import write_stats
from _117_get_faculty_lists import create_country_map
from _215_compute_adviser_centralities import standardize, winsorize
from _401_rank_institutions import END_YEAR
STUDENT_FILE = Path("./005_student_lists/main.csv")
PLACEMENT_FILE = Path("./020_placements/placements.csv")
MAPPING_FOLDER = Path("./090_institution_data/")
ADVISER_MAP = Path("./199_adviser-student_map/actual.csv")
GENDER_FILE = Path("./608_gender_estimates/genderize.csv")
TARGET_FILE = Path("./615_student_data/student.csv")
CITATION_RANGE = 5 # Count cites to publications this many years past placement
PLATFORMS = {"60020337", "60016621", "60007893"}
_aff_map = pd.read_csv(MAPPING_FOLDER/"non_org.csv", dtype=str).set_index("nonorg")["org"].to_dict()
tqdm.pandas()
def compare_placement(aff_d, placement):
"""Compare whether the placement is in the set of affiliations,
considering a certain lag.
"""
same = {}
old_aff = True
for lag, affs in aff_d.items():
if lag > 9:
continue
key = f"plc_same-{lag:02d}"
if affs:
value = placement in affs
old_aff = placement in affs
else:
value = old_aff
same[key] = int(value)
return same
def find_affiliation(p, auth_id):
"""Find affiliation in an author's publication."""
try:
idx = p.author_ids.index(str(int(auth_id)))
affs = p.author_afids.split(";")[idx] or None
affs = set(affs.split("-"))
affs = set(_aff_map.get(a, a) for a in affs) - PLATFORMS
return affs
except (AttributeError, ValueError): # No affiliation or author not in list
return set()
def get_affiliation_type(aff_id, refresh=100):
"""Get type of placement according to Scopus."""
from pybliometrics.scopus import AffiliationRetrieval
try:
aff_id = int(aff_id)
aff = AffiliationRetrieval(aff_id, refresh=refresh)
org_type = aff.org_type
country = country_map.get(aff.country)
if aff.country and not country:
print(aff.country)
except ValueError:
org_type = None
country = None
return pd.Series({"plc_type": org_type, "plc_country": country})
def parse_affiliations(df, scopus_id, base_year):
"""Parse affiliations listed on papers and sort them
relative to `base_year`.
"""
# Get affiliations from papers
max_year = df["year"].max()
affs = defaultdict(lambda: set())
temp = df.dropna(subset=["author_afids"]).copy()
temp["lag"] = temp["year"] - base_year
temp = temp[temp["lag"] > 0] # Consider only papers past PhD
if temp.empty:
return affs
temp["aff"] = temp.apply(lambda s: find_affiliation(s, scopus_id), axis=1)
# Fill active years w/o publication
for y in range(1, max_year-base_year):
affs[y].update(set())
# Record all affiliations
for _, p in temp.iterrows():
affs[p.lag].update(p.aff)
return affs
def query_publications(scopus_id, refresh=100):
"""Retrieve publications for a student."""
res = ScopusSearch(f"AU-ID({scopus_id})", refresh=refresh).results
cols = ["coverDate", "author_ids", "author_afids"]
temp = pd.DataFrame(res).set_index("eid")[cols]
temp["year"] = temp["coverDate"].str[:4].astype("int32")
return temp[temp["year"] <= END_YEAR].sort_values("year")
def read_rankings(verbose=True):
"""Read our version of the unweighted and SJR weighted Tilburg
Economics Ranking.
"""
# Read in
folder = Path("./401_institution_rankings/")
weighted = pd.read_csv(folder/"weighted.csv", encoding="utf",
index_col=["institution", "year"])
weighted = weighted.add_suffix("-w").reset_index()
if verbose:
sd = weighted.loc[weighted["year"] == 2004, "score-w"].std()
print(f">>> 1 SD in the weighted score in 2004 equals {sd:.2f}")
weighted = standardize_winsorize_df(weighted, col="score-w")
unweighted = pd.read_csv(folder/"unweighted.csv", encoding="utf")
unweighted = standardize_winsorize_df(unweighted)
# Combine
return weighted.join(unweighted, how="outer")
def retrieve_citations(df, year, refresh=False):
"""Compute total citation count for the first `CITATION_RANGE` years
past graduation.
"""
# Retrieve citation count until `year` + `CITATION_RANGE`
cit_year = year + CITATION_RANGE
eids = df[df["year"] <= cit_year].index.to_list()
min_year = df["year"].min()
doc_ids = [eid.split("-")[-1] for eid in eids]
if not doc_ids:
cites = None
else:
sub_lists = [doc_ids[i:i + 25] for i in range(0, len(doc_ids), 25)]
cites = 0
for sub_list in sub_lists:
co = CitationOverview(sub_list, start=min_year, refresh=refresh)
for p in co.cc:
cites += sum([t[1] for t in p if t[0] < cit_year])
return cites
def standardize_winsorize_df(df, col="score"):
"""Standardize and winsorize a column in a DataFrame year-wise."""
df[f"{col}-std"] = df.groupby("year")[col].apply(standardize)
grouped = df.groupby("year")
df[f"{col}-win"] = grouped[col].apply(winsorize)
df[f"{col}-std-win"] = grouped[f"{col}-std"].apply(winsorize)
return df.set_index(["institution", "year"])
country_map = create_country_map()
def main():
# Read in
cols = ['stu_id', 'stu_school', 'stu_jel', 'stu_year', 'stu_scopus']
students = pd.read_csv(STUDENT_FILE, index_col="stu_id", usecols=cols)
plc = pd.read_csv(PLACEMENT_FILE, encoding="utf8").drop(columns="source")
df = plc.join(students, how="inner", on="stu_id")
# Store students w/o placement
no_placement = set(students.index) - set(df["stu_id"].unique())
no_placement = pd.DataFrame(index=sorted(no_placement))
print(f">>> No placement information for {no_placement.shape[0]:,} students")
stats = {'N_of_PhDs_without_placement': no_placement.shape[0]}
# Drop students w/o placement year
mask = df['plc_year'].isna()
print(f">>> No placement year for {mask.sum():,} students")
df.loc[mask, 'stu_plc'] = None
# Ignore placements past two years after PhD
mask = df["plc_year"] > df["stu_year"] + 2
df.loc[mask, ["stu_plc", "stu_rank"]] = None
df = df[['stu_id', 'stu_plc', 'stu_rank', 'plc_year', 'stu_year']]
# Add Scopus IDs of placements
affmap = pd.read_csv(MAPPING_FOLDER/"mapping.csv", index_col="our_name")
df = (df.join(affmap, on="stu_plc")
.rename(columns={"Scopus": "plc_scopus"}))
# Add rankings of placements
ranks = read_rankings()
df = (df.join(ranks.add_prefix("plc_"), on=["plc_scopus", "stu_year"])
.set_index("stu_id")
.drop(columns="stu_year"))
df = students.join(df).sort_index()
mask_ranked = ~df['plc_rank-w'].isna()
print(f">>> {df['stu_plc'].nunique():,} ranked placements "
f"for {mask_ranked.sum():,} students")
stats['N_of_PhDs_with_ranked_placement'] = mask_ranked.sum()
# Store unranked placements
no_rank = df[~mask_ranked].dropna(subset=["plc_year"]).copy()
no_rank = no_rank["plc_scopus"].fillna(no_rank["stu_plc"]).dropna()
stats['N_of_PhDs_without_ranked_placement'] = no_rank.shape[0]
no_rank = (no_rank.astype("str").str.replace(".0", "", regex=False)
.value_counts().to_frame('Occurrences'))
no_rank.index.name = "Affiliation"
no_rank = (no_rank.reset_index()
.sort_values(["Occurrences", "Affiliation"],
ascending=[False, True]))
print(f">>> {no_rank.shape[0]} unranked placements for "
f"{sum(no_rank['Occurrences']):,} students")
# Merge affiliation types
print(f">>> Retrieving affiliation information...")
info = df['plc_scopus'].progress_apply(get_affiliation_type)
df = pd.concat([df, info], axis=1)
print("... Distribution of types and countries:")
print(df['plc_type'].value_counts())
print(df['plc_country'].value_counts())
# Merge with gender information (assume unknowns to be male)
gender = pd.read_csv(GENDER_FILE, index_col="id", usecols=["id", "gender"])
gender = gender.rename(columns={"gender": "stu_sex"}).fillna("male")
gender.index.name = "stu_id"
df = df.join(gender)
# Merge with school ranks
df = (df.join(affmap.dropna(), on='stu_school')
.join(ranks.add_prefix("school_"), on=('Scopus', 'stu_year'))
.rename(columns={'Scopus': 'school_scopus'}))
assert(df['school_rank-w'].isna().sum() == 0)
# Drop students w/o Scopus ID
no_scopus = df[df["stu_scopus"].isna()].reset_index()[["stu_id"]]
print(f">>> Found {no_scopus.shape[0]} ({no_scopus.shape[0]/df.shape[0]:.2%}) "
"students w/o Scopus profile")
# Retrieve career and citation information
print(">>> Counting citations and searching affiliations...")
affs = {}
cites = {}
change = {}
coauthors = {}
students = students.dropna(subset=["stu_scopus"])
advisers = pd.read_csv(ADVISER_MAP, index_col="stu_id",
usecols=["stu_id", "adv_scopus"])
advisers["adv_scopus"] = advisers["adv_scopus"].str.split(";")
students = students.join(advisers)
students = students.dropna(subset=["adv_scopus"])
adv_map = students["adv_scopus"].to_dict()
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
# Query publications
try:
student_id = int(row["stu_scopus"])
except ValueError:
continue
pubs = query_publications(student_id)
# Retrieve co-authors
stu_adv = set(adv_map.get(idx, set()))
pubs["author_ids"] = pubs["author_ids"].str.split(";")
mask_five = pubs["year"].between(row.stu_year, row.stu_year+5)
coauth = pubs[mask_five]["author_ids"]
coauthors[student_id] = set([a for sl in coauth for a in sl])
# Retrieve citations
mask_with_adv = pubs["author_ids"].apply(
lambda s: len(stu_adv.intersection(s)) > 0)
try:
cites[student_id] = retrieve_citations(pubs[~mask_with_adv], row["stu_year"])
except KeyError:
pass
# Retrieve affiliations
if row["plc_scopus"] != row["plc_scopus"]:
continue
aff_d = parse_affiliations(pubs, str(student_id), int(row["plc_year"]))
retention = compare_placement(aff_d, str(int(row["plc_scopus"])))
affs[student_id] = retention
# Get first affiliation after change
try:
idx = min([k for k, v in retention.items() if v == 0])
lag = int(idx.split("-")[-1])
change[student_id] = (aff_d[lag], row["plc_year"]+lag)
except ValueError:
change[student_id] = (None, None)
change = pd.Series(change).apply(pd.Series).dropna()
change.columns = ["aff_id", "change_year"]
change = change.explode('aff_id')
change["aff_id"] = change["aff_id"].astype("uint64")
# Compute affiliation rank of changes
change = change.join(ranks[["rank-w", "rank"]].add_prefix("change_"),
on=('aff_id', 'change_year'))
change.index.name = "stu_scopus"
change = change.sort_values(["change_rank-w", "aff_id"])
change = change[~change.index.duplicated(keep='first')]
change = (change.reset_index().rename(columns={"aff_id": "change_scopus"})
.sort_values("change_rank-w")
.drop_duplicates(subset=["stu_scopus"])
.set_index("stu_scopus"))
# Merge all
cite_label = f"stu_citestock_{CITATION_RANGE}p"
df = (df.join(pd.DataFrame(affs).T, on='stu_scopus')
.drop("school_scopus", axis=1)
.join(pd.Series(cites, name=cite_label), on="stu_scopus")
.join(change, on="stu_scopus"))
# Write out
df.to_csv(TARGET_FILE, index_label="stu_id")
write_stats(stats)
# Maintenance
no_pubs = df[df[cite_label].isna()]["stu_scopus"].dropna().astype("uint64")
print(f">>> Found {no_pubs.shape[0]} students w/ Scopus profile ID but "
f"w/o publications {CITATION_RANGE} years past graduation")
# Compare with advisors
coauthors = pd.Series(coauthors)
students = students.dropna(subset="adv_scopus").set_index("stu_scopus")
students["coauthors"] = coauthors
students["joint_pub"] = students.apply(
lambda s: len(s["coauthors"].intersection(s["adv_scopus"])), axis=1)
joint_pub = students['joint_pub'].sum()
print(f">>> {joint_pub:,} students publish with their adviser(s) within "
f"5 years past graduation {joint_pub/students.shape[0]:.1%}")
if __name__ == '__main__':
main()