In [1]:
import pandas as pd
import numpy as np
import pickle

In [18]:
# Read the first 100,000 rows of data/workid_authorid_institutionid_pubdate24.csv.
# The first CSV column is used as the DataFrame index.
paper_data = pd.read_csv('data/workid_authorid_institutionid_pubdate24.csv', nrows=100000, index_col=0)

# Load the institution geo table from data/institution_geo24.pkl.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('data/institution_geo24.pkl', 'rb') as geo_file:
    institution_geo = pickle.load(geo_file)
institution_geo

Unnamed: 0,institution_id,city,region,country_code,country,latitude,longitude
0,I220871436,,,FR,,48.848610,2.342780
1,I4210140407,Portland,Oregon,US,United States,45.525047,-122.672940
2,I4210121796,London,,GB,United Kingdom,51.544495,-0.056045
3,I2800206933,Vienna,,AT,Austria,48.204475,16.357422
4,I2802204158,Salem,Oregon,US,United States,44.886715,-123.031906
...,...,...,...,...,...,...,...
108136,I4394709273,Washington,,,United States,38.895110,-77.036370
108137,I4394709274,Delft,,,The Netherlands,52.006670,4.355560
108138,I4394709279,Washington,,,United States,38.895110,-77.036370
108139,I82930923,London,,GB,United Kingdom,51.531994,-0.119311


In [19]:
# Discard institutions with incomplete geo information: a row is kept only
# when both `city` and `country_code` are present.
required_geo_columns = ['city', 'country_code']
institution_geo = institution_geo.dropna(subset=required_geo_columns)
institution_geo

Unnamed: 0,institution_id,city,region,country_code,country,latitude,longitude
1,I4210140407,Portland,Oregon,US,United States,45.525047,-122.672940
2,I4210121796,London,,GB,United Kingdom,51.544495,-0.056045
3,I2800206933,Vienna,,AT,Austria,48.204475,16.357422
4,I2802204158,Salem,Oregon,US,United States,44.886715,-123.031906
5,I2998552023,Los Angeles,California,US,United States,34.046494,-118.250730
...,...,...,...,...,...,...,...
107410,I4210167378,Seoul,,KR,South Korea,37.527344,126.885635
107430,I4387154366,Paris,,FR,France,48.853410,2.348800
107433,I4387155155,Barcelona,,ES,Spain,41.388790,2.158990
108139,I82930923,London,,GB,United Kingdom,51.531994,-0.119311


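Handle records caused by short-term visits: an author–institution pair is kept only if the author's first and last publications at that institution are more than two years apart.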

In [20]:
# Keep only the paper records whose institution still has valid geo info.
# .copy() makes the filtered frame explicitly independent, so the column
# assignments below never target a view of the unfiltered frame.
has_geo = paper_data['institution_id'].isin(institution_geo['institution_id'])
paper_data = paper_data[has_geo].copy()
# NOTE: the value printed here is the number of remaining paper records,
# not the number of distinct institutions.
print('number of ins with valid geo info:', len(paper_data))

paper_data['publication_date'] = pd.to_datetime(paper_data['publication_date'])

# Sort by author_id and publication_date
paper_data = paper_data.sort_values(by=['author_id', 'publication_date'])

# First and last publication date of each author at each institution.
# The grouped selection is built once and reused for both aggregates.
pub_dates = paper_data.groupby(['author_id', 'institution_id'])['publication_date']
paper_data['first_publication'] = pub_dates.transform('min')
paper_data['last_publication'] = pub_dates.transform('max')

# Keep only the records where the span between the author's first and last
# publication at the institution is strictly greater than two years (730 days);
# shorter spans are treated as short-term visits and dropped.
min_stay = pd.Timedelta(days=365 * 2)
stay_span = paper_data['last_publication'] - paper_data['first_publication']
paper_data = paper_data[stay_span > min_stay]
print('filtered paper data length: ', len(paper_data))
paper_data

number of ins with valid geo info: 99983
filtered paper data length:  16314


Unnamed: 0,work_id,author_id,institution_id,publication_date,first_publication,last_publication
37666,W1884413841,A5000043115,I43526919,2009-06-30,2009-06-30,2017-12-22
43972,W1501299913,A5000043115,I43526919,2014-03-01,2009-06-30,2017-12-22
161811,W2777298192,A5000043115,I43526919,2017-12-22,2009-06-30,2017-12-22
365818,W3031475227,A5000045667,I2801556517,2010-08-18,2010-08-18,2019-09-25
365905,W3031628442,A5000045667,I2801556517,2013-04-25,2010-08-18,2019-09-25
...,...,...,...,...,...,...
284081,W4205148890,A5094564465,I44260953,2021-11-01,2020-10-01,2022-11-01
410532,W4308205655,A5094564465,I44260953,2022-11-01,2020-10-01,2022-11-01
390831,W4206751057,A5094564485,I44260953,2020-10-01,2020-10-01,2022-11-01
284154,W4205591922,A5094564485,I44260953,2021-08-01,2020-10-01,2022-11-01


## Extract Mobility

Handle paper records whose authors are affiliated with multiple institutions.

In [16]:
# Earlier approach, kept for reference:
# move_data = paper_data.drop_duplicates(subset=['author_id', 'institution_id'], keep='first')

# Start from the filtered paper records without the first/last publication helper columns.
author_records = paper_data.drop(columns=['first_publication', 'last_publication'])

# For every record, look up the neighbouring records of the same author
# (in the current row order): the next work, the next institution and the
# previous work. The first/last record of an author gets NaN.
per_author = author_records.groupby('author_id')
move_data = author_records.assign(
    next_work=per_author['work_id'].shift(-1),
    next_ins=per_author['institution_id'].shift(-1),
    prev_work=per_author['work_id'].shift(1),
)

# Keep only the records whose next record (same author) is at a different
# institution. The last record of each author has no next institution (NaN)
# and therefore passes this filter as well.
changes_institution = move_data['institution_id'].ne(move_data['next_ins'])
move_data = move_data.loc[changes_institution]
print('Number of records:', len(move_data))

# TODO: Handle multiple institutions -- drop records with work=next_work or
# work=prev_work. Not implemented yet; `next_work` and `prev_work` are
# computed above but currently unused.

# TODO: Calculate the stay time in each institution. Not implemented yet.
# Earlier draft, kept for reference (`prev_pub_date` is not computed in this cell):
# move_data['stay_time'] = move_data['publication_date'] - move_data['prev_pub_date']
# move_data.rename(columns={"publication_date": "move_date"}, inplace=True)
move_data

Unnamed: 0,work_id,author_id,institution_id,publication_date,first_publication,last_publication,next_ins,next_pub_date,stay_time
161811,W2777298192,A5000043115,I43526919,2017-12-22,2009-06-30,2017-12-22,,NaT,NaT
364586,W3029251598,A5000045667,I2801556517,2019-09-25,2010-08-18,2019-09-25,,NaT,NaT
12322,W2289784002,A5000058679,I4210138949,2010-01-01,1997-01-01,2010-01-01,,NaT,NaT
110922,W2512002024,A5000064050,I255141171,2016-04-20,2008-06-01,2016-04-20,,NaT,NaT
480482,W2795181566,A5000079207,I179647637,2017-07-05,2011-01-01,2017-07-05,,NaT,NaT
...,...,...,...,...,...,...,...,...,...
410542,W4308205655,A5094378599,I44260953,2022-11-01,2020-10-01,2022-11-01,,NaT,NaT
410520,W4308205655,A5094380067,I44260953,2022-11-01,2020-10-01,2022-11-01,,NaT,NaT
410537,W4308205655,A5094380080,I44260953,2022-11-01,2020-10-01,2022-11-01,,NaT,NaT
410532,W4308205655,A5094564465,I44260953,2022-11-01,2020-10-01,2022-11-01,,NaT,NaT


In [17]:
# Spot-check: show the extracted records of a single example author.
example_author = 'A5001349134'
move_data.loc[move_data['author_id'].eq(example_author)]

Unnamed: 0,work_id,author_id,institution_id,publication_date,first_publication,last_publication,next_ins,next_pub_date,stay_time
176938,W2910919700,A5001349134,I4210107675,2013-08-05,2013-08-05,2021-06-30,I166843116,2013-08-31,26 days
71896,W1832114070,A5001349134,I166843116,2014-09-14,2013-08-31,2016-03-19,I4210107675,2015-09-21,372 days
194945,W2978392402,A5001349134,I4210107675,2015-09-21,2013-08-05,2021-06-30,I166843116,2016-01-21,122 days
103810,W2400040857,A5001349134,I166843116,2016-03-19,2013-08-31,2016-03-19,I4210107675,2018-10-02,927 days
269482,W3201019528,A5001349134,I4210107675,2021-06-30,2013-08-05,2021-06-30,,NaT,NaT


In [13]:
# Reload the same first 100,000 rows, untouched by any of the filtering above,
# to compare one author's raw records against the extracted ones.
raw_data = pd.read_csv(
    'data/workid_authorid_institutionid_pubdate24.csv',
    nrows=100000,
    index_col=0,
)
# publication_date is not converted with pd.to_datetime in this cell; the
# rows are ordered by the column exactly as it was read from the CSV.
raw_data.loc[raw_data['author_id'] == 'A5001349134'].sort_values('publication_date')

Unnamed: 0,work_id,author_id,institution_id,publication_date
176938,W2910919700,A5001349134,I4210107675,2013-08-05
44514,W1559707462,A5001349134,I166843116,2013-08-31
40579,W1591225243,A5001349134,I166843116,2014-03-15
71896,W1832114070,A5001349134,I166843116,2014-09-14
194945,W2978392402,A5001349134,I4210107675,2015-09-21
98437,W2329871730,A5001349134,I92961808,2016-01-01
96905,W2300078121,A5001349134,I166843116,2016-01-21
97730,W2310250014,A5001349134,I166843116,2016-01-22
103810,W2400040857,A5001349134,I166843116,2016-03-19
184448,W2943448356,A5001349134,I3132802412,2017-01-01


In [14]:
# Spot-check a second author in the unfiltered data, ordered by publication date.
is_second_author = raw_data['author_id'] == 'A5001157990'
raw_data.loc[is_second_author].sort_values('publication_date')

Unnamed: 0,work_id,author_id,institution_id,publication_date
131343,W2737511318,A5001157990,I51235708,2011-01-01
64210,W1544793928,A5001157990,I4210088341,2014-07-28
164743,W2788550934,A5001157990,I4210088341,2017-07-31
175327,W2904745244,A5001157990,I51235708,2018-01-01
181226,W2922503656,A5001157990,I51235708,2018-01-01
