In [2]:
import pandas as pd
import numpy as np
pd.__version__

'2.2.2'

In [3]:
mdf = pd.read_csv('data/movement_df_with_ld.csv', index_col=0)
# convert the NaNs in the "continent_code" to string "NA"
mdf['continent_code'] = mdf['continent_code'].fillna('NA')

mdf['move_time'] = pd.to_datetime(mdf['move_time'])

bins = list(range(1960, 2026, 5))
labels = [f'{i}-{i+4}' for i in bins[:-1]]

print(bins)
print(labels)

[1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025]
['1960-1964', '1965-1969', '1970-1974', '1975-1979', '1980-1984', '1985-1989', '1990-1994', '1995-1999', '2000-2004', '2005-2009', '2010-2014', '2015-2019', '2020-2024']


In [4]:
labels

['1960-1964',
 '1965-1969',
 '1970-1974',
 '1975-1979',
 '1980-1984',
 '1985-1989',
 '1990-1994',
 '1995-1999',
 '2000-2004',
 '2005-2009',
 '2010-2014',
 '2015-2019',
 '2020-2024']

In [5]:
mdf[mdf['institution_id'] == mdf['prev_ins']]

Unnamed: 0,author_id,institution_id,move_time,prev_ins,stay_time,city,country_code,continent_code,move_year,prev_country,level_distance


In [6]:
mdf['time_slot']  = pd.cut(mdf['move_time'].dt.year, bins=bins, labels=labels, right=False)

movement_counts = mdf.groupby(['prev_ins', 'institution_id', 'time_slot', 'level_distance'], observed = True).size().reset_index(name='counts')

movement_counts

Unnamed: 0,prev_ins,institution_id,time_slot,level_distance,counts
0,I100005738,I100191712,2005-2009,1,1
1,I100005738,I10052268,2015-2019,1,1
2,I100005738,I10052268,2020-2024,1,1
3,I100005738,I100538780,1995-1999,1,1
4,I100005738,I100633361,2015-2019,1,1
...,...,...,...,...,...
3214902,I99981631,I4210099175,1990-1994,0,1
3214903,I99981631,I4210099175,2000-2004,0,3
3214904,I99981631,I4210146600,2015-2019,0,1
3214905,I99981631,I4210146600,2020-2024,0,1


In [7]:
import tqdm

# 预计算总的移动数量
total_movements = mdf.groupby('time_slot').size().to_dict()

# 预计算每个时间段和前机构的流出数量
outflows = mdf.groupby(['time_slot', 'prev_ins']).size().to_dict()

# 预计算每个时间段和当前机构的流入数量
inflows = mdf.groupby(['time_slot', 'institution_id']).size().to_dict()

def calculate_simulated_movement(row):
    time_slot = row['time_slot']
    prev_ins = row['prev_ins']
    institution_id = row['institution_id']

    total_movements_count = total_movements.get(time_slot, 0)
    outflow_count = outflows.get((time_slot, prev_ins), 0)
    inflow_count = inflows.get((time_slot, institution_id), 0)

    return (outflow_count * inflow_count) / total_movements_count if total_movements_count != 0 else 0

tqdm.tqdm.pandas()

movement_counts['Simulated_movement'] = movement_counts.progress_apply(calculate_simulated_movement, axis=1)

  total_movements = mdf.groupby('time_slot').size().to_dict()
  outflows = mdf.groupby(['time_slot', 'prev_ins']).size().to_dict()
  inflows = mdf.groupby(['time_slot', 'institution_id']).size().to_dict()
100%|██████████| 3214907/3214907 [00:19<00:00, 162020.73it/s]


In [9]:
movement_counts['time_slot'].value_counts()

time_slot
2020-2024    739979
2015-2019    627788
2010-2014    511481
2005-2009    390666
2000-2004    285352
1995-1999    211201
1990-1994    155584
1985-1989    106753
1980-1984     75700
1975-1979     52809
1970-1974     35658
1965-1969     18772
1960-1964      3164
Name: count, dtype: int64

In [10]:
movement_counts.to_csv('data/movement_counts.csv', index=False)