In [19]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [20]:
# Institution tables with geographic fields, read from the HDF5 files under data/.
top30ins, top20ins = (
    pd.read_hdf(path)
    for path in ('data/top30ins_geo.h5', 'data/top20ins_geo.h5')
)
# Distinct country codes found in the top-20 table; these drive the per-country export.
top20countries = top20ins['country_code'].unique()

In [21]:
def get_level_distance(ins_1, ins_2):
    """Return the geographic level distance between two institutions.

    Both institutions are looked up by label in the module-level ``top30ins``
    table. Levels are compared from the coarsest down, because many countries
    have cities with the same name:

        3 -- different continents
        2 -- same continent, different countries
        1 -- same country, different cities
        0 -- same city
    """
    origin = top30ins.loc[ins_1]
    target = top30ins.loc[ins_2]
    # The first (coarsest) level on which the two rows differ fixes the distance.
    for distance, level in ((3, 'continent_id'), (2, 'country_id'), (1, 'city_id')):
        if origin[level] != target[level]:
            return distance
    return 0


In [22]:
## get the power of p_i
def get_n(ins_geo, all_geo, flow_mat):
    """Accumulate flow per level distance.

    For every pair (src, dst) with src taken from ``ins_geo.index`` and dst
    from ``all_geo.index``, a non-zero entry ``flow_mat[src, dst]`` is added to
    the bucket chosen by ``get_level_distance(src, dst)``.

    Returns a four-element list ``[n_0, n_1, n_2, n_3]``, one total per
    level distance.
    """
    totals = [0] * 4
    for src in ins_geo.index:
        for dst in all_geo.index:
            amount = flow_mat[src, dst]
            if amount == 0:
                continue  # nothing moved, so skip the distance lookup
            totals[get_level_distance(src, dst)] += amount
    return totals

In [23]:
def estimate_by_time(flow_mat, ins_geo, all_geo):
    """Convert the per-level flow totals of one flow matrix into proportions.

    Returns a four-element list whose entry d is the share of the total flow
    that ``get_n`` assigned to level distance d. When the total flow is zero,
    all four entries are 0.
    """
    counts = get_n(ins_geo, all_geo, flow_mat)
    total = sum(counts)
    if total == 0:
        return [0, 0, 0, 0]
    return [count / total for count in counts]

In [24]:
def _load_flow_matrix(start_year, end_year):
    """Load the pickled flow matrix for one period and return it as a dense array."""
    path = 'data/flow_matrices/flow_matrix[%d-%d].pkl' % (start_year, end_year)
    # SECURITY: pickle.load can execute arbitrary code; only point this at
    # flow-matrix files that were produced by a trusted pipeline.
    with open(path, 'rb') as f_read:
        matrix = pickle.load(f_read)
    # The original code densified with `.A`, which is deprecated on SciPy sparse
    # containers and missing in recent releases; `toarray()` is the supported
    # spelling. Presumably the pickles hold SciPy sparse matrices -- anything
    # else (e.g. np.matrix) falls back to np.asarray.
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return np.asarray(matrix)


def estimate(all_geo, top20ins, country):
    '''
    Estimate, for every period, the level-distance distribution of the flow
    leaving one country's institutions.

    all_geo: Institutions with valid geographic information from the top 30 countries.
    top20ins: Institutions with valid geographic information from the top 20 countries and top 10 cities.
    country: Current country (matched against the 'country_code' column of all_geo).
    goal: Probability of level distance from any institution with valid geographic
          information in the current country to the top 20 countries and top 10 cities.

    Returns: np.ndarray with one row per period (eleven five-year windows
    1960-1964 ... 2010-2014, then 2015-2021) and four columns
    P(d = 0) ... P(d = 3), as produced by estimate_by_time.
    '''
    # All institutions in the current country.
    country_ins = all_geo[all_geo['country_code'] == country]

    # Assign integer IDs to all cities, countries, and continents in top20ins.
    # Work on an explicit copy: the original wrote new columns onto a column
    # slice of the caller's frame, which raises SettingWithCopyWarning.
    target_ins = top20ins[['city', 'country_code', 'continent_code']].copy()
    for id_col, src_col in (('country_id', 'country_code'),
                            ('continent_id', 'continent_code'),
                            ('city_id', 'city')):
        # Dict lookup replaces a linear list.index() scan per row. The concrete
        # ID values are arbitrary (set order); only equality between them matters.
        lookup = {value: idx for idx, value in enumerate(set(target_ins[src_col]))}
        target_ins[id_col] = target_ins[src_col].apply(lookup.__getitem__)
    target_ins = target_ins[['city_id', 'country_id', 'continent_id']]
    # NOTE(review): get_n only iterates the index of this table, and
    # get_level_distance reads its ID columns from the module-level top30ins,
    # so the IDs built here look unused downstream -- confirm the intent.

    # Eleven five-year windows, then one longer closing window.
    periods = [(year, year + 4) for year in range(1960, 2011, 5)]
    periods.append((2015, 2021))

    results = [
        estimate_by_time(_load_flow_matrix(start, end), country_ins, target_ins)
        for start, end in periods
    ]
    return np.array(results)


In [25]:
# Row labels for the per-country tables: five-year windows from 1960-1964 to
# 2010-2014, followed by the closing 2015-2021 window (same order as the rows
# returned by estimate()).
year_index = ["%d-%d" % (year, year + 4) for year in range(1960, 2011, 5)]
year_index.append("2015-2021")

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Using the writer as a context manager guarantees the workbook is closed
    # even if estimate() fails for one of the countries.
    with pd.ExcelWriter('data/level_distance.xlsx', engine='xlsxwriter') as writer:
        # One sheet per country, named after its country code.
        for country in top20countries:
            results = estimate(top30ins, top20ins, country)
            results_df = pd.DataFrame(results,
                                      columns=['P(d = 0)', 'P(d = 1)', 'P(d = 2)', 'P(d = 3)'],
                                      index=year_index)
            results_df.to_excel(writer, sheet_name=country)

In [26]:
def draw(results, country):
    %matplotlib inline

    x = range(1960, 2016, 5)

    p_0 = results[:, 0]
    p_1 = results[:, 1]
    p_2 = results[:, 2]
    p_3 = results[:, 3]

    plt.figure(figsize=(16,9))
    plt.plot(x, p_0, 'o-', color='r', label="P{d = 0}")
    plt.plot(x, p_1, 'p-', color='b', label="P{d = 1}")
    plt.plot(x, p_2, '*-', color='g', label="P{d = 2}")
    plt.plot(x, p_3, 's-', color='y', label='P{d = 3}')

    plt.title(country)
    plt.xlabel("time")
    plt.ylabel("prob")
    plt.legend(loc="best")

    plt.show()

In [27]:
# with warnings.catch_warnings():
#     warnings.simplefilter('ignore')
#     for country in ['US', 'CN', 'IN', 'JP', 'GB', 'FR', 'DE', 'BR', 'RU', 'ES', 'KR', 'CA',
#        'IT', 'AU', 'ID', 'IR', 'TR', 'TW', 'PL', 'MX']:
#         results = np.array(estimate(top30ins, top20ins, country))
#         draw(results, country)