This script prepares the regression table

input: publication table

output: regression table with all variables

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
import tqdm

In [2]:
# Default file locations: the pickled publication table that is read, and the
# CSV file the regression table is written to.
# Both names are reassigned by the "# Parameters" cell that follows, so these
# values only take effect if that cell is removed or skipped.
pubs_path = "/u/miaoli/ember_home/dropbox/NationalFunding/Data/DerivedData/CleanedRawData/pub.pkl"
reg_table_path = "/u/miaoli/ember_home/dropbox/NationalFunding/Data/DerivedData/Derived/dependence/reg_table.csv"

In [3]:
# Parameters
# These assignments override the default paths set in the previous cell
# (presumably injected by a notebook runner -- confirm against the workflow).
# NOTE(review): reg_table_path points at "funding_unit_flow.csv" here, while the
# default above ends in "reg_table.csv"; the regression table will be written
# under this name -- confirm that this is the intended output file.
pubs_path = "../data/NationalFunding/Data/simulated_data/Data/DerivedData/CleanedRawData/pub.pkl"
reg_table_path = "../data/NationalFunding/Data/simulated_data/Data/DerivedData/Derived/dependence/funding_unit_flow.csv"


In [4]:
def get_pubcnt_by_cntry_year(df):
    """Count publications per country and year, zero-filling missing years.

    Parameters
    ----------
    df : pandas.DataFrame
        Publication table with columns ``id``, ``year`` and
        ``author_distinct``. ``author_distinct`` is exploded, so each element
        of a list-like entry is credited with the publication (presumably the
        distinct author countries of the paper -- confirm against the data).

    Returns
    -------
    pandas.DataFrame
        Long table with columns ``cntry``, ``year`` and ``pubcnt`` holding one
        row for every (country, year) combination; combinations that never
        occur in ``df`` get a count of 0.
    """
    # One row per (publication, country).
    per_country = df[['id', 'year', 'author_distinct']].explode('author_distinct')

    counts = (
        per_country
        .groupby(['author_distinct', 'year'])
        .size()
        .reset_index(name='pubcnt')
        .rename(columns={'author_distinct': 'cntry'})
    )

    # Going wide and back to long materialises the missing years of each
    # country as explicit zero rows.
    wide = counts.pivot_table(
        index='cntry', columns='year', values='pubcnt', fill_value=0
    ).reset_index()
    return wide.melt(id_vars='cntry', var_name='year', value_name='pubcnt')

In [5]:
def get_fundcnt_from_cntry(df):
    """Count funded publications per (author country, year, funder).

    Parameters
    ----------
    df : pandas.DataFrame
        Publication table with columns ``id``, ``year``, ``author_distinct``
        and ``funder``. Rows whose ``funder`` equals the string
        ``'Not-Funded'`` are discarded; the remaining ``funder`` and
        ``author_distinct`` entries are exploded element by element.

    Returns
    -------
    pandas.DataFrame
        Columns ``cntry``, ``year``, ``funder`` and ``fundcnt``, where
        ``fundcnt`` is the number of (publication, country, funder) rows in
        each group.
    """
    funded = df.loc[
        df['funder'] != 'Not-Funded',
        ['id', 'year', 'author_distinct', 'funder'],
    ]

    # Assign each publication to every author country, then to every funder.
    per_cntry_funder = funded.explode('author_distinct').explode('funder')

    group_sizes = per_cntry_funder.groupby(['author_distinct', 'year', 'funder']).size()
    result = group_sizes.reset_index(name='fundcnt')
    return result.rename(columns={'author_distinct': 'cntry'})

#### Count funding by type: domestic funding, funding from each major country, and all other foreign funding

In [6]:
def count_fund_by_type(fund_from_cntry, topcntrys):
    """Aggregate funding counts per country-year into funding-source types.

    Every funder is mapped to a region label:

    * ``"domestic"`` when the funder equals the author country,
    * the funder's own name when it is listed in ``topcntrys``,
    * ``"others"`` otherwise.

    Parameters
    ----------
    fund_from_cntry : pandas.DataFrame
        Table with columns ``cntry``, ``year``, ``funder`` and ``fundcnt``.
        The frame is not modified.
    topcntrys : list-like of str
        Funder names that keep a column of their own.

    Returns
    -------
    pandas.DataFrame
        One row per (``cntry``, ``year``) with one column per region label
        that occurs in the data, plus ``foreign`` (sum of all non-domestic
        columns). All count columns are transformed as ``log(count + 1)``.
    """
    funder = fund_from_cntry['funder']

    # Vectorised labelling; "domestic" is applied last so it takes precedence
    # over membership in `topcntrys`, as in the row-wise rule it replaces.
    region = funder.where(funder.isin(topcntrys), 'others')
    region = region.mask(fund_from_cntry['cntry'] == funder, 'domestic')

    # `assign` returns a new frame, so the caller's table does not gain a
    # `region` column as a side effect.
    labelled = fund_from_cntry.assign(region=region)

    fund_from_region = labelled.pivot_table(
        index=['cntry', 'year'],
        columns='region',
        values='fundcnt',
        aggfunc='sum',
        fill_value=0,
    )

    # Foreign funding is everything that is not domestic; `errors='ignore'`
    # covers data in which no domestic funding occurs at all.
    fund_from_region['foreign'] = (
        fund_from_region.drop(columns='domestic', errors='ignore').sum(axis=1)
    )

    # log(count + 1) keeps zero counts finite.
    return np.log(fund_from_region + 1).reset_index()

In [7]:
def create_next_year_pub(df):
    """Pair each country-year publication count with the preceding one.

    Parameters
    ----------
    df : pandas.DataFrame
        Publication counts per country and year with columns ``cntry``,
        ``year`` and ``pubcnt``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``cntry``, ``year``, ``current_year_pubcnt`` and
        ``prvs_year_pubcnt``, sorted by country and year with a fresh integer
        index. Rows containing any missing value are dropped, which removes
        the first row of every country (it has no predecessor).

    Notes
    -----
    "Previous" means the previous *row* of the same country after sorting by
    year, so the input is expected to contain every year for every country.
    """
    ordered = df.sort_values(by=['cntry', 'year'])
    paired = ordered[['cntry', 'year', 'pubcnt']].rename(
        columns={'pubcnt': 'current_year_pubcnt'}
    )

    # A grouped shift replaces the per-country loop with repeated pd.concat:
    # it is linear in the number of rows and keeps the numeric dtypes instead
    # of starting from an empty, all-object frame.
    paired['prvs_year_pubcnt'] = paired.groupby('cntry')['current_year_pubcnt'].shift(1)

    return paired.dropna().reset_index(drop=True)

In [8]:
# Load the publication table.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle(pubs_path)

In [9]:
# Publication count for every (country, year) pair; pairs without publications are zero-filled.
pubcnt_cntry_year = get_pubcnt_by_cntry_year(df)

In [10]:
# Number of funded publications per (author country, year, funder); 'Not-Funded' rows are excluded.
fund_from_cntrys = get_fundcnt_from_cntry(df)

In [11]:
# Funders that keep a column of their own in the funding breakdown; every other
# non-domestic funder is pooled into "others".
topcntrys=['United States','China','EU','United Kingdom','France','Germany']
# Log-transformed funding counts per country-year, split by funding source.
fund_by_types = count_fund_by_type(fund_from_cntrys, topcntrys)

In [12]:
# Each country-year publication count paired with the same country's count in the preceding row (year).
pubcnt_over_years = create_next_year_pub(pubcnt_cntry_year)

In [13]:
# Inner join (the default for DataFrame.merge) on country and year: country-years
# that are missing from either table -- e.g. those without any funded
# publication -- are dropped from the regression table.
reg_table = pubcnt_over_years.merge(fund_by_types, on=['cntry','year'])

In [14]:
# Keep only country-years with publications in both the current and the
# previous year, so the growth ratio and the logarithms below are finite.
# `.copy()` makes the filtered frame independent of the merged one, so the
# column assignments that follow do not write into a slice
# (avoids SettingWithCopyWarning).
reg_table = reg_table.loc[
    (reg_table['current_year_pubcnt'] != 0) & (reg_table['prvs_year_pubcnt'] != 0)
].copy()

# Cast both count columns to float64 up front and let a failed cast raise here,
# rather than silently keeping a non-numeric column that np.log would reject later.
reg_table['current_year_pubcnt'] = reg_table['current_year_pubcnt'].astype('float64')
reg_table['prvs_year_pubcnt'] = reg_table['prvs_year_pubcnt'].astype('float64')

# Relative growth in publications, computed on the raw counts before they are
# replaced by their logarithms.
reg_table['pub_growth'] = reg_table['current_year_pubcnt'] / reg_table['prvs_year_pubcnt'] - 1
reg_table['current_year_pubcnt'] = np.log(reg_table['current_year_pubcnt'])
reg_table['prvs_year_pubcnt'] = np.log(reg_table['prvs_year_pubcnt'])

In [15]:
# Write the regression table to reg_table_path as CSV, without the row index.
reg_table.to_csv(reg_table_path, index=False)