This script builds the research profile of countries after internationally funded publications from a **specific** country are removed. It constructs a loose counterfactual compared with build_profile_without_top20.ipynb. Here the removal rule is: a paper is removed when the specific country is the **only** funding source for the paper and that funding flows to other countries, i.e., the paper's author countries are not just the funding country itself (e.g., a paper funded solely by the USA and authored by China).

In [1]:
import pandas as pd
import tqdm
import numpy as np

In [2]:
# Default input locations; the parameters cell further down reassigns all
# three names, so these values only apply if that cell is not run.
pubs_path = "../../data/NationalFunding/Data/DerivedData/CleanedRawData/pub.pkl"
cntry_fund_frac_path = "../../data/NationalFunding/Data/DerivedData/Derived/cntry_fund_frac.csv"

# Default output location for the profile table.
# NOTE(review): this default uses a different root ("../../../data/nf_folder")
# and file name than the other paths in this notebook -- confirm it is intended.
profile_path = "../../../data/nf_folder/Data/DerivedData/Derived/dep-FundExchange/pub_no_foreign_top20.csv"

In [3]:
# Parameters
# These assignments replace the default paths set in the previous cell
# (presumably injected by a notebook runner such as papermill -- TODO confirm).
# pubs_path: pickled publication table read with pd.read_pickle.
pubs_path = "../data/NationalFunding/Data/simulated_data/Data/DerivedData/CleanedRawData/pub.pkl"
# cntry_fund_frac_path: CSV with `funder` and `cnt` columns used to rank funders.
cntry_fund_frac_path = "../data/NationalFunding/Data/simulated_data/Data/DerivedData/Derived/cntry_fund_frac.csv"
# profile_path: destination CSV for the profiles built by this notebook.
profile_path = "../data/NationalFunding/Data/simulated_data/Data/DerivedData/Derived/dependence/pub_noforeign_exclusive_fund_top20.csv"


In [4]:
# Load the publication table. Each row is one publication; the columns used
# later are `id`, `year`, `dis`, `author_distinct` and `funder_distinct`.
# NOTE: unpickling executes arbitrary code, so only load pickle files that
# come from a trusted source.
pub_df=pd.read_pickle(pubs_path)
# Display the first rows as the cell output for a quick sanity check.
pub_df.head()

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol,cofund,intfund
0,0,2014,29,1,[Canada],"[United States, Brazil, United States, Republi...",[Canada],"[Republic of Serbia, China, Brazil, United Sta...",1,0,1,1
1,1,2017,32,1,"[Russia, Spain]","[Austria, China, China, China, United States, ...","[Russia, Spain]","[China, United States, Austria, Russia, Japan]",1,1,1,1
2,2,2012,15,1,[Turkey],Not-Funded,[Turkey],Not-Funded,0,0,0,0
3,3,2010,53,1,"[United States, Iran]",[China],"[Iran, United States]",[China],1,1,0,1
4,4,2015,22,1,[Denmark],Not-Funded,[Denmark],Not-Funded,0,0,0,0


In [5]:
#get top 20 funders
def return_topn(df, n):
    """Return the ``n`` funders with the largest summed ``cnt``, largest first.

    Rows whose ``funder`` is the 'Not-Funded' marker are ignored. ``df`` must
    provide the columns ``funder`` and ``cnt``.
    """
    funded_rows = df[df['funder'] != 'Not-Funded']
    totals = funded_rows.groupby(['funder'])['cnt'].sum().reset_index()
    ranked = totals.sort_values(by='cnt', ascending=False)
    return ranked.head(n)['funder'].tolist()
# Number of funding countries to analyse.
n = 20
# Funding table with one `cnt` value per `funder` row; ranked by return_topn.
cntry_fund = pd.read_csv(cntry_fund_frac_path)
# Names of the n funders with the largest total `cnt`.
topn = return_topn(df=cntry_fund, n=n)

In [6]:
def remove_publications(pub_df, cntry):
    """Drop publications funded solely by ``cntry`` but not authored solely by it.

    A row is removed when ``cntry`` is the one and only entry of
    ``funder_distinct`` and ``author_distinct`` either lists several countries
    or lists a single country other than ``cntry``. Rows with an empty author
    collection are kept.

    Parameters
    ----------
    pub_df : pandas.DataFrame
        Publication table with ``author_distinct`` and ``funder_distinct``
        columns. Each cell holds a collection of country names;
        ``funder_distinct`` may instead hold a plain string marker such as
        'Not-Funded'.
    cntry : str
        The funding country whose exclusively funded outbound papers are
        removed.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``pub_df`` (the input is not modified) in which the
        country collections have been converted to sets. String markers in
        ``funder_distinct`` are left as they are.
    """
    df = pub_df.copy()

    df['author_distinct'] = df['author_distinct'].apply(set)
    # A string marker such as 'Not-Funded' is not a collection of countries;
    # calling set() on it would produce a set of characters, so leave it as is.
    df['funder_distinct'] = df['funder_distinct'].apply(
        lambda funders: funders if isinstance(funders, str) else set(funders)
    )

    sole_funder = {cntry}
    # dtype=bool keeps the mask boolean even when the frame has no rows;
    # otherwise NumPy would default to float64 and `~` would raise a TypeError.
    drop = np.array(
        [
            funders == sole_funder and len(authors) >= 1 and authors != sole_funder
            for authors, funders in zip(df['author_distinct'], df['funder_distinct'])
        ],
        dtype=bool,
    )

    return df[~drop]

In [7]:
def build_country_profile(pub_df, cntry):
    """Count publications per author country and discipline after filtering.

    The publications selected by ``remove_publications(pub_df, cntry)`` are
    expanded to one row per author country, then counted per
    (``author_distinct``, ``dis``) pair.

    Returns a DataFrame with the columns ``author_distinct``, ``dis`` and
    ``count``.
    """
    kept = remove_publications(pub_df, cntry)
    # One row per (publication, author country) so that a paper counts once
    # for every country in its author set.
    per_author_country = kept[['id', 'dis', 'author_distinct']].explode('author_distinct')
    counts = per_author_country.groupby(['author_distinct', 'dis']).size()
    return counts.reset_index(name='count')

In [8]:
# Build one profile per (year, funding country) pair and stack them into a
# single long table with the extra columns `srce_cntry` and `year`.
years = pub_df['year'].unique()

# Collect the pieces in a list and concatenate once at the end; calling
# pd.concat inside the loop re-copies the accumulated table on every pass.
profiles = []
for year in tqdm.tqdm(years):
    pub_year = pub_df[pub_df.year == year]  # the publications in a specific year
    for cntry in topn:
        country_profile = build_country_profile(pub_year, cntry)
        country_profile['srce_cntry'] = cntry
        country_profile['year'] = year
        profiles.append(country_profile)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no years or no funding countries to process.
filter_df = pd.concat(profiles) if profiles else pd.DataFrame()

  0%|                                                    | 0/10 [00:00<?, ?it/s]

 10%|████▍                                       | 1/10 [00:07<01:06,  7.42s/it]

 20%|████████▊                                   | 2/10 [00:15<01:00,  7.57s/it]

 30%|█████████████▏                              | 3/10 [00:21<00:49,  7.03s/it]

 40%|█████████████████▌                          | 4/10 [00:27<00:39,  6.57s/it]

 50%|██████████████████████                      | 5/10 [00:34<00:34,  6.90s/it]

 60%|██████████████████████████▍                 | 6/10 [00:43<00:29,  7.36s/it]

 70%|██████████████████████████████▊             | 7/10 [00:50<00:22,  7.45s/it]

 80%|███████████████████████████████████▏        | 8/10 [00:56<00:13,  6.84s/it]

 90%|███████████████████████████████████████▌    | 9/10 [01:02<00:06,  6.69s/it]

100%|███████████████████████████████████████████| 10/10 [01:09<00:00,  6.89s/it]

100%|███████████████████████████████████████████| 10/10 [01:09<00:00,  7.00s/it]




In [9]:
# Write the stacked profiles to profile_path as CSV. index=False omits the
# row index, which repeats across the concatenated pieces and carries no
# information.
filter_df.to_csv(profile_path, index=False)