This notebook builds the research profile of each country after internationally funded publications from a **specific** country are removed. Here, the removal criterion is: drop a paper when funding from the specific country flows to other countries, e.g., a USA-funded paper that is authored by China.

In [1]:
import pandas as pd
import tqdm

In [2]:
# Input: pickled publication table (loaded with pd.read_pickle).
pubs_path="../../../data/nf_folder/Data/DerivedData/CleanedRawData/pub.pkl"
# Input: CSV used to rank funder countries (its 'funder' and 'cnt' columns are read by return_topn).
cntry_fund_frac_path='../../../data/nf_folder/Data/DerivedData/Derived/cntry_fund_frac.csv'

# Output: CSV that receives the country profiles built by this notebook.
profile_path="../../../data/nf_folder/Data/DerivedData/Derived/dep-FundExchange/pub_no_foreign_top20.csv"

In [4]:
# Load the publication table; later cells use its 'id', 'year', 'dis',
# 'author_distinct' and 'funder_distinct' columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
pub_df=pd.read_pickle(pubs_path)
# Preview the first rows as the cell output.
pub_df.head()

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol,cofund,intfund
0,60694041,2017,9,1,"[Canada, United States]","[United States, United States]","[United States, Canada]",[United States],1,1,0,0
1,48573156,2012,89,1,[France],Not-Funded,[France],Not-Funded,0,0,0,0
2,55323671,2015,89,1,"[France, Chile]","[Chile, Chile, Chile, Chile, Chile]","[Chile, France]",[Chile],1,1,0,0
3,57633115,2016,31,1,"[Switzerland, United States]",[Switzerland],"[Switzerland, United States]",[Switzerland],1,1,0,0
4,64831413,2018,99,1,[China],"[China, China, China, China, China, China]",[China],[China],1,0,0,0


In [10]:
#get top 20 funders
def return_topn(df,n):
    """Return the ``n`` funders with the largest summed ``cnt``.

    Rows whose ``funder`` equals ``'Not-Funded'`` are ignored. The remaining
    rows are grouped by ``funder``, their ``cnt`` values are summed, and the
    funders are ranked by that total in descending order.

    Args:
        df: DataFrame with a ``funder`` column and a numeric ``cnt`` column.
        n: Maximum number of funders to return.

    Returns:
        list: Funder names, highest total first (at most ``n`` entries).
    """
    funded_rows = df.loc[df['funder'] != 'Not-Funded']
    totals = funded_rows.groupby(['funder'])['cnt'].sum().reset_index()
    ranked = totals.sort_values(by='cnt', ascending=False)
    return ranked.head(n).funder.tolist()
# Number of top funder countries to build profiles for.
n=20
# Funding table; return_topn reads its 'funder' and 'cnt' columns.
cntry_fund=pd.read_csv(cntry_fund_frac_path)
# Funders with the largest summed 'cnt', excluding 'Not-Funded'.
topn=return_topn(cntry_fund,n)

In [5]:
def get_publications_to_remove(fund_df,cntry):
    """Return IDs of publications funded by ``cntry`` whose funding flows abroad.

    A publication is flagged when it has a row funded by ``cntry`` and its
    author-country list either contains more than one country or its single
    country is not ``cntry``.

    Args:
        fund_df: DataFrame with one row per (publication, funder country),
            holding the columns ``id``, ``author_distinct`` (sequence of
            author countries) and ``funder_distinct`` (a single funder value).
        cntry: Funding country whose outward-flowing publications are sought.

    Returns:
        list: Unique publication IDs in order of first appearance; an empty
        list when ``cntry`` funds no row of ``fund_df``.
    """
    def flows_abroad(authors):
        # Internationally co-authored, or authored solely by another country.
        return len(authors)>1 or authors[0]!=cntry

    funded=fund_df[fund_df.funder_distinct==cntry]
    # Series.map (unlike DataFrame.apply(axis=1)) yields an empty Series for an
    # empty selection, so a country without funded rows gives [] instead of
    # failing on the column assignment.
    mask=funded['author_distinct'].map(flows_abroad).astype(bool)
    # Use the raw boolean array: the frame index may hold duplicates after explode.
    removeid=list(funded.loc[mask.to_numpy(),'id'].unique())

    return removeid

In [28]:
def build_country_profile(pub_df, fund_df, cntry):
    """Count publications per (author country, discipline) after removal.

    Publications returned by ``get_publications_to_remove(fund_df, cntry)``
    are dropped from ``pub_df``; the remaining ones are expanded to one row
    per author country and counted per ``author_distinct``/``dis`` pair.

    Args:
        pub_df: DataFrame with the columns ``id``, ``dis`` and
            ``author_distinct`` (sequence of author countries).
        fund_df: Per-funder table passed on to ``get_publications_to_remove``.
        cntry: Funding country whose flagged publications are removed.

    Returns:
        DataFrame with the columns ``author_distinct``, ``dis`` and ``count``.
    """
    excluded_ids = get_publications_to_remove(fund_df, cntry)
    keep_mask = ~pub_df['id'].isin(excluded_ids)
    kept = pub_df.loc[keep_mask, ['id', 'dis', 'author_distinct']]
    # One row per (publication, author country).
    per_author = kept.explode('author_distinct')
    profile = (
        per_author
        .groupby(['author_distinct', 'dis'])
        .size()
        .reset_index(name='count')
    )
    return profile

In [29]:
# Build one profile per (year, source country) and stack them into one table.
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated result on every iteration.
profiles=[]
years=pub_df['year'].unique()

for year in tqdm.tqdm(years):
    pub_year=pub_df[pub_df.year==year]
    # One row per (publication, funder country) for this year.
    fund_year=pub_year[['id','author_distinct','funder_distinct']].explode('funder_distinct')
    for cntry in topn:
        country_profile=build_country_profile(pub_year, fund_year, cntry)
        # Tag each profile with the country whose funded papers were removed.
        country_profile['srce_cntry']=cntry
        country_profile['year']=year
        profiles.append(country_profile)

# pd.concat raises on an empty list, so fall back to an empty frame.
filter_df=pd.concat(profiles) if profiles else pd.DataFrame()

2017
2012
2015
2016
2018
2010
2013
2014
2009
2011


In [9]:
# Write the stacked profiles to CSV, without the DataFrame index.
filter_df.to_csv(profile_path, index=False)