This script builds the research profile of countries after the internationally funded publications of a **specific** country are removed. The removal rule is: drop every paper in which funding from the specific country flows to other countries, e.g., a USA-funded paper that is authored by China.

**Here the target country includes all countries.**

In [15]:
import pandas as pd
from tqdm import tqdm

In [25]:
# Pickled publication table; loaded with pd.read_pickle into pub_df.
pubs_path="../../../data/nf_folder/Data/DerivedData/CleanedRawData/pub.pkl"
# CSV whose `funder` column supplies the funder countries that are iterated over.
cntry_fund_frac_path='../../../data/nf_folder/Data/DerivedData/Derived/cntry_fund_frac.csv'
# CSV with `cntry` and `count` columns; summed per country to get the raw publication totals.
raw_path="../../../data/nf_folder/Data/DerivedData/Derived/cntry_author_full.csv"

In [3]:
# NOTE(review): profile_path is not referenced anywhere else in the visible
# notebook (the final table is written to "pubred_all2all.csv") -- confirm
# whether this path is still needed.
profile_path="../../../data/nf_folder/Data/DerivedData/Derived/dep-FundExchange/pub_no_foreign_top20.csv"

In [3]:
# Load the publication table. NOTE: unpickling executes arbitrary code, so
# pubs_path must only ever point at a trusted, locally produced file.
pub_df=pd.read_pickle(pubs_path)
# Display the first rows for a quick visual check of the columns.
pub_df.head()

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol,cofund,intfund
0,60694041,2017,9,1,"[Canada, United States]","[United States, United States]","[United States, Canada]",[United States],1,1,0,0
1,48573156,2012,89,1,[France],Not-Funded,[France],Not-Funded,0,0,0,0
2,55323671,2015,89,1,"[France, Chile]","[Chile, Chile, Chile, Chile, Chile]","[Chile, France]",[Chile],1,1,0,0
3,57633115,2016,31,1,"[Switzerland, United States]",[Switzerland],"[Switzerland, United States]",[Switzerland],1,1,0,0
4,64831413,2018,99,1,[China],"[China, China, China, China, China, China]",[China],[China],1,0,0,0


In [9]:
# Funder countries to iterate over: the distinct values of the `funder`
# column, in order of first appearance in the CSV.
cntry_fund = pd.read_csv(cntry_fund_frac_path)
funders = cntry_fund['funder'].unique()

In [13]:
def get_removeid(fund_df,cntry):
    """Return the ids of publications funded by ``cntry`` whose funding reaches other countries.

    A publication funded by ``cntry`` is flagged when it has more than one
    distinct author country (internationally coauthored), or when its single
    author country is not ``cntry`` itself.

    Parameters
    ----------
    fund_df : pandas.DataFrame
        Needs the columns ``id``, ``author_distinct`` (a sequence of author
        countries per row) and ``funder_distinct`` (compared for equality
        with ``cntry``, i.e. one funder value per row).
    cntry : str
        The funder country whose outgoing funding is being identified.

    Returns
    -------
    list
        Unique flagged publication ids in order of first appearance; an
        empty list when ``cntry`` funds no publication in ``fund_df``.
    """
    def assign_label(authors, cntry):
        # No author country recorded: nothing shows the funding left cntry.
        if len(authors) == 0:
            return 0
        if len(authors) > 1 or authors[0] != cntry:
            return 1
        return 0

    funded = fund_df[fund_df.funder_distinct == cntry]
    # A row-wise DataFrame.apply on an empty frame returns a DataFrame, not a
    # Series, which cannot be assigned to a single column -- bail out early.
    if funded.empty:
        return []

    # Only author_distinct is needed for the label, so map over that column
    # instead of applying over whole rows.
    labels = funded['author_distinct'].map(lambda authors: assign_label(authors, cntry))
    # Positional boolean mask: avoids any index alignment on the exploded index.
    flagged = (labels == 1).to_numpy()
    return list(funded.loc[flagged, 'id'].unique())

In [14]:
def build_profile(pub_df, fund_df, cntry):
    """Count publications per (author country, discipline) once the
    publications flagged by ``get_removeid`` for ``cntry`` are dropped.

    ``pub_df`` must provide the columns ``id``, ``dis`` and
    ``author_distinct``. The result has the columns ``author_distinct``,
    ``dis`` and ``count``.
    """
    excluded_ids = get_removeid(fund_df, cntry)

    # Keep only the publications that were not flagged for removal.
    keep_mask = ~pub_df['id'].isin(excluded_ids)
    kept = pub_df.loc[keep_mask, ['id', 'dis', 'author_distinct']]

    # One row per (publication, author country) so each country is credited.
    per_author = kept.explode('author_distinct')
    counts = per_author.groupby(['author_distinct', 'dis']).size()

    return counts.reset_index(name='count')

In [None]:
# Empty accumulator that the per-funder profiles are concatenated onto.
filter_df = pd.DataFrame()
# One row per (publication, funder) pair: explode the funder list while
# keeping the publication id and its author-country list.
fund_df = pub_df.loc[:, ['id', 'author_distinct', 'funder_distinct']].explode('funder_distinct')

In [18]:
# Build one profile per funder country and tag it with that country.
# Profiles are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the whole accumulated frame on every iteration.
profiles = []
for cntry in tqdm(funders):
    profile = build_profile(pub_df, fund_df, cntry)
    profile['srce_cntry'] = cntry
    profiles.append(profile)

# filter_df (created before this cell) is kept as the first piece so the
# result matches the incremental concatenation, including when funders is empty.
filter_df = pd.concat([filter_df, *profiles])

100%|██████████| 192/192 [1:55:06<00:00, 35.97s/it]  


In [20]:
# Collapse the discipline dimension: sum `count` over `dis` for every
# (author country, source country) pair.
filter_df=filter_df.groupby(['author_distinct','srce_cntry'])['count'].sum().reset_index()

In [30]:
# `filter` = publication count that remains after the removal step.
filter_df=filter_df.rename(columns={'count':'filter'})

In [31]:
# Raw (unfiltered) publication total per country.
raw_df = (
    pd.read_csv(raw_path)
    .groupby(['cntry'])['count']
    .sum()
    .reset_index(name='raw')
)
# Align the key name with raw_df so the two tables can be joined on `cntry`.
filter_df = filter_df.rename(columns={'author_distinct': 'cntry'})
# Default (inner) join: only countries present in both tables are kept.
meta = pd.merge(raw_df, filter_df, on=['cntry'])
# Share of a country's raw publications that disappears after the removal.
meta['p'] = meta['raw'].sub(meta['filter']).div(meta['raw'])

In [38]:
# Keep only the output columns, source country first.
meta=meta[['srce_cntry','cntry','raw','filter','p']]

In [40]:
# Final column names: `source` is the country whose funded papers were
# removed, `target` is the author country whose counts are compared.
meta=meta.rename(columns={'srce_cntry':'source','cntry':'target','raw':'raw_pub','filter':'filter_pub'})

In [41]:
# Written relative to the current working directory; the DataFrame index is
# saved as an unnamed first column (to_csv default index=True).
# NOTE(review): profile_path defined near the top is not used here -- confirm
# the intended output location.
meta.to_csv("pubred_all2all.csv")