This script builds the research profile of every country after the internationally funded publications of one **specific** funding country are removed. The removal criterion is: drop a paper whenever funding from that specific country flows to other countries, i.e. the paper is authored by another country or is internationally co-authored (e.g., a USA-funded paper authored by China).

**Every funding country is removed in turn, one at a time.**

In [1]:
import pandas as pd
import tqdm

In [2]:
# Default input/output locations for this notebook.
# NOTE(review): these look like notebook parameter defaults; the same three
# names are reassigned in the "# Parameters" cell further down — confirm which
# set is meant to be authoritative.
# pubs_path: pickled publication table; cntry_full_path: CSV read later to get
# the raw (unfiltered) per-country counts.
pubs_path="../../data/nf_folder/Data/DerivedData/CleanedRawData/pub.pkl"
cntry_full_path="../../data/nf_folder/Data/DerivedData/Derived/cntry_author_full.csv"

# Output CSV written at the end of the notebook.
# NOTE(review): this path starts with "../../../" while the two inputs above
# start with "../../" — confirm the extra level is intended.
profile_path="../../../data/nf_folder/Data/DerivedData/Derived/dep-FundExchange/pub_no_foreign_individual_allcountries.csv"

In [3]:
# Parameters
# These assignments rebind the same three path names, so they take precedence
# over any earlier values when the cells run in order.
# NOTE(review): presumably injected by a notebook-parameterisation tool such as
# papermill — confirm before editing by hand.
pubs_path = "../data/NationalFunding/Data/simulated_data/Data/DerivedData/CleanedRawData/pub.pkl"
cntry_full_path = "../data/NationalFunding/Data/simulated_data/Data/DerivedData/Derived/cntry_author_full.csv"
profile_path = "../data/NationalFunding/Data/simulated_data/Data/DerivedData/Derived/dependence/pub_noforeign_fund_all2all.csv"


In [4]:
# Load the publication table from the pickle at `pubs_path`.
# NOTE(review): unpickling can execute arbitrary code, so `pubs_path` must only
# ever point at a trusted, locally produced file.
pub_df=pd.read_pickle(pubs_path)
# Preview the first rows to sanity-check the columns used below
# (`id`, `dis`, `author_distinct`, `funder_distinct`).
pub_df.head()

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol,cofund,intfund
0,0,2014,29,1,[Canada],"[United States, Brazil, United States, Republi...",[Canada],"[Republic of Serbia, China, Brazil, United Sta...",1,0,1,1
1,1,2017,32,1,"[Russia, Spain]","[Austria, China, China, China, United States, ...","[Russia, Spain]","[China, United States, Austria, Russia, Japan]",1,1,1,1
2,2,2012,15,1,[Turkey],Not-Funded,[Turkey],Not-Funded,0,0,0,0
3,3,2010,53,1,"[United States, Iran]",[China],"[Iran, United States]",[China],1,1,0,1
4,4,2015,22,1,[Denmark],Not-Funded,[Denmark],Not-Funded,0,0,0,0


In [5]:
def get_publications_to_remove(fund_df, cntry):
    """Return the IDs of publications whose funding from `cntry` crosses borders.

    A publication is selected when it has a funding row for `cntry` and its
    author-country list is anything other than exactly ``[cntry]`` — i.e. it is
    internationally co-authored, or authored entirely by other countries.

    Parameters
    ----------
    fund_df : pandas.DataFrame
        One row per (publication, funding country) pair, with columns ``id``,
        ``author_distinct`` (sequence of author countries) and
        ``funder_distinct`` (a single funding country).
    cntry : str
        The funding country whose outgoing funding is being removed.

    Returns
    -------
    list
        Unique publication IDs to drop; empty when `cntry` funds nothing in
        `fund_df` or funds only purely domestic papers.
    """
    funded = fund_df[fund_df['funder_distinct'] == cntry]
    # No matching rows (unknown country, or NaN which never compares equal):
    # bail out early instead of running a row-wise apply on an empty frame,
    # which does not yield a Series and can fail on assignment.
    if funded.empty:
        return []

    def is_cross_border(authors):
        # The length test short-circuits, so an empty author list is never
        # indexed; such a paper is not purely domestic and is selected.
        return len(authors) != 1 or authors[0] != cntry

    cross_border = funded['author_distinct'].map(is_cross_border)
    return list(funded.loc[cross_border, 'id'].unique())

In [6]:
def build_country_profile(pub_df, fund_df, cntry):
    """Count publications per (author country, discipline) once `cntry`'s
    cross-border funded papers have been dropped.

    The IDs to drop come from ``get_publications_to_remove(fund_df, cntry)``.
    Returns a DataFrame with columns ``author_distinct``, ``dis`` and
    ``count``.
    """
    excluded_ids = get_publications_to_remove(fund_df, cntry)
    keep_mask = ~pub_df['id'].isin(excluded_ids)
    kept = pub_df.loc[keep_mask, ['id', 'dis', 'author_distinct']]

    # Full counting: a paper contributes one unit to every author country on it.
    per_country = kept.explode('author_distinct')
    tallies = per_country.groupby(['author_distinct', 'dis']).size()

    return tallies.reset_index(name='count')

In [7]:
# One row per (publication, funding country) pair.
fund_df = pub_df[['id','author_distinct','funder_distinct']].explode('funder_distinct')
# NaN funders can never match an equality filter, so skip them up front.
cntrys = fund_df.funder_distinct.dropna().unique()

# Build one profile per removed funding country and concatenate once at the
# end: calling pd.concat inside the loop re-copies every previously
# accumulated row on each iteration.
profiles = []
for cntry in tqdm.tqdm(cntrys):
    country_profile = build_country_profile(pub_df, fund_df, cntry)
    # Tag each profile with the funding country that was removed.
    country_profile['srce_cntry'] = cntry
    profiles.append(country_profile)

# pd.concat raises on an empty list, so fall back to an empty frame.
filter_df = pd.concat(profiles) if profiles else pd.DataFrame()

  0%|                                                   | 0/161 [00:00<?, ?it/s]

  1%|▎                                          | 1/161 [00:00<01:14,  2.14it/s]

  1%|▌                                          | 2/161 [00:03<04:34,  1.73s/it]

  2%|▊                                          | 3/161 [00:03<03:22,  1.28s/it]

  2%|█                                          | 4/161 [00:05<03:50,  1.47s/it]

  3%|█▎                                         | 5/161 [00:06<02:53,  1.11s/it]

  4%|█▌                                         | 6/161 [00:06<02:24,  1.07it/s]

  4%|█▊                                         | 7/161 [00:07<02:14,  1.14it/s]

  5%|██▏                                        | 8/161 [00:09<03:32,  1.39s/it]

  6%|██▍                                        | 9/161 [00:10<02:51,  1.13s/it]

  6%|██▌                                       | 10/161 [00:11<02:28,  1.01it/s]

  7%|██▊                                       | 11/161 [00:11<02:08,  1.17it/s]

  7%|███▏                                      | 12/161 [00:12<02:03,  1.21it/s]

  8%|███▍                                      | 13/161 [00:13<02:01,  1.22it/s]

  9%|███▋                                      | 14/161 [00:13<01:45,  1.39it/s]

  9%|███▉                                      | 15/161 [00:14<01:39,  1.46it/s]

 10%|████▏                                     | 16/161 [00:15<01:41,  1.43it/s]

 11%|████▍                                     | 17/161 [00:15<01:32,  1.55it/s]

 11%|████▋                                     | 18/161 [00:16<01:32,  1.54it/s]

 12%|████▉                                     | 19/161 [00:16<01:32,  1.54it/s]

 12%|█████▏                                    | 20/161 [00:17<01:25,  1.66it/s]

 13%|█████▍                                    | 21/161 [00:17<01:20,  1.73it/s]

 14%|█████▋                                    | 22/161 [00:18<01:15,  1.83it/s]

 14%|██████                                    | 23/161 [00:19<01:18,  1.76it/s]

 15%|██████▎                                   | 24/161 [00:19<01:14,  1.84it/s]

 16%|██████▌                                   | 25/161 [00:20<01:17,  1.75it/s]

 16%|██████▊                                   | 26/161 [00:20<01:15,  1.78it/s]

 17%|███████                                   | 27/161 [00:21<01:13,  1.83it/s]

 17%|███████▎                                  | 28/161 [00:21<01:11,  1.86it/s]

 18%|███████▌                                  | 29/161 [00:22<01:17,  1.71it/s]

 19%|███████▊                                  | 30/161 [00:22<01:15,  1.73it/s]

 19%|████████                                  | 31/161 [00:23<01:14,  1.74it/s]

 20%|████████▎                                 | 32/161 [00:23<01:10,  1.83it/s]

 20%|████████▌                                 | 33/161 [00:24<01:07,  1.90it/s]

 21%|████████▊                                 | 34/161 [00:24<01:05,  1.93it/s]

 22%|█████████▏                                | 35/161 [00:25<01:03,  1.99it/s]

 22%|█████████▍                                | 36/161 [00:25<01:04,  1.95it/s]

 23%|█████████▋                                | 37/161 [00:26<01:02,  1.99it/s]

 24%|█████████▉                                | 38/161 [00:26<01:02,  1.97it/s]

 24%|██████████▏                               | 39/161 [00:27<01:01,  1.98it/s]

 25%|██████████▍                               | 40/161 [00:27<01:01,  1.98it/s]

 25%|██████████▋                               | 41/161 [00:28<00:59,  2.01it/s]

 26%|██████████▉                               | 42/161 [00:28<00:59,  2.01it/s]

 27%|███████████▏                              | 43/161 [00:29<00:57,  2.05it/s]

 27%|███████████▍                              | 44/161 [00:29<00:56,  2.07it/s]

 28%|███████████▋                              | 45/161 [00:30<00:55,  2.07it/s]

 29%|████████████                              | 46/161 [00:30<00:54,  2.10it/s]

 29%|████████████▎                             | 47/161 [00:31<00:56,  2.03it/s]

 30%|████████████▌                             | 48/161 [00:31<00:56,  2.01it/s]

 30%|████████████▊                             | 49/161 [00:32<00:57,  1.96it/s]

 31%|█████████████                             | 50/161 [00:32<00:55,  2.00it/s]

 32%|█████████████▎                            | 51/161 [00:33<00:54,  2.00it/s]

 32%|█████████████▌                            | 52/161 [00:33<00:53,  2.03it/s]

 33%|█████████████▊                            | 53/161 [00:34<00:52,  2.06it/s]

 34%|██████████████                            | 54/161 [00:34<00:52,  2.03it/s]

 34%|██████████████▎                           | 55/161 [00:35<00:51,  2.05it/s]

 35%|██████████████▌                           | 56/161 [00:35<00:52,  2.01it/s]

 35%|██████████████▊                           | 57/161 [00:36<00:50,  2.04it/s]

 36%|███████████████▏                          | 58/161 [00:36<00:50,  2.05it/s]

 37%|███████████████▍                          | 59/161 [00:37<00:49,  2.07it/s]

 37%|███████████████▋                          | 60/161 [00:37<00:48,  2.09it/s]

 38%|███████████████▉                          | 61/161 [00:38<00:47,  2.10it/s]

 39%|████████████████▏                         | 62/161 [00:38<00:46,  2.11it/s]

 39%|████████████████▍                         | 63/161 [00:39<00:46,  2.11it/s]

 40%|████████████████▋                         | 64/161 [00:39<00:45,  2.11it/s]

 40%|████████████████▉                         | 65/161 [00:40<00:45,  2.11it/s]

 41%|█████████████████▏                        | 66/161 [00:40<00:45,  2.09it/s]

 42%|█████████████████▍                        | 67/161 [00:41<00:44,  2.10it/s]

 42%|█████████████████▋                        | 68/161 [00:41<00:44,  2.09it/s]

 43%|██████████████████                        | 69/161 [00:42<00:44,  2.09it/s]

 43%|██████████████████▎                       | 70/161 [00:42<00:43,  2.08it/s]

 44%|██████████████████▌                       | 71/161 [00:42<00:43,  2.08it/s]

 45%|██████████████████▊                       | 72/161 [00:43<00:42,  2.07it/s]

 45%|███████████████████                       | 73/161 [00:43<00:42,  2.08it/s]

 46%|███████████████████▎                      | 74/161 [00:44<00:41,  2.08it/s]

 47%|███████████████████▌                      | 75/161 [00:44<00:41,  2.08it/s]

 47%|███████████████████▊                      | 76/161 [00:45<00:40,  2.08it/s]

 48%|████████████████████                      | 77/161 [00:45<00:41,  2.05it/s]

 48%|████████████████████▎                     | 78/161 [00:46<00:40,  2.04it/s]

 49%|████████████████████▌                     | 79/161 [00:46<00:40,  2.05it/s]

 50%|████████████████████▊                     | 80/161 [00:47<00:39,  2.06it/s]

 50%|█████████████████████▏                    | 81/161 [00:47<00:38,  2.06it/s]

 51%|█████████████████████▍                    | 82/161 [00:48<00:38,  2.06it/s]

 52%|█████████████████████▋                    | 83/161 [00:48<00:37,  2.06it/s]

 52%|█████████████████████▉                    | 84/161 [00:49<00:37,  2.06it/s]

 53%|██████████████████████▏                   | 85/161 [00:49<00:37,  2.05it/s]

 53%|██████████████████████▍                   | 86/161 [00:50<00:36,  2.05it/s]

 54%|██████████████████████▋                   | 87/161 [00:50<00:35,  2.06it/s]

 55%|██████████████████████▉                   | 88/161 [00:51<00:35,  2.06it/s]

 55%|███████████████████████▏                  | 89/161 [00:51<00:35,  2.01it/s]

 56%|███████████████████████▍                  | 90/161 [00:52<00:34,  2.04it/s]

 57%|███████████████████████▋                  | 91/161 [00:52<00:34,  2.05it/s]

 57%|████████████████████████                  | 92/161 [00:53<00:33,  2.05it/s]

 58%|████████████████████████▎                 | 93/161 [00:53<00:33,  2.05it/s]

 58%|████████████████████████▌                 | 94/161 [00:54<00:32,  2.05it/s]

 59%|████████████████████████▊                 | 95/161 [00:54<00:32,  2.06it/s]

 60%|█████████████████████████                 | 96/161 [00:55<00:31,  2.07it/s]

 60%|█████████████████████████▎                | 97/161 [00:55<00:30,  2.07it/s]

 61%|█████████████████████████▌                | 98/161 [00:56<00:30,  2.07it/s]

 61%|█████████████████████████▊                | 99/161 [00:56<00:29,  2.07it/s]

 62%|█████████████████████████▍               | 100/161 [00:57<00:29,  2.07it/s]

 63%|█████████████████████████▋               | 101/161 [00:57<00:28,  2.07it/s]

 63%|█████████████████████████▉               | 102/161 [00:58<00:28,  2.07it/s]

 64%|██████████████████████████▏              | 103/161 [00:58<00:28,  2.07it/s]

 65%|██████████████████████████▍              | 104/161 [00:59<00:27,  2.06it/s]

 65%|██████████████████████████▋              | 105/161 [00:59<00:27,  2.05it/s]

 66%|██████████████████████████▉              | 106/161 [01:00<00:26,  2.05it/s]

 66%|███████████████████████████▏             | 107/161 [01:00<00:26,  2.06it/s]

 67%|███████████████████████████▌             | 108/161 [01:00<00:25,  2.06it/s]

 68%|███████████████████████████▊             | 109/161 [01:01<00:25,  2.05it/s]

 68%|████████████████████████████             | 110/161 [01:01<00:24,  2.05it/s]

 69%|████████████████████████████▎            | 111/161 [01:02<00:24,  2.05it/s]

 70%|████████████████████████████▌            | 112/161 [01:02<00:24,  2.04it/s]

 70%|████████████████████████████▊            | 113/161 [01:03<00:23,  2.05it/s]

 71%|█████████████████████████████            | 114/161 [01:03<00:22,  2.05it/s]

 71%|█████████████████████████████▎           | 115/161 [01:04<00:22,  2.05it/s]

 72%|█████████████████████████████▌           | 116/161 [01:04<00:22,  2.01it/s]

 73%|█████████████████████████████▊           | 117/161 [01:05<00:21,  2.02it/s]

 73%|██████████████████████████████           | 118/161 [01:05<00:21,  2.03it/s]

 74%|██████████████████████████████▎          | 119/161 [01:06<00:20,  2.03it/s]

 75%|██████████████████████████████▌          | 120/161 [01:06<00:20,  2.05it/s]

 75%|██████████████████████████████▊          | 121/161 [01:07<00:19,  2.04it/s]

 76%|███████████████████████████████          | 122/161 [01:07<00:19,  2.04it/s]

 76%|███████████████████████████████▎         | 123/161 [01:08<00:18,  2.03it/s]

 77%|███████████████████████████████▌         | 124/161 [01:08<00:18,  2.04it/s]

 78%|███████████████████████████████▊         | 125/161 [01:09<00:17,  2.04it/s]

 78%|████████████████████████████████         | 126/161 [01:09<00:17,  2.05it/s]

 79%|████████████████████████████████▎        | 127/161 [01:10<00:16,  2.03it/s]

 80%|████████████████████████████████▌        | 128/161 [01:10<00:16,  2.03it/s]

 80%|████████████████████████████████▊        | 129/161 [01:11<00:15,  2.03it/s]

 81%|█████████████████████████████████        | 130/161 [01:11<00:15,  2.03it/s]

 81%|█████████████████████████████████▎       | 131/161 [01:12<00:15,  1.97it/s]

 82%|█████████████████████████████████▌       | 132/161 [01:12<00:14,  1.99it/s]

 83%|█████████████████████████████████▊       | 133/161 [01:13<00:14,  1.99it/s]

 83%|██████████████████████████████████       | 134/161 [01:13<00:13,  2.01it/s]

 84%|██████████████████████████████████▍      | 135/161 [01:14<00:12,  2.03it/s]

 84%|██████████████████████████████████▋      | 136/161 [01:14<00:12,  2.03it/s]

 85%|██████████████████████████████████▉      | 137/161 [01:15<00:11,  2.04it/s]

 86%|███████████████████████████████████▏     | 138/161 [01:15<00:11,  2.02it/s]

 86%|███████████████████████████████████▍     | 139/161 [01:16<00:10,  2.03it/s]

 87%|███████████████████████████████████▋     | 140/161 [01:16<00:10,  2.03it/s]

 88%|███████████████████████████████████▉     | 141/161 [01:17<00:09,  2.03it/s]

 88%|████████████████████████████████████▏    | 142/161 [01:17<00:09,  2.02it/s]

 89%|████████████████████████████████████▍    | 143/161 [01:18<00:08,  2.03it/s]

 89%|████████████████████████████████████▋    | 144/161 [01:18<00:08,  2.03it/s]

 90%|████████████████████████████████████▉    | 145/161 [01:19<00:07,  2.04it/s]

 91%|█████████████████████████████████████▏   | 146/161 [01:19<00:07,  2.03it/s]

 91%|█████████████████████████████████████▍   | 147/161 [01:20<00:06,  2.02it/s]

 92%|█████████████████████████████████████▋   | 148/161 [01:20<00:06,  2.02it/s]

 93%|█████████████████████████████████████▉   | 149/161 [01:21<00:05,  2.02it/s]

 93%|██████████████████████████████████████▏  | 150/161 [01:21<00:05,  2.04it/s]

 94%|██████████████████████████████████████▍  | 151/161 [01:22<00:04,  2.04it/s]

 94%|██████████████████████████████████████▋  | 152/161 [01:22<00:04,  2.05it/s]

 95%|██████████████████████████████████████▉  | 153/161 [01:23<00:03,  2.05it/s]

 96%|███████████████████████████████████████▏ | 154/161 [01:23<00:03,  2.03it/s]

 96%|███████████████████████████████████████▍ | 155/161 [01:24<00:02,  2.04it/s]

 97%|███████████████████████████████████████▋ | 156/161 [01:24<00:02,  2.04it/s]

 98%|███████████████████████████████████████▉ | 157/161 [01:25<00:01,  2.04it/s]

 98%|████████████████████████████████████████▏| 158/161 [01:25<00:01,  2.06it/s]

 99%|████████████████████████████████████████▍| 159/161 [01:26<00:00,  2.06it/s]

 99%|████████████████████████████████████████▋| 160/161 [01:26<00:00,  2.05it/s]

100%|█████████████████████████████████████████| 161/161 [01:27<00:00,  2.07it/s]

100%|█████████████████████████████████████████| 161/161 [01:27<00:00,  1.85it/s]




In [8]:
# Collapse disciplines: total remaining publications per author country for
# each removed funding country, stored in a column named 'filter'.
filter_df = (
    filter_df
    .groupby(['author_distinct', 'srce_cntry'])['count']
    .sum()
    .reset_index(name='filter')
)

In [9]:
# Baseline: total publication count per country from the unfiltered CSV,
# stored in a column named 'raw'.
raw_df = (
    pd.read_csv(cntry_full_path)
    .groupby('cntry')['count']
    .sum()
    .reset_index(name='raw')
)

In [10]:
# Pair every country's baseline count with its filtered counts (inner join on
# the country name), then compute the share of publications lost.
meta = pd.merge(raw_df, filter_df, left_on='cntry', right_on='author_distinct')
lost_pubs = meta['raw'] - meta['filter']
meta['p'] = lost_pubs / meta['raw']

In [11]:
# Keep the reporting columns, drop the 'Not-Funded' pseudo-source, and switch
# to the output column names.
output_names = {
    'srce_cntry': 'source',
    'cntry': 'target',
    'raw': 'raw_pub',
    'filter': 'filter_pub',
}
is_real_source = meta['srce_cntry'] != 'Not-Funded'
meta = meta.loc[is_real_source, ['srce_cntry', 'cntry', 'raw', 'filter', 'p']]
meta = meta.rename(columns=output_names)

In [12]:
# Write the source/target table to `profile_path` as CSV, without the
# DataFrame index column.
meta.to_csv(profile_path, index=False)