This script builds the research profile of every country after the internationally funded publications of a **specific** country are removed. The removal criterion is: drop every paper in which funding from the specific country flows to other countries, e.g., a USA-funded paper that is authored by China.

**The removal is repeated for every funding country separately, one country at a time.**

In [1]:
import pandas as pd
import tqdm

In [2]:
# Default locations of the inputs and of the output file. The "# Parameters"
# cell further down assigns the same three names again, so those later values
# are the ones in effect when that cell is executed.
# pubs_path: pickled publication table; cntry_full_path: CSV read later to get
# the unfiltered per-country counts.
pubs_path="../../data/nf_folder/Data/DerivedData/CleanedRawData/pub.pkl"
cntry_full_path="../../data/nf_folder/Data/DerivedData/Derived/cntry_author_full.csv"

# Destination of the final CSV.
# NOTE(review): this path climbs three directory levels ("../../../") while the
# two input paths above climb two -- confirm this is intentional.
profile_path="../../../data/nf_folder/Data/DerivedData/Derived/dep-FundExchange/pub_no_foreign_individual_allcountries.csv"

In [3]:
# Parameters
# These assignments replace the default paths defined in the previous cell.
# NOTE(review): the "# Parameters" header and formatting look like a cell
# injected by a notebook runner (e.g. papermill) -- confirm before hand-editing.
pubs_path = "../data/NationalFunding/Data/DerivedData/CleanedRawData/pub.pkl"
cntry_full_path = (
    "../data/NationalFunding/Data/DerivedData/Derived/cntry_author_full.csv"
)
profile_path = "../data/NationalFunding/Data/DerivedData/Derived/dependence/pub_noforeign_fund_all2all.csv"


In [4]:
# Load the publication table. Later cells rely on its `id`, `dis`,
# `author_distinct` and `funder_distinct` columns.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
pub_df=pd.read_pickle(pubs_path)
# Show the first rows as the cell output for a quick sanity check.
pub_df.head()

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol,cofund,intfund
0,60694041,2017,9,1,"[Canada, United States]","[United States, United States]","[Canada, United States]",[United States],1,1,0,0
1,48573156,2012,89,1,[France],Not-Funded,[France],Not-Funded,0,0,0,0
2,55323671,2015,89,1,"[France, Chile]","[Chile, Chile, Chile, Chile, Chile]","[Chile, France]",[Chile],1,1,0,0
3,57633115,2016,31,1,"[Switzerland, United States]",[Switzerland],"[Switzerland, United States]",[Switzerland],1,1,0,0
4,64831413,2018,99,1,[China],"[China, China, China, China, China, China]",[China],[China],1,0,0,0


In [5]:
def get_publications_to_remove(fund_df,cntry):
    """Return the IDs of publications whose funding from `cntry` leaves the country.

    A publication is selected when `cntry` appears as its funder and its
    author-country list is anything other than `[cntry]` alone, i.e. the
    paper is internationally co-authored or authored entirely abroad.

    Parameters
    ----------
    fund_df : pandas.DataFrame
        Table with the columns ``id``, ``author_distinct`` (a sequence of
        author countries per row) and ``funder_distinct`` (a single funder
        value per row, compared to `cntry` with ``==``).
    cntry : str
        Funder country whose outgoing funding is being identified.

    Returns
    -------
    list
        Unique publication IDs in order of first appearance. The list is
        empty when `cntry` funds nothing in `fund_df`.
    """
    funded = fund_df[fund_df.funder_distinct == cntry]

    # Nothing funded by this country: return early instead of running a
    # row-wise apply on an empty frame, which yields a DataFrame (not a
    # Series) and cannot be stored as a single column.
    if funded.empty:
        return []

    def funding_leaves_country(authors):
        # A record without any author country cannot be attributed to a
        # foreign author, so it is kept rather than raising IndexError.
        if len(authors) == 0:
            return False
        return bool(len(authors) > 1 or authors[0] != cntry)

    flags = [funding_leaves_country(authors) for authors in funded['author_distinct']]

    return list(funded.loc[flags, 'id'].unique())

In [6]:
def build_country_profile(pub_df, fund_df, cntry):
    """Count publications per author country and discipline once the
    publications returned by `get_publications_to_remove(fund_df, cntry)`
    have been dropped from `pub_df`.

    Returns a DataFrame with the columns `author_distinct`, `dis` and
    `count`; a publication is counted once for each of its author countries.
    """
    excluded_ids = get_publications_to_remove(fund_df, cntry)

    # Keep only the publications that are not flagged for removal.
    keep_mask = ~pub_df['id'].isin(excluded_ids)
    kept = pub_df.loc[keep_mask, ['id', 'dis', 'author_distinct']]

    # One row per (publication, author country) before counting.
    per_author = kept.explode('author_distinct')
    counts = per_author.groupby(['author_distinct', 'dis']).size()

    return counts.reset_index(name='count')

In [7]:
# One row per (publication, funder) pair, so publications can be selected by
# a single funder value.
fund_df = pub_df[['id','author_distinct','funder_distinct']].explode('funder_distinct')
cntrys = fund_df.funder_distinct.unique()

# Build one profile per source (funder) country. The frames are collected in a
# list and concatenated once at the end: calling pd.concat inside the loop
# re-copies the accumulated result on every iteration and starts from an empty
# DataFrame, which newer pandas versions warn about.
profiles = []
for cntry in tqdm.tqdm(cntrys):
    country_profile = build_country_profile(pub_df, fund_df, cntry)
    country_profile['srce_cntry'] = cntry
    profiles.append(country_profile)

# pd.concat raises on an empty list, so fall back to an empty frame.
filter_df = pd.concat(profiles) if profiles else pd.DataFrame()

  0%|                                                   | 0/192 [00:00<?, ?it/s]

  1%|▏                                        | 1/192 [00:25<1:22:05, 25.79s/it]

  1%|▍                                        | 2/192 [01:17<2:09:06, 40.77s/it]

  2%|▋                                        | 3/192 [01:26<1:24:01, 26.67s/it]

  2%|▊                                        | 4/192 [01:37<1:04:00, 20.43s/it]

  3%|█                                        | 5/192 [02:03<1:09:28, 22.29s/it]

  3%|█▎                                         | 6/192 [02:15<58:34, 18.89s/it]

  4%|█▌                                         | 7/192 [02:28<52:30, 17.03s/it]

  4%|█▊                                         | 8/192 [02:42<48:26, 15.80s/it]

  5%|██                                         | 9/192 [02:52<42:48, 14.04s/it]

  5%|██▏                                       | 10/192 [03:05<41:47, 13.78s/it]

  6%|██▍                                       | 11/192 [03:18<41:08, 13.64s/it]

  6%|██▋                                       | 12/192 [03:30<38:52, 12.96s/it]

  7%|██▊                                       | 13/192 [03:41<37:31, 12.58s/it]

  7%|███                                       | 14/192 [03:51<35:01, 11.80s/it]

  8%|███▎                                      | 15/192 [04:01<33:13, 11.26s/it]

  8%|███▌                                      | 16/192 [04:14<34:11, 11.66s/it]

  9%|███▋                                      | 17/192 [04:24<33:01, 11.32s/it]

  9%|███▉                                      | 18/192 [04:34<31:41, 10.93s/it]

 10%|████▏                                     | 19/192 [04:44<30:31, 10.59s/it]

 10%|████▍                                     | 20/192 [04:55<30:42, 10.71s/it]

 11%|████▌                                     | 21/192 [05:07<31:24, 11.02s/it]

 11%|████▊                                     | 22/192 [05:18<31:00, 10.94s/it]

 12%|█████                                     | 23/192 [05:28<30:09, 10.71s/it]

 12%|█████▎                                    | 24/192 [05:39<30:41, 10.96s/it]

 13%|█████▍                                    | 25/192 [05:50<29:49, 10.72s/it]

 14%|█████▋                                    | 26/192 [06:00<29:23, 10.62s/it]

 14%|█████▉                                    | 27/192 [06:11<29:33, 10.75s/it]

 15%|██████▏                                   | 28/192 [06:23<30:35, 11.19s/it]

 15%|██████▎                                   | 29/192 [06:33<29:26, 10.84s/it]

 16%|██████▌                                   | 30/192 [06:43<28:16, 10.47s/it]

 16%|██████▊                                   | 31/192 [06:53<27:53, 10.39s/it]

 17%|███████                                   | 32/192 [07:04<28:21, 10.63s/it]

 17%|███████▏                                  | 33/192 [07:15<28:02, 10.58s/it]

 18%|███████▍                                  | 34/192 [07:25<27:17, 10.36s/it]

 18%|███████▋                                  | 35/192 [07:35<27:01, 10.33s/it]

 19%|███████▉                                  | 36/192 [07:45<26:50, 10.33s/it]

 19%|████████                                  | 37/192 [07:55<26:23, 10.21s/it]

 20%|████████▎                                 | 38/192 [08:05<26:15, 10.23s/it]

 20%|████████▌                                 | 39/192 [08:15<25:50, 10.13s/it]

 21%|████████▊                                 | 40/192 [08:25<25:29, 10.06s/it]

 21%|████████▉                                 | 41/192 [08:35<25:13, 10.02s/it]

 22%|█████████▏                                | 42/192 [08:45<25:04, 10.03s/it]

 22%|█████████▍                                | 43/192 [08:55<24:46,  9.98s/it]

 23%|█████████▋                                | 44/192 [09:05<24:33,  9.96s/it]

 23%|█████████▊                                | 45/192 [09:15<24:33, 10.02s/it]

 24%|██████████                                | 46/192 [09:25<24:22, 10.02s/it]

 24%|██████████▎                               | 47/192 [09:36<24:34, 10.17s/it]

 25%|██████████▌                               | 48/192 [09:46<24:10, 10.07s/it]

 26%|██████████▋                               | 49/192 [09:55<23:47,  9.98s/it]

 26%|██████████▉                               | 50/192 [10:05<23:31,  9.94s/it]

 27%|███████████▏                              | 51/192 [10:15<23:20,  9.93s/it]

 27%|███████████▍                              | 52/192 [10:25<23:08,  9.92s/it]

 28%|███████████▌                              | 53/192 [10:35<23:00,  9.93s/it]

 28%|███████████▊                              | 54/192 [10:45<22:42,  9.88s/it]

 29%|████████████                              | 55/192 [10:54<22:29,  9.85s/it]

 29%|████████████▎                             | 56/192 [11:04<22:22,  9.87s/it]

 30%|████████████▍                             | 57/192 [11:14<22:12,  9.87s/it]

 30%|████████████▋                             | 58/192 [11:24<22:02,  9.87s/it]

 31%|████████████▉                             | 59/192 [11:34<21:49,  9.85s/it]

 31%|█████████████▏                            | 60/192 [11:44<21:41,  9.86s/it]

 32%|█████████████▎                            | 61/192 [11:54<21:31,  9.86s/it]

 32%|█████████████▌                            | 62/192 [12:04<21:22,  9.86s/it]

 33%|█████████████▊                            | 63/192 [12:13<21:09,  9.84s/it]

 33%|██████████████                            | 64/192 [12:23<20:52,  9.78s/it]

 34%|██████████████▏                           | 65/192 [12:33<20:40,  9.77s/it]

 34%|██████████████▍                           | 66/192 [12:43<20:34,  9.80s/it]

 35%|██████████████▋                           | 67/192 [12:53<20:31,  9.86s/it]

 35%|██████████████▉                           | 68/192 [13:02<20:24,  9.88s/it]

 36%|███████████████                           | 69/192 [13:12<20:08,  9.83s/it]

 36%|███████████████▎                          | 70/192 [13:22<19:59,  9.83s/it]

 37%|███████████████▌                          | 71/192 [13:32<19:46,  9.81s/it]

 38%|███████████████▊                          | 72/192 [13:42<19:36,  9.80s/it]

 38%|███████████████▉                          | 73/192 [13:51<19:27,  9.81s/it]

 39%|████████████████▏                         | 74/192 [14:01<19:14,  9.79s/it]

 39%|████████████████▍                         | 75/192 [14:11<19:07,  9.81s/it]

 40%|████████████████▋                         | 76/192 [14:21<18:55,  9.79s/it]

 40%|████████████████▊                         | 77/192 [14:31<18:48,  9.81s/it]

 41%|█████████████████                         | 78/192 [14:40<18:40,  9.83s/it]

 41%|█████████████████▎                        | 79/192 [14:50<18:26,  9.79s/it]

 42%|█████████████████▌                        | 80/192 [15:00<18:13,  9.76s/it]

 42%|█████████████████▋                        | 81/192 [15:09<17:58,  9.71s/it]

 43%|█████████████████▉                        | 82/192 [15:19<17:48,  9.71s/it]

 43%|██████████████████▏                       | 83/192 [15:29<17:45,  9.78s/it]

 44%|██████████████████▍                       | 84/192 [15:39<17:36,  9.78s/it]

 44%|██████████████████▌                       | 85/192 [15:49<17:27,  9.79s/it]

 45%|██████████████████▊                       | 86/192 [15:58<17:15,  9.77s/it]

 45%|███████████████████                       | 87/192 [16:08<17:03,  9.75s/it]

 46%|███████████████████▎                      | 88/192 [16:18<16:53,  9.75s/it]

 46%|███████████████████▍                      | 89/192 [16:28<16:47,  9.78s/it]

 47%|███████████████████▋                      | 90/192 [16:38<16:37,  9.78s/it]

 47%|███████████████████▉                      | 91/192 [16:47<16:27,  9.77s/it]

 48%|████████████████████▏                     | 92/192 [16:57<16:15,  9.75s/it]

 48%|████████████████████▎                     | 93/192 [17:07<16:00,  9.70s/it]

 49%|████████████████████▌                     | 94/192 [17:16<15:51,  9.71s/it]

 49%|████████████████████▊                     | 95/192 [17:26<15:44,  9.74s/it]

 50%|█████████████████████                     | 96/192 [17:36<15:39,  9.79s/it]

 51%|█████████████████████▏                    | 97/192 [17:46<15:29,  9.78s/it]

 51%|█████████████████████▍                    | 98/192 [17:56<15:18,  9.77s/it]

 52%|█████████████████████▋                    | 99/192 [18:05<15:06,  9.75s/it]

 52%|█████████████████████▎                   | 100/192 [18:15<14:57,  9.75s/it]

 53%|█████████████████████▌                   | 101/192 [18:25<14:51,  9.79s/it]

 53%|█████████████████████▊                   | 102/192 [18:35<14:43,  9.81s/it]

 54%|█████████████████████▉                   | 103/192 [18:45<14:33,  9.82s/it]

 54%|██████████████████████▏                  | 104/192 [18:54<14:21,  9.79s/it]

 55%|██████████████████████▍                  | 105/192 [19:04<14:08,  9.75s/it]

 55%|██████████████████████▋                  | 106/192 [19:14<14:00,  9.77s/it]

 56%|██████████████████████▊                  | 107/192 [19:24<13:54,  9.82s/it]

 56%|███████████████████████                  | 108/192 [19:33<13:42,  9.79s/it]

 57%|███████████████████████▎                 | 109/192 [19:43<13:29,  9.76s/it]

 57%|███████████████████████▍                 | 110/192 [19:53<13:22,  9.79s/it]

 58%|███████████████████████▋                 | 111/192 [20:03<13:12,  9.79s/it]

 58%|███████████████████████▉                 | 112/192 [20:13<13:05,  9.82s/it]

 59%|████████████████████████▏                | 113/192 [20:22<12:54,  9.81s/it]

 59%|████████████████████████▎                | 114/192 [20:32<12:44,  9.80s/it]

 60%|████████████████████████▌                | 115/192 [20:42<12:34,  9.79s/it]

 60%|████████████████████████▊                | 116/192 [20:52<12:21,  9.75s/it]

 61%|████████████████████████▉                | 117/192 [21:01<12:11,  9.76s/it]

 61%|█████████████████████████▏               | 118/192 [21:11<12:07,  9.82s/it]

 62%|█████████████████████████▍               | 119/192 [21:21<11:56,  9.82s/it]

 62%|█████████████████████████▋               | 120/192 [21:31<11:45,  9.79s/it]

 63%|█████████████████████████▊               | 121/192 [21:41<11:36,  9.81s/it]

 64%|██████████████████████████               | 122/192 [21:50<11:24,  9.78s/it]

 64%|██████████████████████████▎              | 123/192 [22:00<11:16,  9.80s/it]

 65%|██████████████████████████▍              | 124/192 [22:10<11:08,  9.83s/it]

 65%|██████████████████████████▋              | 125/192 [22:20<10:57,  9.81s/it]

 66%|██████████████████████████▉              | 126/192 [22:30<10:47,  9.81s/it]

 66%|███████████████████████████              | 127/192 [22:40<10:36,  9.79s/it]

 67%|███████████████████████████▎             | 128/192 [22:49<10:26,  9.79s/it]

 67%|███████████████████████████▌             | 129/192 [22:59<10:17,  9.81s/it]

 68%|███████████████████████████▊             | 130/192 [23:09<10:10,  9.84s/it]

 68%|███████████████████████████▉             | 131/192 [23:19<09:58,  9.81s/it]

 69%|████████████████████████████▏            | 132/192 [23:29<09:46,  9.78s/it]

 69%|████████████████████████████▍            | 133/192 [23:38<09:36,  9.77s/it]

 70%|████████████████████████████▌            | 134/192 [23:48<09:24,  9.73s/it]

 70%|████████████████████████████▊            | 135/192 [23:58<09:16,  9.77s/it]

 71%|█████████████████████████████            | 136/192 [24:08<09:08,  9.80s/it]

 71%|█████████████████████████████▎           | 137/192 [24:17<08:59,  9.80s/it]

 72%|█████████████████████████████▍           | 138/192 [24:27<08:48,  9.79s/it]

 72%|█████████████████████████████▋           | 139/192 [24:37<08:37,  9.77s/it]

 73%|█████████████████████████████▉           | 140/192 [24:47<08:30,  9.81s/it]

 73%|██████████████████████████████           | 141/192 [24:57<08:22,  9.86s/it]

 74%|██████████████████████████████▎          | 142/192 [25:07<08:13,  9.86s/it]

 74%|██████████████████████████████▌          | 143/192 [25:16<08:01,  9.83s/it]

 75%|██████████████████████████████▊          | 144/192 [25:26<07:51,  9.82s/it]

 76%|██████████████████████████████▉          | 145/192 [25:36<07:40,  9.79s/it]

 76%|███████████████████████████████▏         | 146/192 [25:46<07:31,  9.81s/it]

 77%|███████████████████████████████▍         | 147/192 [25:56<07:22,  9.84s/it]

 77%|███████████████████████████████▌         | 148/192 [26:06<07:13,  9.84s/it]

 78%|███████████████████████████████▊         | 149/192 [26:15<07:02,  9.84s/it]

 78%|████████████████████████████████         | 150/192 [26:25<06:51,  9.80s/it]

 79%|████████████████████████████████▏        | 151/192 [26:35<06:43,  9.85s/it]

 79%|████████████████████████████████▍        | 152/192 [26:45<06:34,  9.87s/it]

 80%|████████████████████████████████▋        | 153/192 [26:55<06:24,  9.86s/it]

 80%|████████████████████████████████▉        | 154/192 [27:05<06:14,  9.85s/it]

 81%|█████████████████████████████████        | 155/192 [27:14<06:03,  9.82s/it]

 81%|█████████████████████████████████▎       | 156/192 [27:24<05:52,  9.79s/it]

 82%|█████████████████████████████████▌       | 157/192 [27:34<05:44,  9.84s/it]

 82%|█████████████████████████████████▋       | 158/192 [27:44<05:34,  9.84s/it]

 83%|█████████████████████████████████▉       | 159/192 [27:54<05:25,  9.86s/it]

 83%|██████████████████████████████████▏      | 160/192 [28:04<05:14,  9.83s/it]

 84%|██████████████████████████████████▍      | 161/192 [28:13<05:04,  9.81s/it]

 84%|██████████████████████████████████▌      | 162/192 [28:23<04:54,  9.81s/it]

 85%|██████████████████████████████████▊      | 163/192 [28:33<04:45,  9.84s/it]

 85%|███████████████████████████████████      | 164/192 [28:43<04:36,  9.86s/it]

 86%|███████████████████████████████████▏     | 165/192 [28:53<04:25,  9.85s/it]

 86%|███████████████████████████████████▍     | 166/192 [29:03<04:16,  9.87s/it]

 87%|███████████████████████████████████▋     | 167/192 [29:12<04:05,  9.83s/it]

 88%|███████████████████████████████████▉     | 168/192 [29:22<03:56,  9.84s/it]

 88%|████████████████████████████████████     | 169/192 [29:32<03:46,  9.86s/it]

 89%|████████████████████████████████████▎    | 170/192 [29:42<03:37,  9.87s/it]

 89%|████████████████████████████████████▌    | 171/192 [29:52<03:27,  9.87s/it]

 90%|████████████████████████████████████▋    | 172/192 [30:02<03:16,  9.83s/it]

 90%|████████████████████████████████████▉    | 173/192 [30:11<03:06,  9.80s/it]

 91%|█████████████████████████████████████▏   | 174/192 [30:21<02:56,  9.78s/it]

 91%|█████████████████████████████████████▎   | 175/192 [30:31<02:47,  9.85s/it]

 92%|█████████████████████████████████████▌   | 176/192 [30:41<02:37,  9.86s/it]

 92%|█████████████████████████████████████▊   | 177/192 [30:51<02:27,  9.86s/it]

 93%|██████████████████████████████████████   | 178/192 [31:01<02:16,  9.78s/it]

 93%|██████████████████████████████████████▏  | 179/192 [31:10<02:06,  9.74s/it]

 94%|██████████████████████████████████████▍  | 180/192 [31:20<01:57,  9.75s/it]

 94%|██████████████████████████████████████▋  | 181/192 [31:30<01:47,  9.81s/it]

 95%|██████████████████████████████████████▊  | 182/192 [31:40<01:38,  9.83s/it]

 95%|███████████████████████████████████████  | 183/192 [31:50<01:28,  9.84s/it]

 96%|███████████████████████████████████████▎ | 184/192 [31:59<01:18,  9.81s/it]

 96%|███████████████████████████████████████▌ | 185/192 [32:09<01:08,  9.80s/it]

 97%|███████████████████████████████████████▋ | 186/192 [32:19<00:59,  9.85s/it]

 97%|███████████████████████████████████████▉ | 187/192 [32:29<00:49,  9.87s/it]

 98%|████████████████████████████████████████▏| 188/192 [32:39<00:39,  9.90s/it]

 98%|████████████████████████████████████████▎| 189/192 [32:49<00:29,  9.88s/it]

 99%|████████████████████████████████████████▌| 190/192 [32:58<00:19,  9.79s/it]

 99%|████████████████████████████████████████▊| 191/192 [33:08<00:09,  9.78s/it]

100%|█████████████████████████████████████████| 192/192 [33:18<00:00,  9.78s/it]

100%|█████████████████████████████████████████| 192/192 [33:18<00:00, 10.41s/it]




In [8]:
# Sum the per-discipline counts so that each (author country, source country)
# pair has a single total, stored in the column `filter`.
filter_df = (
    filter_df
    .groupby(['author_distinct', 'srce_cntry'])['count']
    .sum()
    .reset_index()
    .rename(columns={'count': 'filter'})
)

In [9]:
# Baseline: total `count` per country from the unfiltered table, stored in the
# column `raw`.
raw_df = (
    pd.read_csv(cntry_full_path)
    .groupby(['cntry'])['count']
    .sum()
    .reset_index(name='raw')
)

In [10]:
# Attach the baseline total to every (author country, source country) pair.
# NOTE(review): this is an inner join, so a pair that has no remaining
# publications in `filter_df` produces no row here -- confirm that is intended.
meta = pd.merge(raw_df, filter_df, left_on='cntry', right_on='author_distinct')

# Share of the baseline count that disappears after the removal.
meta['p'] = meta['raw'].sub(meta['filter']).div(meta['raw'])

In [11]:
# Keep the reporting columns under their output names, then drop the rows
# whose source is the 'Not-Funded' placeholder.
output_names = {
    'srce_cntry': 'source',
    'cntry': 'target',
    'raw': 'raw_pub',
    'filter': 'filter_pub',
}
meta = meta[['srce_cntry', 'cntry', 'raw', 'filter', 'p']].rename(columns=output_names)
meta = meta.loc[meta['source'] != 'Not-Funded']

In [12]:
# Write the final table to `profile_path` as CSV, without the DataFrame index.
meta.to_csv(profile_path, index=False)