This script builds the research profile of countries after the internationally funded publications of a **specific** funding country are removed. The removal criterion is: a paper is removed when funding from the specific country flows to other countries, i.e., the paper is funded by that country but is not authored solely by it, e.g., a USA-funded paper authored by China.

**Each funding country is removed in turn, one at a time; every scenario starts again from the full set of publications.**

In [1]:
import pandas as pd
import tqdm

In [2]:
# Default input locations: the pickled publication table and the CSV of
# per-country author counts.
pubs_path = (
    "../../data/nf_folder/Data/DerivedData/CleanedRawData/pub.pkl"
)
cntry_full_path = (
    "../../data/nf_folder/Data/DerivedData/Derived/cntry_author_full.csv"
)

# Default output location for the resulting source/target table.
# NOTE(review): this path climbs three directory levels while the two input
# paths climb two -- confirm that the difference is intended.
profile_path = (
    "../../../data/nf_folder/Data/DerivedData/Derived/dep-FundExchange/"
    "pub_no_foreign_individual_allcountries.csv"
)

In [3]:
# Parameters
# NOTE(review): this looks like an injected parameters cell (e.g. from
# papermill); it re-assigns the three path variables set by the defaults
# cell earlier in the notebook -- confirm.
pubs_path = (
    "../data/NationalFunding/Data/DerivedData/"
    "CleanedRawData/pub.pkl"
)
cntry_full_path = "../data/NationalFunding/Data/DerivedData/Derived/cntry_author_full.csv"
profile_path = (
    "../data/NationalFunding/Data/DerivedData/Derived/"
    "dependence/pub_noforeign_fund_all2all.csv"
)


In [4]:
# Load the publication table from the pickle at `pubs_path`.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
pub_df=pd.read_pickle(pubs_path)
# Preview the first rows (notebook display only).
pub_df.head()

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol,cofund,intfund
0,60694041,2017,9,1,"[Canada, United States]","[United States, United States]","[Canada, United States]",[United States],1,1,0,0
1,48573156,2012,89,1,[France],Not-Funded,[France],Not-Funded,0,0,0,0
2,55323671,2015,89,1,"[France, Chile]","[Chile, Chile, Chile, Chile, Chile]","[France, Chile]",[Chile],1,1,0,0
3,57633115,2016,31,1,"[Switzerland, United States]",[Switzerland],"[Switzerland, United States]",[Switzerland],1,1,0,0
4,64831413,2018,99,1,[China],"[China, China, China, China, China, China]",[China],[China],1,0,0,0


In [5]:
def get_publications_to_remove(fund_df,cntry):
    """Return the IDs of publications funded by `cntry` whose funding goes abroad.

    A publication is selected when `cntry` is one of its funders and its
    author-country list is anything other than exactly `[cntry]`, i.e. it is
    internationally co-authored or authored entirely by other countries.

    Parameters
    ----------
    fund_df : DataFrame
        One row per (publication, funder country), with columns ``id``,
        ``author_distinct`` (list of author countries) and
        ``funder_distinct`` (a single funder country).
    cntry : str
        The funding country whose outgoing funding is being removed.

    Returns
    -------
    list
        Unique publication IDs to remove, in order of first appearance.
        Empty when `cntry` funds no publication in `fund_df`.

    Notes
    -----
    A publication with an empty author list is not "solely authored by
    `cntry`" and is therefore included in the result instead of raising
    ``IndexError``.
    """
    funded = fund_df[fund_df.funder_distinct == cntry]

    # Only one column is needed, so map over it directly instead of using a
    # row-wise DataFrame.apply: this is faster and, unlike apply(axis=1),
    # still yields a 1-D boolean mask when no row matches `cntry`.
    flows_abroad = (
        funded['author_distinct']
        .map(lambda authors: list(authors) != [cntry])
        .astype(bool)
    )

    # Use a positional (NumPy) mask: the exploded frame may carry repeated
    # index labels, so label alignment is deliberately avoided.
    return list(funded.loc[flows_abroad.to_numpy(), 'id'].unique())

In [6]:
def build_country_profile(pub_df, fund_df, cntry):
    """Count publications per (author country, discipline) once the
    publications selected for removal for `cntry` are dropped.

    The IDs to drop come from ``get_publications_to_remove(fund_df, cntry)``.
    The remaining papers are assigned to every country in their
    ``author_distinct`` list (full counting).

    Returns a DataFrame with columns ``author_distinct``, ``dis`` and
    ``count``.
    """
    excluded_ids = get_publications_to_remove(fund_df, cntry)

    # Keep only the surviving papers and the columns needed for counting.
    survives = ~pub_df['id'].isin(excluded_ids)
    kept = pub_df.loc[survives, ['id', 'dis', 'author_distinct']]

    # Full counting: one row per (paper, author country).
    per_author = kept.explode('author_distinct')

    counts = per_author.groupby(['author_distinct', 'dis']).size()
    return counts.reset_index(name='count')

In [7]:
# One row per (publication, funder country); `author_distinct` stays a list.
fund_df = pub_df[['id','author_distinct','funder_distinct']].explode('funder_distinct')
cntrys = fund_df.funder_distinct.unique()

# Build one profile per removed funder country. The frames are collected in a
# list and concatenated once at the end: calling pd.concat inside the loop
# re-copies the whole accumulated table on every iteration, and seeding the
# accumulation with an empty DataFrame relies on pandas' special handling of
# empty frames in concat.
profiles = []
for cntry in tqdm.tqdm(cntrys):
    country_profile = build_country_profile(pub_df, fund_df, cntry)
    country_profile['srce_cntry'] = cntry  # tag rows with the removed funder
    profiles.append(country_profile)

# pd.concat raises on an empty list, so fall back to an empty frame.
filter_df = pd.concat(profiles) if profiles else pd.DataFrame()

  0%|                                                   | 0/192 [00:00<?, ?it/s]

  1%|▏                                        | 1/192 [00:26<1:24:03, 26.40s/it]

  1%|▍                                        | 2/192 [01:18<2:10:45, 41.29s/it]

  2%|▋                                        | 3/192 [01:27<1:24:29, 26.82s/it]

  2%|▊                                        | 4/192 [01:38<1:03:43, 20.34s/it]

  3%|█                                        | 5/192 [02:03<1:08:40, 22.04s/it]

  3%|█▎                                         | 6/192 [02:14<57:23, 18.51s/it]

  4%|█▌                                         | 7/192 [02:27<51:19, 16.64s/it]

  4%|█▊                                         | 8/192 [02:40<47:15, 15.41s/it]

  5%|██                                         | 9/192 [02:50<41:32, 13.62s/it]

  5%|██▏                                       | 10/192 [03:02<40:25, 13.33s/it]

  6%|██▍                                       | 11/192 [03:15<39:22, 13.05s/it]

  6%|██▋                                       | 12/192 [03:26<37:16, 12.43s/it]

  7%|██▊                                       | 13/192 [03:37<35:58, 12.06s/it]

  7%|███                                       | 14/192 [03:46<33:30, 11.30s/it]

  8%|███▎                                      | 15/192 [03:56<31:40, 10.74s/it]

  8%|███▌                                      | 16/192 [04:08<32:33, 11.10s/it]

  9%|███▋                                      | 17/192 [04:18<31:30, 10.80s/it]

  9%|███▉                                      | 18/192 [04:28<30:17, 10.44s/it]

 10%|████▏                                     | 19/192 [04:37<29:11, 10.12s/it]

 10%|████▍                                     | 20/192 [04:47<29:16, 10.21s/it]

 11%|████▌                                     | 21/192 [04:58<29:50, 10.47s/it]

 11%|████▊                                     | 22/192 [05:09<29:22, 10.37s/it]

 12%|█████                                     | 23/192 [05:18<28:36, 10.16s/it]

 12%|█████▎                                    | 24/192 [05:29<29:11, 10.43s/it]

 13%|█████▍                                    | 25/192 [05:39<28:17, 10.16s/it]

 14%|█████▋                                    | 26/192 [05:49<27:48, 10.05s/it]

 14%|█████▉                                    | 27/192 [05:59<28:07, 10.23s/it]

 15%|██████▏                                   | 28/192 [06:11<29:16, 10.71s/it]

 15%|██████▎                                   | 29/192 [06:21<28:08, 10.36s/it]

 16%|██████▌                                   | 30/192 [06:30<27:01, 10.01s/it]

 16%|██████▊                                   | 31/192 [06:39<26:34,  9.91s/it]

 17%|███████                                   | 32/192 [06:50<27:05, 10.16s/it]

 17%|███████▏                                  | 33/192 [07:00<26:43, 10.08s/it]

 18%|███████▍                                  | 34/192 [07:09<25:53,  9.83s/it]

 18%|███████▋                                  | 35/192 [07:19<25:36,  9.79s/it]

 19%|███████▉                                  | 36/192 [07:29<25:25,  9.78s/it]

 19%|████████                                  | 37/192 [07:38<25:00,  9.68s/it]

 20%|████████▎                                 | 38/192 [07:48<24:50,  9.68s/it]

 20%|████████▌                                 | 39/192 [07:57<24:25,  9.58s/it]

 21%|████████▊                                 | 40/192 [08:07<24:06,  9.52s/it]

 21%|████████▉                                 | 41/192 [08:16<23:52,  9.49s/it]

 22%|█████████▏                                | 42/192 [08:25<23:40,  9.47s/it]

 22%|█████████▍                                | 43/192 [08:35<23:23,  9.42s/it]

 23%|█████████▋                                | 44/192 [08:44<23:05,  9.36s/it]

 23%|█████████▊                                | 45/192 [08:54<23:05,  9.43s/it]

 24%|██████████                                | 46/192 [09:03<22:57,  9.44s/it]

 24%|██████████▎                               | 47/192 [09:13<23:05,  9.56s/it]

 25%|██████████▌                               | 48/192 [09:22<22:47,  9.49s/it]

 26%|██████████▋                               | 49/192 [09:31<22:27,  9.42s/it]

 26%|██████████▉                               | 50/192 [09:41<22:16,  9.41s/it]

 27%|███████████▏                              | 51/192 [09:50<22:05,  9.40s/it]

 27%|███████████▍                              | 52/192 [09:59<21:49,  9.35s/it]

 28%|███████████▌                              | 53/192 [10:09<21:43,  9.38s/it]

 28%|███████████▊                              | 54/192 [10:18<21:26,  9.32s/it]

 29%|████████████                              | 55/192 [10:27<21:17,  9.32s/it]

 29%|████████████▎                             | 56/192 [10:37<21:08,  9.33s/it]

 30%|████████████▍                             | 57/192 [10:46<20:56,  9.31s/it]

 30%|████████████▋                             | 58/192 [10:55<20:42,  9.27s/it]

 31%|████████████▉                             | 59/192 [11:04<20:32,  9.26s/it]

 31%|█████████████▏                            | 60/192 [11:14<20:26,  9.29s/it]

 32%|█████████████▎                            | 61/192 [11:23<20:20,  9.32s/it]

 32%|█████████████▌                            | 62/192 [11:33<20:10,  9.31s/it]

 33%|█████████████▊                            | 63/192 [11:42<19:58,  9.29s/it]

 33%|██████████████                            | 64/192 [11:51<19:43,  9.25s/it]

 34%|██████████████▏                           | 65/192 [12:00<19:31,  9.23s/it]

 34%|██████████████▍                           | 66/192 [12:09<19:22,  9.22s/it]

 35%|██████████████▋                           | 67/192 [12:19<19:19,  9.28s/it]

 35%|██████████████▉                           | 68/192 [12:28<19:15,  9.32s/it]

 36%|███████████████                           | 69/192 [12:37<19:05,  9.31s/it]

 36%|███████████████▎                          | 70/192 [12:47<18:53,  9.29s/it]

 37%|███████████████▌                          | 71/192 [12:56<18:41,  9.27s/it]

 38%|███████████████▊                          | 72/192 [13:05<18:30,  9.26s/it]

 38%|███████████████▉                          | 73/192 [13:14<18:22,  9.27s/it]

 39%|████████████████▏                         | 74/192 [13:24<18:11,  9.25s/it]

 39%|████████████████▍                         | 75/192 [13:33<18:01,  9.24s/it]

 40%|████████████████▋                         | 76/192 [13:42<17:55,  9.27s/it]

 40%|████████████████▊                         | 77/192 [13:51<17:41,  9.23s/it]

 41%|█████████████████                         | 78/192 [14:01<17:37,  9.28s/it]

 41%|█████████████████▎                        | 79/192 [14:10<17:27,  9.27s/it]

 42%|█████████████████▌                        | 80/192 [14:19<17:15,  9.25s/it]

 42%|█████████████████▋                        | 81/192 [14:28<17:01,  9.20s/it]

 43%|█████████████████▉                        | 82/192 [14:37<16:49,  9.18s/it]

 43%|██████████████████▏                       | 83/192 [14:47<16:43,  9.21s/it]

 44%|██████████████████▍                       | 84/192 [14:56<16:37,  9.23s/it]

 44%|██████████████████▌                       | 85/192 [15:05<16:28,  9.24s/it]

 45%|██████████████████▊                       | 86/192 [15:14<16:15,  9.21s/it]

 45%|███████████████████                       | 87/192 [15:23<16:04,  9.19s/it]

 46%|███████████████████▎                      | 88/192 [15:33<15:52,  9.16s/it]

 46%|███████████████████▍                      | 89/192 [15:42<15:43,  9.16s/it]

 47%|███████████████████▋                      | 90/192 [15:51<15:38,  9.20s/it]

 47%|███████████████████▉                      | 91/192 [16:00<15:27,  9.18s/it]

 48%|████████████████████▏                     | 92/192 [16:09<15:18,  9.18s/it]

 48%|████████████████████▎                     | 93/192 [16:18<15:06,  9.15s/it]

 49%|████████████████████▌                     | 94/192 [16:27<14:54,  9.12s/it]

 49%|████████████████████▊                     | 95/192 [16:37<14:46,  9.14s/it]

 50%|█████████████████████                     | 96/192 [16:46<14:38,  9.15s/it]

 51%|█████████████████████▏                    | 97/192 [16:55<14:31,  9.18s/it]

 51%|█████████████████████▍                    | 98/192 [17:04<14:24,  9.20s/it]

 52%|█████████████████████▋                    | 99/192 [17:14<14:16,  9.21s/it]

 52%|█████████████████████▎                   | 100/192 [17:23<14:08,  9.22s/it]

 53%|█████████████████████▌                   | 101/192 [17:32<13:54,  9.17s/it]

 53%|█████████████████████▊                   | 102/192 [17:41<13:43,  9.15s/it]

 54%|█████████████████████▉                   | 103/192 [17:50<13:35,  9.16s/it]

 54%|██████████████████████▏                  | 104/192 [17:59<13:27,  9.17s/it]

 55%|██████████████████████▍                  | 105/192 [18:08<13:17,  9.16s/it]

 55%|██████████████████████▋                  | 106/192 [18:18<13:07,  9.16s/it]

 56%|██████████████████████▊                  | 107/192 [18:27<13:00,  9.18s/it]

 56%|███████████████████████                  | 108/192 [18:36<12:50,  9.18s/it]

 57%|███████████████████████▎                 | 109/192 [18:45<12:40,  9.17s/it]

 57%|███████████████████████▍                 | 110/192 [18:55<12:37,  9.24s/it]

 58%|███████████████████████▋                 | 111/192 [19:04<12:30,  9.27s/it]

 58%|███████████████████████▉                 | 112/192 [19:13<12:19,  9.24s/it]

 59%|████████████████████████▏                | 113/192 [19:22<12:07,  9.21s/it]

 59%|████████████████████████▎                | 114/192 [19:31<11:57,  9.20s/it]

 60%|████████████████████████▌                | 115/192 [19:41<11:50,  9.23s/it]

 60%|████████████████████████▊                | 116/192 [19:50<11:41,  9.23s/it]

 61%|████████████████████████▉                | 117/192 [19:59<11:31,  9.22s/it]

 61%|█████████████████████████▏               | 118/192 [20:08<11:24,  9.25s/it]

 62%|█████████████████████████▍               | 119/192 [20:18<11:14,  9.23s/it]

 62%|█████████████████████████▋               | 120/192 [20:27<11:04,  9.23s/it]

 63%|█████████████████████████▊               | 121/192 [20:36<10:55,  9.23s/it]

 64%|██████████████████████████               | 122/192 [20:45<10:46,  9.24s/it]

 64%|██████████████████████████▎              | 123/192 [20:55<10:41,  9.30s/it]

 65%|██████████████████████████▍              | 124/192 [21:04<10:32,  9.30s/it]

 65%|██████████████████████████▋              | 125/192 [21:13<10:19,  9.25s/it]

 66%|██████████████████████████▉              | 126/192 [21:23<10:11,  9.27s/it]

 66%|███████████████████████████              | 127/192 [21:32<10:02,  9.27s/it]

 67%|███████████████████████████▎             | 128/192 [21:41<09:54,  9.28s/it]

 67%|███████████████████████████▌             | 129/192 [21:50<09:43,  9.25s/it]

 68%|███████████████████████████▊             | 130/192 [21:59<09:31,  9.21s/it]

 68%|███████████████████████████▉             | 131/192 [22:09<09:20,  9.19s/it]

 69%|████████████████████████████▏            | 132/192 [22:18<09:10,  9.18s/it]

 69%|████████████████████████████▍            | 133/192 [22:27<09:03,  9.21s/it]

 70%|████████████████████████████▌            | 134/192 [22:36<08:55,  9.23s/it]

 70%|████████████████████████████▊            | 135/192 [22:46<08:45,  9.22s/it]

 71%|█████████████████████████████            | 136/192 [22:55<08:36,  9.22s/it]

 71%|█████████████████████████████▎           | 137/192 [23:04<08:25,  9.20s/it]

 72%|█████████████████████████████▍           | 138/192 [23:13<08:16,  9.19s/it]

 72%|█████████████████████████████▋           | 139/192 [23:22<08:09,  9.23s/it]

 73%|█████████████████████████████▉           | 140/192 [23:32<08:01,  9.26s/it]

 73%|██████████████████████████████           | 141/192 [23:41<07:53,  9.29s/it]

 74%|██████████████████████████████▎          | 142/192 [23:50<07:42,  9.26s/it]

 74%|██████████████████████████████▌          | 143/192 [23:59<07:31,  9.21s/it]

 75%|██████████████████████████████▊          | 144/192 [24:08<07:20,  9.18s/it]

 76%|██████████████████████████████▉          | 145/192 [24:18<07:11,  9.17s/it]

 76%|███████████████████████████████▏         | 146/192 [24:27<07:02,  9.19s/it]

 77%|███████████████████████████████▍         | 147/192 [24:36<06:55,  9.23s/it]

 77%|███████████████████████████████▌         | 148/192 [24:45<06:46,  9.23s/it]

 78%|███████████████████████████████▊         | 149/192 [24:55<06:37,  9.24s/it]

 78%|████████████████████████████████         | 150/192 [25:04<06:26,  9.20s/it]

 79%|████████████████████████████████▏        | 151/192 [25:13<06:17,  9.21s/it]

 79%|████████████████████████████████▍        | 152/192 [25:22<06:08,  9.22s/it]

 80%|████████████████████████████████▋        | 153/192 [25:32<06:00,  9.24s/it]

 80%|████████████████████████████████▉        | 154/192 [25:41<05:51,  9.26s/it]

 81%|█████████████████████████████████        | 155/192 [25:50<05:41,  9.24s/it]

 81%|█████████████████████████████████▎       | 156/192 [25:59<05:32,  9.24s/it]

 82%|█████████████████████████████████▌       | 157/192 [26:08<05:22,  9.21s/it]

 82%|█████████████████████████████████▋       | 158/192 [26:18<05:12,  9.20s/it]

 83%|█████████████████████████████████▉       | 159/192 [26:27<05:04,  9.22s/it]

 83%|██████████████████████████████████▏      | 160/192 [26:36<04:56,  9.26s/it]

 84%|██████████████████████████████████▍      | 161/192 [26:45<04:46,  9.25s/it]

 84%|██████████████████████████████████▌      | 162/192 [26:55<04:37,  9.26s/it]

 85%|██████████████████████████████████▊      | 163/192 [27:04<04:27,  9.22s/it]

 85%|███████████████████████████████████      | 164/192 [27:13<04:16,  9.17s/it]

 86%|███████████████████████████████████▏     | 165/192 [27:22<04:07,  9.15s/it]

 86%|███████████████████████████████████▍     | 166/192 [27:31<03:58,  9.17s/it]

 87%|███████████████████████████████████▋     | 167/192 [27:41<03:50,  9.22s/it]

 88%|███████████████████████████████████▉     | 168/192 [27:50<03:42,  9.25s/it]

 88%|████████████████████████████████████     | 169/192 [27:59<03:33,  9.27s/it]

 89%|████████████████████████████████████▎    | 170/192 [28:08<03:23,  9.26s/it]

 89%|████████████████████████████████████▌    | 171/192 [28:18<03:15,  9.29s/it]

 90%|████████████████████████████████████▋    | 172/192 [28:27<03:05,  9.26s/it]

 90%|████████████████████████████████████▉    | 173/192 [28:36<02:56,  9.30s/it]

 91%|█████████████████████████████████████▏   | 174/192 [28:46<02:47,  9.32s/it]

 91%|█████████████████████████████████████▎   | 175/192 [28:55<02:38,  9.34s/it]

 92%|█████████████████████████████████████▌   | 176/192 [29:04<02:28,  9.31s/it]

 92%|█████████████████████████████████████▊   | 177/192 [29:14<02:19,  9.28s/it]

 93%|██████████████████████████████████████   | 178/192 [29:23<02:09,  9.21s/it]

 93%|██████████████████████████████████████▏  | 179/192 [29:32<02:00,  9.23s/it]

 94%|██████████████████████████████████████▍  | 180/192 [29:41<01:51,  9.26s/it]

 94%|██████████████████████████████████████▋  | 181/192 [29:51<01:42,  9.28s/it]

 95%|██████████████████████████████████████▊  | 182/192 [30:00<01:32,  9.26s/it]

 95%|███████████████████████████████████████  | 183/192 [30:09<01:23,  9.27s/it]

 96%|███████████████████████████████████████▎ | 184/192 [30:18<01:13,  9.23s/it]

 96%|███████████████████████████████████████▌ | 185/192 [30:27<01:04,  9.23s/it]

 97%|███████████████████████████████████████▋ | 186/192 [30:37<00:55,  9.29s/it]

 97%|███████████████████████████████████████▉ | 187/192 [30:46<00:46,  9.33s/it]

 98%|████████████████████████████████████████▏| 188/192 [30:56<00:37,  9.34s/it]

 98%|████████████████████████████████████████▎| 189/192 [31:05<00:27,  9.32s/it]

 99%|████████████████████████████████████████▌| 190/192 [31:14<00:18,  9.26s/it]

 99%|████████████████████████████████████████▊| 191/192 [31:23<00:09,  9.22s/it]

100%|█████████████████████████████████████████| 192/192 [31:32<00:00,  9.22s/it]

100%|█████████████████████████████████████████| 192/192 [31:32<00:00,  9.86s/it]




In [8]:
# Sum the per-discipline counts so that each (author country, removed funder)
# pair has a single total, stored in the column `filter`.
filter_df = (
    filter_df
    .groupby(['author_distinct', 'srce_cntry'])['count']
    .sum()
    .reset_index(name='filter')
)

In [9]:
# Baseline: total of the `count` column per country, read from the CSV at
# `cntry_full_path`, stored in the column `raw`.
raw_df = (
    pd.read_csv(cntry_full_path)
    .groupby('cntry')['count']
    .sum()
    .reset_index(name='raw')
)

In [10]:
# Attach each country's baseline total to its per-scenario totals.
# NOTE(review): this is an inner join, so a (country, removed funder) pair
# with no row in `filter_df` does not appear in the result at all -- confirm
# that this is acceptable.
meta = pd.merge(raw_df, filter_df, left_on='cntry', right_on='author_distinct')

# p = fraction of the baseline count that is missing from the filtered count.
meta = meta.assign(p=lambda d: (d['raw'] - d['filter']) / d['raw'])

In [11]:
# Drop the rows whose removed "funder" is the 'Not-Funded' placeholder, keep
# the reporting columns in output order, and give them their final names.
meta = (
    meta.loc[
        meta['srce_cntry'] != 'Not-Funded',
        ['srce_cntry', 'cntry', 'raw', 'filter', 'p'],
    ]
    .rename(columns={
        'srce_cntry': 'source',
        'cntry': 'target',
        'raw': 'raw_pub',
        'filter': 'filter_pub',
    })
)

In [12]:
# Write the source/target table to `profile_path` as CSV, without the index.
meta.to_csv(profile_path, index=False)