In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats

In [28]:
def estimate_ci_bootstrapping(negative, positive, num_iterations=1000, z=1.96, seed=None):
    """Bootstrap the negative rate and a normal-approximation confidence interval.

    The sample is modelled as ``negative`` ones followed by ``positive`` zeros,
    so the mean of any resample is the share of negative cases in it.

    Parameters
    ----------
    negative : int
        Number of cases counted as negative (encoded as 1).
    positive : int
        Number of cases counted as positive (encoded as 0).
    num_iterations : int, default 1000
        Number of bootstrap resamples; each has the size of the original sample.
    z : float, default 1.96
        Multiplier applied to the bootstrap standard error to get the interval
        half-width.
    seed : int or None, default None
        When given, resampling uses a private ``np.random.RandomState(seed)``;
        otherwise the global ``np.random`` state is used, as before.

    Returns
    -------
    tuple
        ``(mean, lower, upper)`` of the bootstrapped negative rate.

    Raises
    ------
    ValueError
        If either count is negative, or both counts are zero.
    """
    # Counts are used as array lengths, so they must be plain integers.
    negative, positive = int(negative), int(positive)
    if negative < 0 or positive < 0 or negative + positive == 0:
        raise ValueError("negative and positive must be non-negative counts with a non-zero total")

    rng = np.random if seed is None else np.random.RandomState(seed)
    data = np.concatenate([np.ones(negative), np.zeros(positive)])
    bootstrap_samples = rng.choice(a=data, size=(num_iterations, len(data)), replace=True)
    negative_rate = np.mean(bootstrap_samples, axis=1)
    mean_of_negative_rate = np.mean(negative_rate)
    # The spread of the bootstrap distribution is itself the standard error of
    # the rate. Dividing it again by sqrt(num_iterations) (what stats.sem does)
    # would make the interval shrink just by drawing more resamples.
    se_of_negative_rate = np.std(negative_rate, ddof=1)
    margin = z * se_of_negative_rate
    return mean_of_negative_rate, mean_of_negative_rate - margin, mean_of_negative_rate + margin

#### Get the first part of the sampling results by extracting the US and China cases from the sample of the whole database

In [2]:
# Input files for the whole-database sample check: the results workbook and
# the publication listing.
result_path, pub_path = (
    "../../data/NationalFunding/Data/RobustCheck/check_09_and_18/funding_year_check_result.xlsx",
    "../../data/NationalFunding/Data/RobustCheck/check_09_and_18/funding_year_check.txt",
)

In [6]:
# Load the check-results workbook and preview its first two rows.
result_df = pd.read_excel(io=result_path)
result_df.iloc[:2]

Unnamed: 0,ost_bk,ut,year,title,label
0,65041047,WOS:000265236000001,2009,Symbolic Computations and Exact and Explicit S...,0.0
1,32590270,WOS:000266300300033,2009,Experiencing the pediatric intensive care unit...,0.0


In [7]:
# Read the pipe-delimited publication listing; malformed lines are skipped
# instead of raising, so the frame may hold fewer rows than the file.
pub_df = pd.read_csv(
    filepath_or_buffer=pub_path,
    sep="|",
    on_bad_lines="skip",
)
pub_df.iloc[:2]

Unnamed: 0,OST_BK,UID,Annee_Bibliographique,Titre,Country
0,65041047,WOS:000265236000001,2009,Symbolic Computations and Exact and Explicit S...,Turkey
1,65041047,WOS:000265236000001,2009,Symbolic Computations and Exact and Explicit S...,Turkey


In [8]:
# Give the publication listing the column names used by result_df so the two
# frames can be merged on shared keys; the country column becomes 'cntry'.
column_aliases = {
    'OST_BK': 'ost_bk',
    'UID': 'ut',
    'Annee_Bibliographique': 'year',
    'Titre': 'title',
    'Country': 'cntry',
}
pub_df = pub_df.rename(columns=column_aliases)
pub_df.sample(n=1)

Unnamed: 0,ost_bk,ut,year,title,cntry
1123736,84565005,WOS:000437669400013,2018,Fine versus coarse atrial fibrillation in rheu...,Iran


In [22]:
# Attach country rows to every labelled publication, then keep the US / China
# rows whose label is below 2 (missing labels fail the comparison and drop out).
df = pd.merge(result_df, pub_df, on=['ost_bk','ut','year','title'])
is_us_or_china = df['cntry'].isin(['USA','Peoples R China'])
has_usable_label = df['label'].lt(2)

# De-duplicate the matching rows, count them per (cntry, year, label), and
# switch to the short lowercase country codes.
us_chn_part1 = (
    df.loc[is_us_or_china & has_usable_label]
    .drop_duplicates()
    .groupby(['cntry','year','label'])
    .size()
    .reset_index(name='count')
    .replace(to_replace={'cntry':{'USA':'usa','Peoples R China':'china'}})
)
us_chn_part1.sample(n=1)

Unnamed: 0,cntry,year,label,count
5,usa,2009,1.0,14


In [21]:
# Show every (cntry, year, label) count obtained from the whole-database sample.
us_chn_part1

Unnamed: 0,cntry,year,label,count
0,china,2009,0.0,9
1,china,2009,1.0,4
2,china,2018,0.0,17
3,china,2018,1.0,3
4,usa,2009,0.0,59
5,usa,2009,1.0,23
6,usa,2018,0.0,84
7,usa,2018,1.0,8


#### Get the second part of the sampling results, which comes from sampling each country individually

In [11]:
# Path to the CSV with the per-country sample (the second part of the check).
sample_path = "../../data/NationalFunding/Data/RobustCheck/check_09_and_18/us_china_2009_2018.csv"

In [12]:
# Load the per-country sample and show two random rows as a sanity check.
sample_df = pd.read_csv(filepath_or_buffer=sample_path)
sample_df.sample(n=2)

Unnamed: 0,ost_bk,uid,year,title,country,label
105,69067532,WOS:000269175500013,2009,Potential for Chemical Mixture Exposures and H...,USA,1.0
394,51637609,WOS:000439792800008,2018,Acid suppression medications reduce risk of oe...,USA,


In [13]:
# Light cleaning: keep only rows whose label is below 2 (rows with a missing
# label fail the comparison and are dropped), and name the country column
# 'cntry' like the first part.
usable_label = sample_df['label'].lt(2)
sample_df = sample_df.loc[usable_label].rename(columns={'country':'cntry'})

In [16]:
# Count the per-country sample by (cntry, year, label) and switch to the short
# lowercase country codes.
us_chn_part2 = (
    sample_df.groupby(['cntry','year','label'])
    .size()
    .reset_index(name='count')
    .replace(to_replace={'cntry':{'USA':'usa','Peoples R China':'china'}})
)
us_chn_part2.iloc[:2]


Unnamed: 0,cntry,year,label,count
0,china,2009,0.0,53
1,china,2009,1.0,23


In [23]:
# Combine the counts of both samples per (cntry, year, label).
# An outer merge keeps a cell that appears in only one of the two parts; the
# default inner merge would silently drop it and lose that part's count.
count_keys = ['cntry','year','label']
merged_counts = us_chn_part1.merge(us_chn_part2, on=count_keys, how='outer')

# A cell absent from one part contributes zero cases from that part.
merged_counts['total'] = (
    merged_counts[['count_x','count_y']].fillna(0).sum(axis=1).astype(int)
)

# Label 0 is reported as 'positive' and label 1 as 'negative'. Pivot to one row
# per (cntry, year) with one column per label; a label never observed for a
# country/year is a count of 0, not NaN, so the totals stay integer.
us_chn = (
    merged_counts
    .replace(to_replace={'label':{0:'positive',1:'negative'}})
    .pivot(index=['cntry','year'], columns='label', values='total')
    .fillna(0)
    .astype(int)
    .reset_index()
)
us_chn.head()

label,cntry,year,negative,positive
0,china,2009,25,59
1,china,2018,21,82
2,usa,2009,36,101
3,usa,2018,10,84


In [29]:
# Seed NumPy's global RNG so the bootstrap draws, and the CSV written from
# them, are identical on every Restart & Run All.
BOOTSTRAP_SEED = 42
np.random.seed(BOOTSTRAP_SEED)

# One bootstrap estimate per (cntry, year) row of the pivoted counts.
ci_rows = []
for row in us_chn.itertuples(index=False):
    # The counts become array lengths inside the estimator, so pass plain ints.
    mean, lower, upper = estimate_ci_bootstrapping(int(row.negative), int(row.positive))
    ci_rows.append([row.cntry, row.year, mean, lower, upper])
res_df = pd.DataFrame(ci_rows, columns=['cntry','year','mean','lower','upper'])

In [30]:
# Save the per-country, per-year estimates (without the index) alongside the
# input files.
ci_output_path = "../../data/NationalFunding/Data/RobustCheck/check_09_and_18/ci_us_chn.csv"
res_df.to_csv(path_or_buf=ci_output_path, index=False)