this script is for clean and compile the raw data. Raw data includes three files: pub file records the information of articles; pub_funder file includes the funder in

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [25]:
pub_path="../../data/nf_folder/Data/RawData/pubs.csv"
pub_fund_path="../../data/nf_folder/Data/RawData/pub_funder.csv"
pub_author_path="../../data/nf_folder/Data/RawData/pub_author.csv"
cntry_path="../../data/nf_folder/Data/AdditionalData/cntry_convert.xlsx"

In [2]:
clean_pub_path="../../data/nf_folder/Data/DerivedData/CleanedRawData/pub.pkl"

In [6]:
pub_df=pd.read_csv(pub_path,sep="\t")
pub_df.head()

Unnamed: 0,ID_Art,Annee_Bibliographique,Code_Discipline,Code_Document
0,60694041,2017,9,1
1,48573156,2012,89,1
2,55323671,2015,89,1
3,57633115,2016,31,1
4,64831413,2018,99,1


count how many pubs in each document type

In [8]:
pub_fund=pd.read_csv(pub_fund_path,sep="\t",
                     error_bad_lines=False, warn_bad_lines=False)
pub_fund.head()

Unnamed: 0,ID_Art,Organisation,Numero_Financement,Country
0,62152918,Natural Science Foundation of Jiangsu Province,BK20140131,PEOPLES-R-CHINA
1,62152749,Shanghai Commission of Health and Family Planning,201440538,PEOPLES-R-CHINA
2,62152799,"U.S. Department of Energy's Office of Science,...",DE-SC0007041,USA
3,49692015,FCT,SFRH/BD/47118/2008,PORTUGAL
4,49692016,Basque government,IT472-10,SPAIN


count how many fa-gn entries are with grant number

In [14]:
pub_fund[pub_fund['Numero_Financement'].notnull()].shape[0]/pub_fund.shape[0]

0.6977087938106727

In [26]:
pub_author=pd.read_csv(pub_author_path,sep="\t",
                       error_bad_lines=False, warn_bad_lines=False)
pub_author.head()

Unnamed: 0,ID_Art,Pays
0,61705542,PEOPLES-R-CHINA
1,46171057,AUSTRALIA
2,52161176,HUNGARY
3,62843155,SOUTH-KOREA
4,62124106,PORTUGAL


In [30]:
cntry_df=pd.read_excel(cntry_path)
cntry_dict=dict(zip(cntry_df['Clarivate country'],cntry_df['Country']))

change the clarivate name into the standard format

In [31]:
pub_fund=pub_fund.replace(to_replace={'Country':cntry_dict})

In [36]:
pub_author=pub_author.replace(to_replace={'Pays':cntry_dict})

consolidate three datasets into one big data sheet

In [40]:
pub_fund=pub_fund[['ID_Art','Country']]
pub_fund=pub_fund.groupby(['ID_Art'])['Country'].apply(list).reset_index()
pub_author=pub_author.groupby(['ID_Art'])['Pays'].apply(list).reset_index()

In [41]:
pub_author.head()

Unnamed: 0,ID_Art,Pays
0,30258899,"[Vietnam, United States]"
1,30258900,"[Vietnam, France]"
2,30258901,"[Romania, France]"
3,30258902,[United States]
4,30258903,[Italy]


In [43]:
pub_fund.head()

Unnamed: 0,ID_Art,Country
0,30507292,[Turkey]
1,30574690,"[United States, United States, United States, ..."
2,30574691,[Russia]
3,30585095,"[Spain, Spain, Spain, Spain, Spain, Spain]"
4,30585097,[Spain]


In [45]:
pub_df=pub_df.merge(pub_author,on='ID_Art',how='left')
pub_df=pub_df.merge(pub_fund,on='ID_Art',how='left')

In [9]:
pub_df['Country']=pub_df['Country'].fillna(value='Not-Funded')

In [10]:
pub_df.head()

Unnamed: 0,ID_Art,Annee_Bibliographique,Code_Discipline,Code_Document,Pays,Country
0,60694041,2017,9,1,"[Canada, United States]","[United States, United States]"
1,48573156,2012,89,1,[France],Not-Funded
2,55323671,2015,89,1,"[France, Chile]","[Chile, Chile, Chile, Chile, Chile]"
3,57633115,2016,31,1,"[Switzerland, United States]",[Switzerland]
4,64831413,2018,99,1,[China],"[China, China, China, China, China, China]"


In [17]:
pub_df['author_distinct']=pub_df['Pays'].apply(lambda x: list(set(x)))
pub_df['funder_distinct']=pub_df['Country'].apply(lambda x:list(set(x)) 
                                                  if x!='Not-Funded' else 'Not-Funded')

In [19]:
pub_df=pub_df.rename(mapper={'ID_Art':'id','Annee_Bibliographique':'year','Code_Discipline':'dis',
                             'Code_Document':'type','Pays':'author','Country':'funder'},axis=1)

In [28]:
pub_df.head()

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct
0,60694041,2017,9,1,"[Canada, United States]","[United States, United States]","[United States, Canada]",[United States]
1,48573156,2012,89,1,[France],Not-Funded,[France],Not-Funded
2,55323671,2015,89,1,"[France, Chile]","[Chile, Chile, Chile, Chile, Chile]","[Chile, France]",[Chile]
3,57633115,2016,31,1,"[Switzerland, United States]",[Switzerland],"[Switzerland, United States]",[Switzerland]
4,64831413,2018,99,1,[China],"[China, China, China, China, China, China]",[China],[China]


In [29]:
pub_df['funded']=pub_df['funder'].apply(lambda x:1 if x!='Not-Funded' else 0)
pub_df['IntCol']=pub_df['author_distinct'].apply(lambda x:1 if len(x)>1 else 0)
pub_df.head()

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol
0,60694041,2017,9,1,"[Canada, United States]","[United States, United States]","[United States, Canada]",[United States],1,1
1,48573156,2012,89,1,[France],Not-Funded,[France],Not-Funded,0,0
2,55323671,2015,89,1,"[France, Chile]","[Chile, Chile, Chile, Chile, Chile]","[Chile, France]",[Chile],1,1
3,57633115,2016,31,1,"[Switzerland, United States]",[Switzerland],"[Switzerland, United States]",[Switzerland],1,1
4,64831413,2018,99,1,[China],"[China, China, China, China, China, China]",[China],[China],1,0


In [5]:
def cofund(funders):
    label=1
    if funders=='Not-Funded':
        label=0
    elif 'EU' not in funders and len(funders)==1:
        label=0
    return label
pub_df['cofund']=pub_df['funder_distinct'].apply(lambda x: cofund(x))

In [8]:
pub_df.to_pickle(clean_pub_path)