## this script use fixed-effect model to estimate the importance of different funding sources

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
import tqdm

In [2]:
pub_path = "../../data/NationalFunding/Data/DerivedData/CleanedRawData/pub.pkl"

In [3]:
df = pd.read_pickle(pub_path)

In [4]:
df.sample(2)

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol,cofund,intfund
2334018,51212588,2013,82,1,[Iran],Not-Funded,[Iran],Not-Funded,0,0,0,0
6821579,47688318,2011,8,1,[Greece],Not-Funded,[Greece],Not-Funded,0,0,0,0


In [5]:
def get_pubcnt_cntry_year(df):
    df = df[['id','year','author_distinct']].explode('author_distinct')
    df = df.groupby(['author_distinct','year']).size().reset_index(name='pubcnt')
    df = df.rename(columns={'author_distinct':'cntry'})
    return df

In [6]:
def get_fundcnt_from_cntrys(df):
    df = df[df.funder!='Not-Funded']
    pub2cntry = df[['id','year','author_distinct','funder']].explode('author_distinct') #assign publications into author countries
    fund2cntry = pub2cntry.explode('funder').groupby(['author_distinct','year','funder']).size().reset_index(name='fundcnt')
    fund2cntry = fund2cntry.rename(columns={'author_distinct':'cntry'})
    return fund2cntry

In [7]:
def map_funder_to_new_name(fund2cntry, topcntrys):
    topcntrys_dict = dict(zip(topcntrys, topcntrys))
    #replace the non-top funders to "others"
    df = fund2cntry.copy()
    df['region'] = df['funder'].copy()
    df['region'] = df['region'].map(topcntrys_dict)
    df['region'] = df['region'].fillna('others')
    df.loc[df['cntry']==df['funder'],'region']='domestic' #assign "domestic"
    return df

In [8]:
def group_fund_by_region(fund2cntry):
    #group the amount of fund by region
    fund_by_region = fund2cntry.groupby(['cntry','year','region'])['fundcnt'].sum().reset_index() 
    fund_by_region = fund_by_region.pivot_table(
        index=['cntry','year'], columns=['region'],values='fundcnt',fill_value=0)
    fund_by_region['foreign'] = fund_by_region.loc[:, fund_by_region.columns != 'domestic'].sum(axis=1)
    #for those regions that 
    fund_by_region = np.log(fund_by_region+1).reset_index()
    return fund_by_region

In [9]:
def add_previous_pub(pubcnt):
    df_curr = pubcnt[pubcnt.year>2009]
    df_curr = df_curr.rename(columns={'year':'year_curr','pubcnt':'pubcnt_curr'})
    df_prev = pubcnt[pubcnt.year<2018]
    df_prev = df_prev.rename(columns={'year':'year_prev','pubcnt':'pubcnt_prev'})
    df_curr['year_prev'] = df_curr['year_curr']-1
    df = df_curr.merge(df_prev, on=['cntry','year_prev'])
    return df

#### get the amount of grants from each top countries

In [10]:
fund_by_cntry = get_fundcnt_from_cntrys(df)
topcntrys=['United States','China','EU','United Kingdom','France','Germany']
fund_by_cntry_region = map_funder_to_new_name(fund_by_cntry, topcntrys)
fund_by_region = group_fund_by_region(fund_by_cntry_region)

In [11]:
test = fund_by_cntry_region.copy()

In [31]:
test[(test.cntry=='Australia')&(test.year==2009)].sort_values(by='fundcnt',ascending=False)

21233

In [13]:
df.head(2)

Unnamed: 0,id,year,dis,type,author,funder,author_distinct,funder_distinct,funded,IntCol,cofund,intfund
0,60694041,2017,9,1,"[Canada, United States]","[United States, United States]","[Canada, United States]",[United States],1,1,0,0
1,48573156,2012,89,1,[France],Not-Funded,[France],Not-Funded,0,0,0,0


In [33]:
tt1 = df[(df.year==2009)&(df.funder!='Not-Funded')].explode("author_distinct")

In [37]:
t['t'] = t.apply(lambda x:1 if len(x.funder_distinct)==1 and x.funder_distinct[0]=='Australia' else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['t'] = t.apply(lambda x:1 if len(x.funder_distinct)==1 and x.funder_distinct[0]=='Australia' else 0, axis=1)


In [42]:
t[t.t==0].explode('funder_distinct').groupby('funder_distinct').size().sort_values(ascending=False)

funder_distinct
Australia         3173
United States     2900
China             1164
United Kingdom    1152
Germany            531
                  ... 
Brunei               1
Bulgaria             1
Nigeria              1
Iraq                 1
Zimbabwe             1
Length: 89, dtype: int64

In [40]:
8270/15956

0.5183003258962146

#### get the number of publications authored by countries between two consecutive years

In [49]:
pubcnt = get_pubcnt_cntry_year(df)
pubcnt_new = add_previous_pub(pubcnt)

In [50]:
def calculate_growth_rate_take_log(df):
    df['growth_rate']=(df['pubcnt_curr']/df['pubcnt_prev'])-1
    #df['growth_rate']=df['growth_rate']
    df['pubcnt_curr']=np.log(df['pubcnt_curr'])
    df['pubcnt_prev']=np.log(df['pubcnt_prev'])
    return df

In [51]:
pubcnt_new=calculate_growth_rate_take_log(pubcnt_new)

In [52]:
pubcnt_new.head(2)

Unnamed: 0,cntry,year_curr,pubcnt_curr,year_prev,pubcnt_prev,growth_rate
0,Afghanistan,2010,3.496508,2009,2.833213,0.941176
1,Afghanistan,2011,3.332205,2010,3.496508,-0.151515


In [53]:
reg_df = pubcnt_new.merge(fund_by_region, left_on=['cntry','year_curr'], right_on=['cntry','year']
                         ).drop(columns=['year'])

In [56]:
reg_df.to_csv("regression_data.csv", index=False)

In [25]:
reg_df = pd.read_csv("regression_data.csv")

In [26]:
reg_df['test'] = reg_df.apply(lambda x: 1 if x.foreign>x.domestic else 0, axis=1)

In [30]:
reg_df[reg_df.test==1].cntry.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antarctica', 'Antigua & Barbuda', 'Argentina', 'Armenia',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belgium', 'Belize', 'Benin', 'Bermuda',
       'Bhutan', 'Bolivia', 'Bosnia & Herzegovina', 'Botswana', 'Brunei',
       'Bulgaria', 'Burkina Faso', 'Burma', 'Burundi', 'Byelarus',
       'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Republic', 'Chad', 'Chile', 'Colombia', 'Comoros',
       'Congo', 'Cook Islands', 'Costa Rica', 'Crimea', 'Croatia', 'Cuba',
       'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica ',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia',
       'Federated States of Micronesia ', 'Fiji', 'Finland', 'France',
       'French-Guiana', 'French-Polynesia', 'Gabon', 'Gambia', 'Georgia',
       'Germany', 'Ghana ', 'Greece', 'G

In [5]:
import pandas as pd

# Assuming df is your original DataFrame
# For demonstration purposes, let's create a sample DataFrame
data = {'country': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'],
        'year': [2010, 2011, 2012, 2013, 2014, 2010, 2011, 2012, 2013, 2014],
        'value': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55]}

df = pd.DataFrame(data)

# Sort the original DataFrame by 'country' and 'year'
df_sorted = df.sort_values(by=['country', 'year'])

# Create a new DataFrame with columns 'country', 'current_year_value', 'next_year_value'
df_next_year = pd.DataFrame(columns=['country', 'current_year_value', 'next_year_value'])

# Iterate through unique countries
for country in df_sorted['country'].unique():
    # Filter rows for the current country
    country_data = df_sorted[df_sorted['country'] == country]
    
    # Create a new column 'next_year_value' by shifting the 'value' column
    country_data['next_year_value'] = country_data['value'].shift(-1)
    
    # Keep only the relevant columns
    country_data = country_data[['country', 'value', 'next_year_value']].rename(columns={'value': 'current_year_value'})
    
    # Append the data for the current country to the new DataFrame
    df_next_year = pd.concat([df_next_year, country_data], ignore_index=True)

# Drop rows with NaN values (last year for each country)
df_next_year = df_next_year.dropna()

# Reset index
df_next_year.reset_index(drop=True, inplace=True)

# Display the resulting DataFrame
print(df_next_year)


  country current_year_value  next_year_value
0       A                 10             15.0
1       A                 15             20.0
2       A                 20             25.0
3       A                 25             30.0
4       B                 35             40.0
5       B                 40             45.0
6       B                 45             50.0
7       B                 50             55.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_data['next_year_value'] = country_data['value'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_data['next_year_value'] = country_data['value'].shift(-1)


In [10]:
country_data

Unnamed: 0,country,current_year_value,next_year_value
5,B,35,40.0
6,B,40,45.0
7,B,45,50.0
8,B,50,55.0
9,B,55,


In [12]:
df_sorted[df_sorted['country'] == 'A']['value']

0    10
1    15
2    20
3    25
4    30
Name: value, dtype: int64

In [13]:
df_sorted[df_sorted['country'] == 'A']['value'].shift(-1)

0    15.0
1    20.0
2    25.0
3    30.0
4     NaN
Name: value, dtype: float64