<b>This notebook contains the code used for merging following files:<b>
    <ul>
        <li><i>VParty_VDem_WB_basic_indicators.csv</i></li>
    </ul>

In [1]:
# importing modules

import pandas as pd
import numpy as np

In [4]:
# READING IN DATASETS

In [85]:
''' Smaller datasets were uploaded on the relevant github page (link: https://github.com/MateuszMarcol/PopulismDatasets),
the bigger datasets need to be downloaded by the user who is running this code. I will explicitly specify where a download is necessary.'''

# Reading in V-Party dataset (filename: V-Dem-CPD-Party-V2.csv)
link_VParty = 'https://raw.githubusercontent.com/MateuszMarcol/PopulismDatasets/main/datasets_raw/V-Dem-CPD-Party-V2.csv'
dataset_VParty = pd.read_csv(link_VParty)

In [86]:
# Reading in V-Dem dataset
'''Since V-Dem is a huge dataset (and I use the Full version) it needs to be downloaded directly. A variable "VDem_path"
should be respecified by the user.'''

VDem_path = '/Users/mateuszmarcol/Datasets/Country_Year_V-Dem_Full+others_CSV_v12/V-Dem-CY-Full+Others-v12.csv'
dataset_VDem = pd.read_csv(VDem_path) # !DOWNLOAD NEEDED

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [87]:
# Reading in World Bank datasets

WB_indicators_link = "https://raw.githubusercontent.com/MateuszMarcol/PopulismDatasets/main/datasets_raw/WB_basic_indicators.csv"
dataset_WB_indicators = pd.read_csv(WB_indicators_link)

<H1>MERGING DATASETS</H1>

In [8]:
# ADDING WORLD BANK INDICATORS TO V-DEM DATASET

In [88]:
# list of all WB indicators
WB_basic_indicators_list = list(set(dataset_WB_indicators["Series Name"]))
# dropping the redundant variables from WB indicators list
WB_basic_indicators_list.remove('Last Updated: 02/15/2022') # deleting the non-indicator variable
del WB_basic_indicators_list[0] # deleting first nan element

In [89]:
WB_basic_indicators_list

['Births attended by skilled health staff (% of total)',
 'GNI per capita, Atlas method (current US$)',
 'Population, total',
 'High-technology exports (% of manufactured exports)',
 'Life expectancy at birth, total (years)',
 'Poverty headcount ratio at national poverty lines (% of population)',
 'External debt stocks, total (DOD, current US$)',
 'Contraceptive prevalence, any method (% of married women ages 15-49)',
 'Prevalence of HIV, total (% of population ages 15-49)',
 'Net migration',
 'GDP (current US$)',
 'Forest area (sq. km)',
 'Revenue, excluding grants (% of GDP)',
 'GDP per capita (current US$)',
 'Personal remittances, paid (current US$)',
 'Immunization, measles (% of children ages 12-23 months)',
 'Income share held by lowest 20%',
 'CO2 emissions (metric tons per capita)',
 'Market capitalization of listed domestic companies (% of GDP)',
 'Net ODA received per capita (current US$)',
 'Foreign direct investment, net (BoP, current US$)',
 'Agriculture, forestry, and fi

In [90]:
# DEFINING FUNCTIONS FOR WB DATA MANIPULATION

# this function returns a simple dataframe (based on WB dataset) with 
# following columns: [country 3-digit code, year, variable of interest from WB]
# the variable name needs to be specified
def WB_dataframe(variable_name): 
    # years are the columns in WB dataset, so firsty these columns need to be rearranged to variables
    dataset_modified = dataset_WB_indicators[dataset_WB_indicators["Series Name"]==variable_name].melt(id_vars=['Country Code'], value_vars=['1971 [YR1971]', '1972 [YR1972]', '1973 [YR1973]', '1974 [YR1974]',
       '1975 [YR1975]', '1976 [YR1976]', '1977 [YR1977]', '1978 [YR1978]',
       '1979 [YR1979]', '1980 [YR1980]', '1981 [YR1981]', '1982 [YR1982]',
       '1983 [YR1983]', '1984 [YR1984]', '1985 [YR1985]', '1986 [YR1986]',
       '1987 [YR1987]', '1988 [YR1988]', '1989 [YR1989]', '1990 [YR1990]',
       '1991 [YR1991]', '1992 [YR1992]', '1993 [YR1993]', '1994 [YR1994]',
       '1995 [YR1995]', '1996 [YR1996]', '1997 [YR1997]', '1998 [YR1998]',
       '1999 [YR1999]', '2000 [YR2000]', '2001 [YR2001]', '2002 [YR2002]',
       '2003 [YR2003]', '2004 [YR2004]', '2005 [YR2005]', '2006 [YR2006]',
       '2007 [YR2007]', '2008 [YR2008]', '2009 [YR2009]', '2010 [YR2010]',
       '2011 [YR2011]', '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]',
       '2015 [YR2015]', '2016 [YR2016]', '2017 [YR2017]', '2018 [YR2018]',
       '2019 [YR2019]', '2020 [YR2020]']) # melting the columns into a "year" variable
    
    dataset_modified["variable"] = dataset_modified["variable"].apply(lambda x: int(str(x[0:4]))) # simplifying the notation for a year 
    
    dataset_modified.rename(columns={"variable":"year", "value":variable_name, "Country Code": "country_text_id"}, inplace=True) 
    # renaming variables to V-Dem-consistent notation ("year", "country_text_id")

    dataset_modified[variable_name] = pd.to_numeric(dataset_modified[variable_name], errors="coerce")
    # changing the value type to numeric type
    
    return dataset_modified

# this function creates a dataset based on the list of WB indicators
def complete_WB_dataframe(list_of_indicators):
    dataset = WB_dataframe(list_of_indicators[0]) # setting up the base frame with the first WB variable
    for i in list_of_indicators[1:]: # adding another variables from the list (skipping th first one)
        if i not in list(dataset.columns):
            dataset = pd.merge(dataset, WB_dataframe(i), on=["country_text_id", "year"], how="outer") 
            # the outer option does not allow for observations to be dropped
    return dataset

In [91]:
# calling out the function with list that contains all variables as a function argument
aggregated_WB_indicators = complete_WB_dataframe(WB_basic_indicators_list)

# the obtained dataset contains all WB indicators 
# this dataset is V-Dem friendly, therefore merging will be easier and more effective

In [92]:
# MERGING WORLD BANK INDICATORS WITH V-DEM

In [93]:
# merging the aggregated_WB_indicators dataframe with V-Dem
dataset_VDem_WB_all_indicators = pd.merge(dataset_VDem, aggregated_WB_indicators, on=["country_text_id", "year"], how="left")

In [94]:
# MERGING V-PARTY WITH WORLD BANK DATA & V-DEM

In [95]:
'''In V-Party an observation is a political party in a given year. This dataset also contains populism variables. On the other hand,
The V-Dem_WB_all_indicators dataset (that was created above) contains country- and year-specific variables. The way I merge both datasets
is following:
    - I treat V-Party as a base (as it contains populism variables).
    - To all parties in a given year I add a country-year-specific observation from V-Dem & World Bank.
    - If more parties were examined in the same year (which is often the case), then they will have assigned the same 
      V-Dem & World Bank extension.
    Disclaimer: All V-Dem & World Bank observations that do not correspond to countries and years covered in V-Party are dropped'''

# merging V-Party and V-Dem & World Bank
dataset_VDem_VParty_WB_basic_indicators = pd.merge(dataset_VParty, dataset_VDem_WB_all_indicators, on=["country_text_id", "year"], how="left")

'''Disclaimer: variables that have the same name in both datasets (like 'country_name') will have a suffix added - "_x" for V-Party
and "_y" for V-Dem variables.'''

'Disclaimer: variables that have the same name in both datasets (like \'country_name\') will have a suffix added - "_x" for V-Party\nand "_y" for V-Dem variables.'

In [98]:
# this saved dataset is saved on the user's driver, therefore a path specification is needed

user_path = "/Users/mateuszmarcol/Populism/PopulismDatasets/" # the user's path to respecify
dataset_VDem_VParty_WB_basic_indicators.to_csv(user_path + "VDem_VParty_WB_basic_indicators.csv")