# Table of contents
1. [Working directory, packages and data](#chapter1)
2. [Data cleaning](#chapter2)
3. [Wrapper function](#chapter3)
4. [Rooduijn & Pauwels](#chapter4)
   1. [Construct validity](#subparagraph1)
   2. [Face validity](#subparagraph2)
   3. [External validity](#subparagraph3)
       1. [CHES](#subparagraph4)
       2. [PopuList](#subparagraph5)
5. [Decadri & Boussalis](#chapter5)
   1. [Construct validity](#subparagraph6)
   2. [Face validity](#subparagraph7)
   3. [External validity](#subparagraph8)
       1. [CHES](#subparagraph9)
       2. [PopuList](#subparagraph10)

# Working directory, packages and data <a class="anchor" id="chapter1"></a>

Setting the working directory

In [1]:
# NOTE(review): machine-specific absolute path. The relative file names used in the
# cells below (e.g. "parliamentary_groups2.rds", "df_rp.RData") are resolved against it.
setwd('C:/Users/jacop/OneDrive/Desktop/Tesi/data/')

Loading the libraries

In [2]:
suppressWarnings(suppressPackageStartupMessages(library(dtplyr)))
suppressWarnings(suppressPackageStartupMessages(library(tidyverse)))
suppressWarnings(suppressPackageStartupMessages(library(data.table)))
suppressWarnings(suppressPackageStartupMessages(library(quanteda)))

Loading the data

In [3]:
# NOTE(review): despite the ".rds" extension the file is read with load(), i.e. as a
# saved workspace image rather than with readRDS(). The next cell uses an object
# called `Texts`, which is presumably restored here -- confirm.
load("parliamentary_groups2.rds")

Creating a lazy data.table out of our dataframe so that we can use dtplyr on it

In [4]:
# Wrapping the data frame in a lazy data.table so that dplyr verbs are translated by dtplyr
texts <- Texts %>%
  lazy_dt()

# Data cleaning <a class="anchor" id="chapter2"></a>

Casting the "legislatura" variable as an integer

In [5]:
# Converting `legislatura` to integer and materialising the lazy data.table as a tibble
texts <- texts %>%
  mutate(legislatura = as.integer(legislatura)) %>%
  as_tibble()

Filtering the dataset by focusing on the last seven legislatures

In [6]:
# Keeping only the speeches from legislature 12 onwards
texts <- texts %>%
  filter(legislatura >= 12) %>%
  as_tibble()

# Wrapper function <a class="anchor" id="chapter3"></a>

We'll run the dictionary analyses with a wrapper function. We'll use the "dictionary" argument to specify which dictionary we'll use each time. In particular:

- no stemming will be applied;
- all stopwords will be removed (both quanteda's built-in Italian list and the additional ones provided by Decadri & Boussalis);
- all rows from the dataset will be included.

In [93]:
#' Run a dictionary-based populism analysis on a collection of texts.
#'
#' Builds a quanteda corpus from `data`, tokenizes it, removes stopwords,
#' counts the matches of the selected dictionary in every document and
#' expresses them as a share of the cleaned tokens.
#'
#' @param data A data frame with the text in a column called "textclean" and
#'   the document variables `group_cluster2`, `gruppoP` and `year`.
#' @param dictionary Either "Decadri_Boussalis" (anti-elitism and
#'   people-centrism) or "Rooduijn_Pauwels" (anti-elitism only).
#' @param stopwords_file Path to a CSV file with a `stopwords` column holding
#'   the additional stopwords. Defaults to the file name previously hard-coded
#'   in the function.
#' @return A data frame with one row per document: `doc_id`, `year`, `party`,
#'   `group_cluster`, the dictionary counts (`anti_elitism`, plus
#'   `people_centrism` and `populist_toks` for "Decadri_Boussalis"),
#'   `n_of_cleaned_toks`, `perc_of_populist_toks` and
#'   `standardized_perc_of_populist_toks`.
dict_analysis <- function(data, 
                          dictionary,
                          stopwords_file = "it_stopwords_new_list.csv") {
    
    # Validating the dictionary name before any expensive work is done.
    # (The previous implementation called `break` outside of a loop, which only
    # aborted through an unrelated "no loop for break/next" error, and did so
    # after the whole corpus had already been tokenized.)
    
    valid_dictionaries <- c("Decadri_Boussalis", "Rooduijn_Pauwels")
    
    if (!is.character(dictionary) || length(dictionary) != 1 || is.na(dictionary) ||
        !dictionary %in% valid_dictionaries) {
        stop("Invalid dictionary selected: `dictionary` must be one of ",
             paste0('"', valid_dictionaries, '"', collapse = ", "),
             call. = FALSE)
    }
    
    # Creating the corpus
    
    my_corpus <- corpus(data, 
                        text_field = "textclean")
    
    # Tokenizing the corpus
    
    toks <- tokens(my_corpus, 
                   remove_punct = TRUE, 
                   remove_symbols = TRUE, 
                   remove_numbers = TRUE, 
                   remove_separators = TRUE)
    
    # Removing the Quanteda-provided stopwords
    
    cleaned_toks <- tokens_remove(toks, pattern = stopwords("it"))
    
    # Removing the additional stopwords provided by Decadri and Boussalis
    
    decadri_boussalis_additional_stopwords <- suppressMessages(read_csv(stopwords_file)) %>% 
                                              pull(stopwords)
    
    cleaned_toks <- tokens_remove(cleaned_toks, pattern = decadri_boussalis_additional_stopwords)
    
    # Building the dfm
    
    my_dfm <- dfm(cleaned_toks)
    
    # Building the selected dictionary
    
    if (dictionary == "Rooduijn_Pauwels") {
        
        # Rooduijn & Pauwels: anti-elitism only
        
        anti_elitism <- c("elit*", "consens*", "antidemocratic*", "referend*", "corrot*", "propagand*", "politici*",
                          "ingann*", "tradi*", "vergogn*", "scandal*", "verita", "disonest*", "partitocrazia",
                          "menzogn*", "mentir*")
        
        selected_dictionary <- dictionary(list(anti_elitism = anti_elitism))
        
    } else {
        
        # Decadri & Boussalis: anti-elitism and people-centrism
        
        anti_elitism <- c("antidemocratic*", "casta", "consens*", "corrot*", "disonest*", "elit*", "establishment", "ingann*", 
                          "mentir*", "menzogn*", "partitocrazia", "propagand*", "scandal*", "tradim*", "tradir*", "tradit*", 
                          "vergogn*", "verita")
        
        people_centrism <- c("abitant*", "cittadin*", "consumator*", "contribuent*", "elettor*", "gente", "popol*")
        
        selected_dictionary <- dictionary(list(anti_elitism = anti_elitism, 
                                               people_centrism = people_centrism))
    }
    
    # Applying the dictionary to the dfm
    
    my_dict_lookup <- convert(dfm_lookup(my_dfm, dictionary = selected_dictionary), 
                              to = "data.frame")
    
    # Storing the total number of cleaned tokens of each document inside a tibble
    # along with the document's id, group cluster, party and year
    
    doc_vars <- docvars(my_corpus)
    
    tbl <- tibble(doc_id = names(cleaned_toks), 
                  n_of_cleaned_toks = ntoken(cleaned_toks), 
                  group_cluster = pull(doc_vars, group_cluster2),
                  party = pull(doc_vars, gruppoP),
                  year = pull(doc_vars, year))
    
    # Joining the token totals to the dictionary lookup
    
    my_dict_lookup <- my_dict_lookup %>% 
                      left_join(tbl, by = "doc_id")
    
    # Computing the proportion of populist tokens over all cleaned tokens
    
    if (dictionary == "Decadri_Boussalis") {
        
        # Summing the anti-elitism and the people-centrism dimensions
        
        my_dict_lookup <- my_dict_lookup %>% 
                          mutate(populist_toks = anti_elitism + people_centrism,
                                 perc_of_populist_toks = populist_toks / n_of_cleaned_toks)
        
    } else {
        
        my_dict_lookup <- my_dict_lookup %>% 
                          mutate(perc_of_populist_toks = anti_elitism / n_of_cleaned_toks)
    }
    
    # The standardization is computed BEFORE the missing proportions are filled
    # with zeroes, so rows whose proportion is missing keep a missing
    # standardized value while their raw proportion becomes 0.
    
    my_dict_lookup %>% 
        mutate(standardized_perc_of_populist_toks = as.double(scale(perc_of_populist_toks)), # Standardizing the %
               perc_of_populist_toks = replace_na(perc_of_populist_toks, 0)) %>% # Filling NAs with zeroes
        relocate(doc_id, year, party, group_cluster, # Reordering the columns
                 any_of(c("anti_elitism", "people_centrism", "populist_toks")),
                 n_of_cleaned_toks, perc_of_populist_toks, standardized_perc_of_populist_toks)
}

# Rooduijn & Pauwels <a class="anchor" id="chapter4"></a>

Let's run the dictionary analysis using Rooduijn and Pauwels' dictionary

In [8]:
# Scoring every speech with the anti-elitism dictionary of Rooduijn & Pauwels
df_rp <- texts %>%
  dict_analysis(dictionary = "Rooduijn_Pauwels")

Let's save the results as an .Rdata file

In [10]:
# Saved relative to the working directory set at the top of the notebook, so the
# path is not hard-coded twice and matches the later load("df_rp.RData") call.
save(df_rp, file = "df_rp.RData")

The first rows of the dataframe

In [9]:
# Previewing the first six rows of the results
df_rp %>%
  head(n = 6)

Unnamed: 0_level_0,doc_id,year,party,group_cluster,anti_elitism,n_of_cleaned_toks,perc_of_populist_toks,standardized_perc_of_populist_toks
Unnamed: 0_level_1,<chr>,<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<dbl>
1,text1,1994,F-ITA,Conservative,0,9,0,-0.1631127
2,text2,1994,PROGR-F,CommunistSocialDemocratic,0,4,0,-0.1631127
3,text3,1994,LEGA-N,EthnicRegional,0,26,0,-0.1631127
4,text4,1994,RC-PROGR,Agrarian,0,4,0,-0.1631127
5,text5,1994,PROGR-F,CommunistSocialDemocratic,0,4,0,-0.1631127
6,text6,1994,PPI,Christian Democratic,0,3,0,-0.1631127


## Construct validity <a class="anchor" id="subparagraph1"></a>

Rooduijn and Pauwels' dictionary captures the "anti-elitism" component of populism, but not the "people-centrism" one. As a result, from a construct validity standpoint, it is only partially valid. The authors motivated the decision to leave out the "people-centrism" dimension by pointing out that the "people" is often referred to by words such as "us", "we" and "our", which are also used to refer to entities other than the people (such as political parties). The inclusion of these words in the dictionary, they argue, would result in a large number of false positives.

## Face validity <a class="anchor" id="subparagraph2"></a>

A populist dictionary has face validity if the allegedly populist parties are indeed populist. In the Italian case, we would expect populist values to be higher for parties that the literature deems populist (i.e. Five Star Movement, Lega Nord, Forza Italia and Il Popolo delle Libertà).

Let's first run an Analysis Of Variance (ANOVA) by using the % of populist tokens as our dependent variable and the group cluster as our predictor. 

The results indicate that the differences in the % of populist tokens between the group clusters are statistically significant.

In [11]:
# Reloading the saved results
load("df_rp.RData")

# One-way ANOVA: share of populist tokens explained by the group cluster
anova_rp <- aov(
  formula = perc_of_populist_toks ~ group_cluster,
  data = df_rp
)

summary(anova_rp)

                  Df Sum Sq   Mean Sq F value Pr(>F)    
group_cluster     11  0.009 0.0008159   10.77 <2e-16 ***
Residuals     297661 22.560 0.0000758                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can assess how Rooduijn and Pauwels' dictionary fares in terms of face validity by grouping the results of the dictionary analysis by cluster and computing the average % of populist tokens (both standardized and unstandardized). 

The following are the results for the 1994-2021 period. Consistently with our expectations and with the consensus in the literature, M5S and Lega Nord ('EthnicRegional') rank among the most populist clusters. However, the standardized % of populist tokens for Forza Italia and Il Popolo delle Libertà (the "Conservative" cluster) turned out to be negative. Interestingly, left-wing clusters such as the "Agrarian" cluster and "Italia dei Valori" received strong populist scores.

In [12]:
# Average raw and standardized share of populist tokens per cluster, most populist first
df_rp %>%
  group_by(group_cluster) %>%
  summarise(
    mean_perc_of_populist_toks = mean(perc_of_populist_toks, na.rm = TRUE),
    mean_standardized_perc_of_populist_toks = mean(standardized_perc_of_populist_toks, na.rm = TRUE)
  ) %>%
  arrange(desc(mean_standardized_perc_of_populist_toks))

group_cluster,mean_perc_of_populist_toks,mean_standardized_perc_of_populist_toks
<chr>,<dbl>,<dbl>
M5S,0.0018443707,0.048776717
Agrarian,0.0018252508,0.046528893
IdV,0.0016374709,0.024924353
EthnicRegional,0.0016032646,0.021070849
FarRight,0.0015502313,0.014960523
Liberal,0.0014360122,0.001790075
Conservative,0.0013935367,-0.003057208
Misto,0.0013683535,-0.005969932
CommunistSocialDemocratic,0.0013249569,-0.010940262
Christian Democratic,0.0011804321,-0.027552272


## External validity <a class="anchor" id="subparagraph3"></a>

### Chapel Hill Expert Survey <a class="anchor" id="subparagraph4"></a>

As Rooduijn and Pauwels' dictionary only captures the anti-elite dimension of populism, the external validation will be carried out against the anti-elite salience variable from the CHES dataset, which was introduced in 2014.

Let's load the CHES dataset

In [13]:
# Reading the CHES trend file without printing the column specification
ches <- read_csv(
  file = "1999-2019_CHES_dataset_means(v2).csv",
  show_col_types = FALSE
)

The CHES country code for Italy is 8. The following is a list of all Italian parties in the CHES dataset in the 2014-2019 time period.

In [17]:
# Italian parties (country code 8) surveyed between 2014 and 2019
ches %>%
  filter(country == 8, year >= 2014, year <= 2019) %>%
  distinct(party)

party
<chr>
UDC
SC
VdA
PD
FI
LN
FdI
SEL
M5S
CD


While these are the parties included in our dataset in the same timeframe

In [16]:
# Parliamentary groups present in our data between 2014 and 2019
df_rp %>%
  filter(year >= 2014, year <= 2019) %>%
  distinct(party)

party
<chr>
CI
PD
FI-PDL
M5S
MDP-LU
NCI-SCPI-MAIE
MISTO
SI-SEL-POS-LU
AP-CPE-NCD-NCI
LNA


'Vallée d'Aoste', 'Südtiroler Volkspartei' and 'Radicali Italiani' are not part of our dataset, so let's drop them from the CHES dataset.

In [23]:
# CHES parties without a counterpart among our parliamentary groups
to_drop <- c("VdA", "SVP", "RI")

# Keeping the Italian 2014-2019 observations of the remaining parties
ches <- ches %>%
  filter(
    country == 8,
    year >= 2014,
    year <= 2019,
    !party %in% to_drop
  )

Creating a new column in the CHES dataset called 'group_cluster' that matches the 'group_cluster2' column in the dataset

In [27]:
# Mapping every CHES party onto the cluster labels used in our data;
# parties that are not listed get a missing cluster
ches <- ches %>%
  mutate(
    group_cluster = case_when(
      party %in% c("UDC", "CD", "NCD") ~ "Christian Democratic",
      party %in% c("SEL", "RC", "SI") ~ "Agrarian",
      party == "SC" ~ "Liberal",
      party == "PD" ~ "CommunistSocialDemocratic",
      party == "FI" ~ "Conservative",
      party == "LN" ~ "EthnicRegional",
      party == "FdI" ~ "FarRight",
      party == "M5S" ~ "M5S"
    )
  )

Let's now have a look at the average % of populist tokens for each cluster during the 2014-2019 timeframe (i.e. the CHES timeframe). We'll drop the "Mixed group" and "Italia Viva" as these two parliamentary groups are absent from the CHES dataset.

The results of the dictionary analysis for the 2014-2019 timeframe are surprisingly different from the ones for the 1994-2021 timeframe. Firstly, all clusters have a positive standardized score, i.e. they all score above the overall 1994-2021 mean. Secondly, the FarRight, the Conservative and the Agrarian clusters are now the three leaders in the use of anti-establishment language. Thirdly, and most importantly, M5S and Lega now rank among the least populist clusters. These results (especially the third one) are unexpected and call into question the face validity of Rooduijn and Pauwels' dictionary.

In [32]:
# Average share of populist tokens per cluster in the CHES timeframe (2014-2019).
# The mixed group and Italia Viva are excluded because they are absent from CHES.
# Party codes are upper-case in our data ("MISTO"), so the party filter has to use
# that spelling: the previous comparison with "Misto" never matched any row.
rp_corr <- df_rp %>% 
filter(year >= 2014 & year <= 2019 & group_cluster != "Misto" & party != "MISTO" & party != "IV") %>% 
group_by(group_cluster) %>% 
summarize(mean_perc_of_populist_toks = mean(perc_of_populist_toks),
          mean_standardized_perc_of_populist_toks = mean(standardized_perc_of_populist_toks, na.rm = TRUE)) %>% 
arrange(desc(mean_perc_of_populist_toks))

rp_corr

group_cluster,mean_perc_of_populist_toks,mean_standardized_perc_of_populist_toks
<chr>,<dbl>,<dbl>
FarRight,0.002502754,0.12428809
Conservative,0.002205361,0.09035805
Agrarian,0.002199634,0.08947969
Christian Democratic,0.001941371,0.05982241
CommunistSocialDemocratic,0.001906417,0.0558781
M5S,0.001880763,0.0529755
EthnicRegional,0.001817698,0.04591648
Liberal,0.001611024,0.02188732


Here's the average anti-elite salience grouped by cluster. The difference with Rooduijn and Pauwels' dictionary is stark. Here M5S and Lega rank as the most populist clusters, while the Christian Democratic cluster is the second least populist.

In [33]:
# Mean CHES anti-elite salience per cluster, highest first
ches_corr <- ches %>%
  group_by(group_cluster) %>%
  summarise(mean_anti_elite_salience = mean(antielite_salience)) %>%
  arrange(desc(mean_anti_elite_salience))

ches_corr

group_cluster,mean_anti_elite_salience
<chr>,<dbl>
M5S,9.444445
EthnicRegional,8.566667
FarRight,7.125
Agrarian,6.639683
Conservative,4.088235
CommunistSocialDemocratic,3.141176
Christian Democratic,2.344444
Liberal,1.0


Let's join the average anti-elite salience from the CHES dataset to the Rooduijn and Pauwels' data.

In [34]:
# Keeping only the clusters that appear in both the dictionary results and CHES
ches_rp <- inner_join(rp_corr, ches_corr, by = "group_cluster")

ches_rp

group_cluster,mean_perc_of_populist_toks,mean_standardized_perc_of_populist_toks,mean_anti_elite_salience
<chr>,<dbl>,<dbl>,<dbl>
FarRight,0.002502754,0.12428809,7.125
Conservative,0.002205361,0.09035805,4.088235
Agrarian,0.002199634,0.08947969,6.639683
Christian Democratic,0.001941371,0.05982241,2.344444
CommunistSocialDemocratic,0.001906417,0.0558781,3.141176
M5S,0.001880763,0.0529755,9.444445
EthnicRegional,0.001817698,0.04591648,8.566667
Liberal,0.001611024,0.02188732,1.0


The correlation between the % of populist tokens as calculated with Rooduijn and Pauwels' dictionary and the CHES anti-élite salience variable is not statistically different from zero: the large p-value prevents us from rejecting the null hypothesis. 

In [35]:
# Pearson correlation between the dictionary score and the CHES anti-elite salience
cor.test(
  x = ches_rp$mean_perc_of_populist_toks,
  y = ches_rp$mean_anti_elite_salience,
  method = "pearson"
)


	Pearson's product-moment correlation

data:  ches_rp$mean_perc_of_populist_toks and ches_rp$mean_anti_elite_salience
t = 0.81322, df = 6, p-value = 0.4472
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.5007769  0.8344771
sample estimates:
      cor 
0.3150841 


### The PopuList <a class="anchor" id="subparagraph5"></a>

Reading the PopuList dataset

In [36]:
# Reading the PopuList workbook
populist <- readxl::read_xlsx(path = "populist-version-2-20200626.xlsx")

All the Italian parties in the PopuList dataset

In [37]:
# Listing every Italian party covered by the PopuList
populist %>%
  filter(country_name == "Italy") %>%
  distinct(party_name)

party_name
<chr>
Fiamma Tricolore
Forza Italia – Il Popolo della Libertà
Fratelli d'Italia – Centrodestra Nazionale
Il Popolo della Libertà
Lega (Nord)
Lega d'Azione Meridionale
Liga Veneta
Movimento 5 Stelle
Movimento Sociale Italiano
Partito dei Comunisti Italiani


The parties in our dataset (grouped by cluster)

In [41]:
# Distinct parties within each cluster, sorted by cluster
df_rp %>%
  group_by(group_cluster) %>%
  distinct(party) %>%
  arrange(group_cluster) %>%
  select(group_cluster, party)

group_cluster,party
<chr>,<chr>
Agrarian,RC-PROGR
Agrarian,COMUNISTA
Agrarian,RC
Agrarian,COM/IT/
Agrarian,RC-SE
Agrarian,SI-SEL-POS-LU
Christian Democratic,PPI
Christian Democratic,DEMO
Christian Democratic,CCD
Christian Democratic,UDEUR


"Fiamma tricolore", "Lega d'Azione Meridionale", "Movimento Sociale Italiano" are not in our dataset. So let's drop them from the PopuList dataset.

In [42]:
# PopuList parties without a counterpart among our parliamentary groups
to_drop <- c("Fiamma Tricolore", "Lega d'Azione Meridionale", "Movimento Sociale Italiano")

# Keeping the remaining Italian parties
populist <- populist %>%
  filter(
    country_name == "Italy",
    !party_name %in% to_drop
  )

Create a new variable called 'group_cluster' whose values match the ones of the group_cluster variable in our dataset

In [43]:
# Mapping every PopuList party onto the cluster labels used in our data;
# parties that are not listed get a missing cluster
populist <- populist %>%
  mutate(
    group_cluster = case_when(
      party_name %in% c("Forza Italia – Il Popolo della Libertà",
                        "Il Popolo della Libertà") ~ "Conservative",
      party_name == "Fratelli d'Italia – Centrodestra Nazionale" ~ "FarRight",
      party_name %in% c("Lega (Nord)", "Liga Veneta") ~ "EthnicRegional",
      party_name == "Movimento 5 Stelle" ~ "M5S",
      party_name %in% c("Partito dei Comunisti Italiani",
                        "Partito della Rifondazione Comunista",
                        "Rivoluzione Civile",
                        "Sinistra") ~ "Agrarian"
    )
  )

Let's compute the mean populism score for each cluster in the PopuList dataset. Due to the small number of parties in the dataset, we ended up with only 5 clusters. However, we can see that all populist parties have a score of 1.

In [44]:
# Mean PopuList populism indicator per cluster, highest first
populist_corr <- populist %>%
  group_by(group_cluster) %>%
  summarise(mean_populist = mean(populist)) %>%
  arrange(desc(mean_populist))

populist_corr

group_cluster,mean_populist
<chr>,<dbl>
Conservative,1
EthnicRegional,1
FarRight,1
M5S,1
Agrarian,0


Let's join the mean populism score from the PopuList dataset to the results we got from the dictionary analysis (which we've filtered to keep only those parties appearing in both our dataset and in the PopuList dataset). The missing values in the mean_populist variable will be filled with zeros: this makes sense given that these are all mainstream clusters.

In [51]:
# Parliamentary groups that have a counterpart in the PopuList
to_keep <- c(
  "F-ITA", "FI", "PDL", "FI-PDL",
  "FDI-AN", "FDI",
  "LEGA-N", "LEGA-NORD-P", "LNA", "LEGA", "LNP",
  "M5S",
  "RC-PROGR", "COMUNISTA", "RC", "COM/IT/", "RC-SE", "SI-SEL-POS-LU"
)

# Cluster-level dictionary score joined with the PopuList score;
# clusters without a PopuList score are set to 0
populist_rp <- df_rp %>%
  filter(party %in% to_keep) %>%
  group_by(group_cluster) %>%
  summarise(mean_perc_of_populist_toks = mean(perc_of_populist_toks)) %>%
  left_join(populist_corr, by = "group_cluster") %>%
  mutate(mean_populist = replace_na(mean_populist, replace = 0)) %>%
  arrange(desc(mean_perc_of_populist_toks))

populist_rp

group_cluster,mean_perc_of_populist_toks,mean_populist
<chr>,<dbl>,<dbl>
FarRight,0.002508086,1
M5S,0.001844371,1
Agrarian,0.001825251,0
EthnicRegional,0.001730049,1
Conservative,0.001393537,1


The correlation with the PopuList score is very close to zero. Based on this result and the one we got with the CHES dataset, we can conclude that Rooduijn and Pauwels' dictionary lacks external validity.

In [52]:
# Pearson correlation between the dictionary score and the PopuList score
cor.test(
  x = populist_rp$mean_perc_of_populist_toks,
  y = populist_rp$mean_populist,
  method = "pearson"
)


	Pearson's product-moment correlation

data:  populist_rp$mean_perc_of_populist_toks and populist_rp$mean_populist
t = 0.083803, df = 3, p-value = 0.9385
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.8710799  0.8925380
sample estimates:
       cor 
0.04832688 


# Decadri & Boussalis <a class="anchor" id="chapter5"></a>

Let's run the dictionary analysis with Decadri and Boussalis' dictionary

In [53]:
# Scoring every speech with the two-dimensional dictionary of Decadri & Boussalis
df_db <- texts %>%
  dict_analysis(dictionary = "Decadri_Boussalis")

Let's save the output as an .Rdata file

In [54]:
# Saved relative to the working directory set at the top of the notebook, so the
# path is not hard-coded twice and matches the later load("df_db.RData") call.
save(df_db, file = "df_db.RData")

The first rows of the dataframe

In [55]:
# Previewing the first six rows of the results
df_db %>%
  head(n = 6)

Unnamed: 0_level_0,doc_id,year,party,group_cluster,anti_elitism,people_centrism,populist_toks,n_of_cleaned_toks,perc_of_populist_toks,standardized_perc_of_populist_toks
Unnamed: 0_level_1,<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
1,text1,1994,F-ITA,Conservative,0,0,0,9,0,-0.3053681
2,text2,1994,PROGR-F,CommunistSocialDemocratic,0,0,0,4,0,-0.3053681
3,text3,1994,LEGA-N,EthnicRegional,0,0,0,26,0,-0.3053681
4,text4,1994,RC-PROGR,Agrarian,0,0,0,4,0,-0.3053681
5,text5,1994,PROGR-F,CommunistSocialDemocratic,0,0,0,4,0,-0.3053681
6,text6,1994,PPI,Christian Democratic,0,0,0,3,0,-0.3053681


## Construct validity <a class="anchor" id="subparagraph6"></a>

Decadri and Boussalis' dictionary captures both the "anti-elitism" and the "people-centrism" dimensions of populist ideology, and it thus constitutes an improvement over Rooduijn and Pauwels' dictionary in terms of construct validity.

## Face validity <a class="anchor" id="subparagraph7"></a>

Let's run an Analysis Of Variance (ANOVA) as we did before. Again, the difference in the % of populist tokens between the clusters is statistically significant.

In [56]:
# Reloading the saved results
load("df_db.RData")

# One-way ANOVA: share of populist tokens explained by the group cluster
anova_db <- aov(
  formula = perc_of_populist_toks ~ group_cluster,
  data = df_db
)

summary(anova_db)

                  Df Sum Sq  Mean Sq F value Pr(>F)    
group_cluster     11   0.14 0.012888   60.74 <2e-16 ***
Residuals     297661  63.15 0.000212                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To assess the face validity of Decadri and Boussalis' dictionary we'll have a look at the mean % of populist tokens (both anti-establishment and people-centrism) grouped by cluster.

Consistently with our expectations, M5S and Lega rank among the most populist clusters. However, the language used by the "Conservative" (Forza Italia and Il Popolo delle Libertà) and the "FarRight" (Fratelli d'Italia, Alleanza Nazionale) clusters turned out to be less populist than the average. 

In [57]:
# Average raw and standardized share of populist tokens per cluster, most populist first
df_db %>%
  group_by(group_cluster) %>%
  summarise(
    mean_perc_of_populist_toks = mean(perc_of_populist_toks, na.rm = TRUE),
    mean_standardized_perc_of_populist_toks = mean(standardized_perc_of_populist_toks, na.rm = TRUE)
  ) %>%
  arrange(desc(mean_standardized_perc_of_populist_toks))

group_cluster,mean_perc_of_populist_toks,mean_standardized_perc_of_populist_toks
<chr>,<dbl>,<dbl>
M5S,0.006765723,0.158780035
IdV,0.0056841,0.084405945
EthnicRegional,0.005521892,0.073436166
Liberal,0.004949603,0.034039489
Agrarian,0.004719057,0.018293719
Conservative,0.004385874,-0.004559934
Misto,0.004282996,-0.011653767
Christian Democratic,0.004256468,-0.013475787
Green,0.004173576,-0.019174781
FarRight,0.004090657,-0.024775337


## External validity <a class="anchor" id="subparagraph8"></a>

### Chapel Hill Expert Survey <a class="anchor" id="subparagraph9"></a>

As Decadri and Boussalis' dictionary captures both dimensions of populism, we will validate it against a combination of two different variables from the CHES dataset, i.e. "anti-élite salience" and "people_vs_élite". We'll use the former as a proxy for the anti-establishment component and the latter as a proxy for the people-centrist one. The "people_vs_élite" variable was introduced in the 2019 edition of the dataset, so we'll only work with observations from that year.

In [64]:
# Re-reading the full CHES trend file (the previous `ches` object had been filtered)
ches <- read_csv(
  file = "1999-2019_CHES_dataset_means(v2).csv",
  show_col_types = FALSE
)

The following are the Italian parties in the CHES dataset for the year 2019

In [65]:
# Italian parties in the 2019 wave with the two CHES populism items
ches %>%
  filter(country == 8, year == 2019) %>%
  select(party, antielite_salience, people_vs_elite)

party,antielite_salience,people_vs_elite
<chr>,<dbl>,<dbl>
RI,2.2,3.357143
M5S,8.888889,9.529411
SI,3.785714,2.666667
FdI,8.0,6.625
PD,1.882353,2.0625
LN,8.333333,6.9375
SVP,2.166667,1.4
FI,4.176471,4.066667


The parties in our dataset in the same year

In [61]:
# Parliamentary groups present in our data in 2019
df_db %>%
  filter(year == 2019) %>%
  distinct(party)

party
<chr>
M5S
LEGA
PD
IV
FI
FDI
MISTO
LEU


"Radicali Italiani" and "Südtiroler Volkspartei" are not in our dataset so we'll drop them from CHES

In [68]:
# CHES parties without a counterpart among our parliamentary groups
to_drop <- c("RI", "SVP")

# Keeping the Italian 2019 observations of the remaining parties
ches <- ches %>%
  filter(
    country == 8,
    year == 2019,
    !party %in% to_drop
  )

Let's create another group_cluster variable

In [74]:
# Mapping the 2019 CHES parties onto the cluster labels used in our data;
# parties that are not listed get a missing cluster
ches <- ches %>%
  mutate(
    group_cluster = case_when(
      party == "SI" ~ "Agrarian",
      party == "FdI" ~ "FarRight",
      party == "FI" ~ "Conservative",
      party == "PD" ~ "CommunistSocialDemocratic",
      party == "LN" ~ "EthnicRegional",
      party == "M5S" ~ "M5S"
    )
  )

Let's compute the average populist value for each cluster in the CHES dataset by summing the people vs elite and the anti-elite salience variables and then taking the mean

In [78]:
# CHES populism per cluster: mean of the sum of the people-vs-elite and
# anti-elite salience items, highest first
ches_corr <- ches %>%
  group_by(group_cluster) %>%
  summarise(mean_populism = mean(people_vs_elite + antielite_salience)) %>%
  arrange(desc(mean_populism))

ches_corr

group_cluster,mean_populism
<chr>,<dbl>
M5S,18.418301
EthnicRegional,15.270833
FarRight,14.625
Conservative,8.243137
Agrarian,6.452381
CommunistSocialDemocratic,3.944853


Similarly to what we saw for Rooduijn and Pauwels', when we zoom in on the recent legislatures the face validity is compromised. Here, PD got a higher % of populist tokens compared with la Lega and M5S. 

In [89]:
# Parliamentary groups absent from the CHES dataset
to_drop <- c("IV", "MISTO")

# Mean share of populist tokens per cluster in 2019, most populist first
df_db %>%
  filter(year == 2019, !party %in% to_drop) %>%
  group_by(group_cluster) %>%
  summarise(mean_perc_of_populist_toks = mean(perc_of_populist_toks)) %>%
  arrange(desc(mean_perc_of_populist_toks))

group_cluster,mean_perc_of_populist_toks
<chr>,<dbl>
Conservative,0.007566963
FarRight,0.006830518
CommunistSocialDemocratic,0.006006262
EthnicRegional,0.004312354
M5S,0.004217489


In [84]:
# Parliamentary groups absent from the CHES dataset
to_drop <- c("IV", "MISTO")

# 2019 cluster-level dictionary score, restricted to clusters that also have a CHES score
ches_db <- df_db %>%
  filter(year == 2019, !party %in% to_drop) %>%
  group_by(group_cluster) %>%
  summarise(mean_perc_of_populist_toks = mean(perc_of_populist_toks)) %>%
  inner_join(ches_corr, by = "group_cluster")

ches_db

group_cluster,mean_perc_of_populist_toks,mean_populism
<chr>,<dbl>,<dbl>
CommunistSocialDemocratic,0.006006262,3.944853
Conservative,0.007566963,8.243137
EthnicRegional,0.004312354,15.270833
FarRight,0.006830518,14.625
M5S,0.004217489,18.418301


The p-value is too large for us to reject the null hypothesis. 

In [85]:
# Pearson correlation between the dictionary score and the combined CHES populism score
cor.test(
  x = ches_db$mean_perc_of_populist_toks,
  y = ches_db$mean_populism,
  method = "pearson"
)


	Pearson's product-moment correlation

data:  ches_db$mean_perc_of_populist_toks and ches_db$mean_populism
t = -1.2567, df = 3, p-value = 0.2978
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.9679910  0.6121996
sample estimates:
      cor 
-0.587259 


### The PopuList <a class="anchor" id="subparagraph10"></a>

Let's join the mean populism score from the PopuList dataset to the dictionary analysis results 

In [90]:
# Cluster-level dictionary score (mixed group excluded) joined with the PopuList
# score; clusters without a PopuList score are set to 0
populist_db <- df_db %>%
  filter(group_cluster != 'Misto') %>%
  group_by(group_cluster) %>%
  summarise(mean_perc_of_populist_toks = mean(perc_of_populist_toks)) %>%
  left_join(populist_corr, by = "group_cluster") %>%
  mutate(mean_populist = replace_na(mean_populist, replace = 0)) %>%
  arrange(desc(mean_perc_of_populist_toks))

populist_db

group_cluster,mean_perc_of_populist_toks,mean_populist
<chr>,<dbl>,<dbl>
M5S,0.006765723,1
IdV,0.0056841,0
EthnicRegional,0.005521892,1
Liberal,0.004949603,0
Agrarian,0.004719057,0
Conservative,0.004385874,1
Christian Democratic,0.004256468,0
Green,0.004173576,0
FarRight,0.004090657,1
CommunistSocialDemocratic,0.003906236,0


The relationship between the mean % of populist tokens as calculated with Decadri and Boussalis' dictionary and the mean populist score from the PopuList dataset is moderate (r ≈ 0.40). However, based on the p-value we can't reject the null hypothesis of no correlation.

In [91]:
# Pearson correlation between the dictionary score and the PopuList score
cor.test(
  x = populist_db$mean_perc_of_populist_toks,
  y = populist_db$mean_populist,
  method = "pearson"
)


	Pearson's product-moment correlation

data:  populist_db$mean_perc_of_populist_toks and populist_db$mean_populist
t = 1.2996, df = 9, p-value = 0.226
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2657295  0.8053455
sample estimates:
      cor 
0.3975105 
