<a href="https://colab.research.google.com/github/MoritzLaurer/zeroshot-classifier/blob/main/1_data_harmonization_manifesto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prepare the data from the Manifesto Corpus
Dataset details: https://manifesto-project.wzb.eu/

In [None]:
import pandas as pd
import numpy as np
import os

# One global seed for the whole notebook: it seeds numpy's legacy global RNG here and is
# passed as `random_state` to the sampling / splitting calls further down.
SEED_GLOBAL = 42
np.random.seed(seed=SEED_GLOBAL)

## Load data

In [None]:
## connect to google drive (Colab only)
from google.colab import drive
drive.mount("/content/drive", force_remount=False)

# set the working directory to the project folder on Drive;
# the cwd is printed before and after the switch so the change is visible in the output
project_dir = "/content/drive/My Drive/PhD/zero-shot-models"
cwd_before = os.getcwd()
print(cwd_before)
os.chdir(project_dir)
cwd_after = os.getcwd()
print(cwd_after)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/My Drive/PhD/zero-shot-models


In [None]:
## load dfs
# Codebook notes:
# - correct v5 codebook: https://manifesto-project.wzb.eu/down/papers/handbook_2021_version_5.pdf
#   (the PDF linked on the following page is wrong, but its html is correct: https://manifesto-project.wzb.eu/coding_schemes/mp_v5)
# - we are working with v4 for backwards compatibility: https://manifesto-project.wzb.eu/down/papers/handbook_2011_version_4.pdf
# - overview of changes from v4 to v5: https://manifesto-project.wzb.eu/down/papers/Evolution_of_the_Manifesto_Coding_Instructions_and_the_Category_Scheme.pdf
# - the switch from v4 to v5 was in 2016/2017
# - the file loaded here is the version provided by the Manifesto team

df = pd.read_csv(
    "./datasets_raw/manifesto_all.zip",
    index_col="Unnamed: 0",
)

# quick overview: available columns, then number of rows
print(df.columns, len(df), sep="\n")


Index(['text', 'cmp_code', 'eu_code', 'manifesto_id', 'party', 'date',
       'country_code', 'country_name', 'coderid', 'coderyear', 'testresult',
       'testeditsim', 'text_cleaned', 'cmp_code_hb4'],
      dtype='object')
1264301


## Clean data

In [None]:
# Build the cleaned working frame `df_cl` from the raw frame `df`.
# The row count is printed after every filter so the effect of each step is visible.

# Keep only the columns used downstream. Selecting first and copying afterwards avoids
# deep-copying every column of the raw frame.
cols_keep = ["text", "cmp_code", "cmp_code_hb4", "manifesto_id", "party", "date", "country_name", "testresult"]  # not kept: "eu_code", "Text_CharsReplaced"
df_cl = df[cols_keep].copy()
print(len(df_cl))

# only English texts from English speaking countries (Canada is not included)
country_lst = ["New Zealand", "United Kingdom", "Ireland", "Australia", "United States", "South Africa"]
df_cl = df_cl[df_cl["country_name"].isin(country_lst)]
print(len(df_cl))

# drop rows without text
df_cl = df_cl[df_cl["text"].notna()]
print(len(df_cl))

# drop rows without a v4 code: 13k NA in English data. seem to be headlines and very short texts. they can have meaning
df_cl = df_cl[df_cl["cmp_code_hb4"].notna()]
print(len(df_cl))
# drop headlines, i.e. codes starting with "H" (7.6k headlines)
df_cl = df_cl[~df_cl["cmp_code_hb4"].str.match("H", na=False)]
print(len(df_cl))

# Drop very long texts, assuming that they contain too much noise from other types and
# unrelated language. In the recorded run the 1200-character cut-off removed no rows;
# an earlier note says a 1000-character cut-off removes around 9k.
# A lower bound of 30 characters (removes 67 rows) was tried and is deliberately not applied.
MAX_TEXT_CHARS = 1200
# The trailing .copy() makes df_cl an independent frame, so that adding columns to it later
# cannot raise SettingWithCopyWarning.
df_cl = df_cl[~df_cl["text"].str.len().ge(MAX_TEXT_CHARS)].copy()
print(len(df_cl))

## duplicates - deliberately NOT removed:
# - identical strings with different codes are kept: can be useful for experiments with
#   context, as they show the value of context for disambiguation
# - exact duplicate texts (around 7k) are kept to maintain the sequentiality of texts


1264301
145950
145950
133331
126109
126109


In [None]:
# Translate the numeric label codes into label texts with the codebook mapping (MPDS2020a-1).
# Codebook: https://manifesto-project.wzb.eu/down/papers/handbook_2011_version_4.pdf
# Note that the "main" codes are from v4 for backwards compatibility with older data.
# For new v5 categories: everything was aggregated up into the old v4 categories, except for
# 202.2, 605.2 and 703.2, which were added to 000.
df_label_map = pd.read_csv("./datasets_raw/manifesto_codebook.csv")

# domain_name is NaN in the codebook when no label applies; replace it with an expressive string
df_label_map["domain_name"] = df_label_map["domain_name"].fillna("No other category applies")

# Info from the data providers on the two code columns, cmp_code (v5 codebook) and
# cmp_code_hb4 (v4 codebook, backwards compatible), translated from German:
# "In addition, the column cmp_code now simply contains the unmodified original cmp_codes
# (i.e. also the new handbook 5 categories where they were applied). In return there is now
# cmp_code_hb4, in which everything was converted to hb4 (i.e. 605.2 to "000", 202.2 to "000"
# and 703.2 to "000", all other v5 categories aggregated up)."
# Label names changed from v4 to v5 - not changed here because we are working with v4.

# Mapping of numeric codes to domain and subcategory titles. Only the codebook rows whose code
# ends in "0" (XX.0 floats = v4 main categories) are used; XX.1 codes (old subcategories) are
# ignored because they are not present in the shared master file due to backwards compatibility.
# The codebook is walked once and both dictionaries are built from the same filtered rows.
main_code_rows = [
    (int(code), domain_name, title)
    for code, domain_name, title in zip(df_label_map["code"], df_label_map["domain_name"], df_label_map["title"])
    if str(code)[-1] == "0"
]
code_to_domain_map = {code: domain_name for code, domain_name, _ in main_code_rows}
code_to_subcat_map = {code: title for code, _, title in main_code_rows}

# .assign returns a new frame, so no column is ever set on a slice of another frame
codes_hb4 = df_cl["cmp_code_hb4"].astype(int)
df_cl = df_cl.assign(
    label_domain_text=codes_hb4.map(code_to_domain_map),
    label_subcat_text=codes_hb4.map(code_to_subcat_map),
)
print(df_cl["label_domain_text"].nunique())
print(df_cl["label_subcat_text"].nunique())

# remove "No other category applies"
# (.copy() is what prevents the SettingWithCopyWarning on the column assignments below)
df_cl = df_cl[df_cl["label_subcat_text"] != "No other category applies"].copy()

# every remaining code must have been mapped to a label text
assert df_cl["label_subcat_text"].notna().all(), "some cmp_code_hb4 values have no label text in the codebook"

# ! decide on label level to use for downstream analysis
df_cl["label_text"] = df_cl["label_subcat_text"]
df_cl["label"] = pd.factorize(df_cl["label_text"], sort=True)[0]

# test that label and label_text correspond one-to-one
# (replaces the earlier printed comparison of sorted value counts, which could not fail)
assert df_cl.groupby("label")["label_text"].nunique().eq(1).all(), "a label id maps to several label texts"
assert df_cl["label"].nunique() == df_cl["label_text"].nunique(), "label ids and label texts are not one-to-one"

# final update: fresh 0..n-1 index named "idx" (provides proper column name in dataset object downstream)
df_cl = df_cl.reset_index(drop=True).rename_axis("idx")

print(df_cl.label_text.value_counts(), "\n")
print(df_cl.country_name.value_counts())


8
57
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True]
Welfare State Expansion                       14493
Technology and Infrastructure: Positive        9424
Education Expansion                            7698
Environmental Protection                       7597
Equality: Positive                             6857
Law and Order: Positive                        5249
Market Regulation                              4031
Labour Groups: Positive                        3956
Governmental and Administrative Efficiency     3687
Political Authority                            3569
Incentives: Positive                           3507
Economic Growth: Positive                      3252
Agriculture and Farmers: Posi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl["label_text"] = df_cl["label_subcat_text"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl["label"] = pd.factorize(df_cl["label_text"], sort=True)[0]


In [None]:
# merge consecutive rows that have the same label into one row
# concatenation of text increases data quality and makes texts longer

def merge_consecutive_same_label(df, window=2):
    """Merge runs of `window` consecutive rows that share the same label.

    Each manifesto (`manifesto_id`) is processed separately, in order of first appearance,
    to avoid merging sentences across manifestos. Within a manifesto the rows are scanned
    from top to bottom: when the next `window` rows all carry the same `label_text`, their
    texts are joined with a single space into one example and the scan continues after
    them; otherwise the scan advances by one row.

    NOTE: rows that are not part of such a run are not emitted at all - only merged
    examples are returned.

    Args:
        df: DataFrame with the columns "manifesto_id", "text" and "label_text".
        window: number of consecutive rows to merge (>= 2). The default of 2 merges pairs;
            3 corresponds to the alternative three-sentence window.

    Returns:
        (texts, labels): two lists of equal length with the merged texts and their labels.

    Raises:
        ValueError: if `window` is smaller than 2.
    """
    if window < 2:
        raise ValueError("window must be >= 2")

    merged_texts = []
    merged_labels = []
    for _, group_df in df.groupby(by="manifesto_id", sort=False):
        # Work on plain Python lists: avoids the chained in-place fillna on the group
        # (a no-op under pandas Copy-on-Write) and the slow repeated .iloc scalar access.
        texts = group_df["text"].fillna("").tolist()
        labels = group_df["label_text"].tolist()

        pos = 0
        while pos + window <= len(labels):
            if all(labels[pos] == labels[pos + offset] for offset in range(1, window)):
                merged_texts.append(" ".join(texts[pos:pos + window]))
                merged_labels.append(labels[pos])
                pos += window  # all rows of the run are used up
            else:
                pos += 1
    return merged_texts, merged_labels


text_merged, label_text_merged = merge_consecutive_same_label(df_cl, window=2)


In [None]:
# Assemble the merged examples into a frame and add integer label ids
# (ids follow the alphabetical order of the label texts).
df_merged = pd.DataFrame(
    {
        "text": text_merged,
        "label_text": label_text_merged,
    }
)
label_ids, _ = pd.factorize(df_merged["label_text"], sort=True)
df_merged["label_standard"] = label_ids

# label distribution after merging
df_merged["label_text"].value_counts()


Welfare State Expansion                       5775
Technology and Infrastructure: Positive       3458
Education Expansion                           3109
Environmental Protection                      2952
Law and Order: Positive                       2118
Equality: Positive                            1988
Market Regulation                             1389
Agriculture and Farmers: Positive             1272
Incentives: Positive                          1166
Governmental and Administrative Efficiency    1132
Labour Groups: Positive                       1098
Political Authority                           1041
Anti-Growth Economy: Positive                 1016
Decentralization                               997
Non-economic Demographic Groups                938
Internationalism: Positive                     920
Democracy                                      902
Culture: Positive                              885
Military: Positive                             870
Economic Growth: Positive      

In [None]:
# eyeball a reproducible random sample of 100 merged examples
df_merged.sample(n=100, random_state=SEED_GLOBAL)

Unnamed: 0,text,label_text,label_standard
24379,"give buses and trams on-road priority, and pro...",Technology and Infrastructure: Positive,50
21978,We support a cap on discretionary spending tha...,Economic Planning,15
22265,President Bush led the G-8 in endorsing the es...,Technology and Infrastructure: Positive,50
322,Stop further private finance initiative (PFI) ...,Nationalisation,43
32971,Poverty is not inevitable in a decent society....,Equality: Positive,19
...,...,...,...
42018,We want our whānau to be the best that they ca...,Welfare State Expansion,54
16501,Hague Preferences: While negotiating the new C...,Agriculture and Farmers: Positive,0
27287,These Action Plans will allow organisations to...,Labour Groups: Positive,32
7250,The Governor will also have a statutory requir...,Decentralization,10


## Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# 60/40 split, shuffled and stratified by label text so that both splits
# keep the label distribution of the full merged dataset
df_train, df_test = train_test_split(
    df_merged,
    test_size=0.4,
    random_state=SEED_GLOBAL,
    shuffle=True,
    stratify=df_merged["label_text"],
)

print(f"Overall train size: {len(df_train)}")
print(f"Overall test size: {len(df_test)}")

# label counts per split next to the counts in the full dataset
frames_by_split = {"train": df_train, "test": df_test, "all": df_merged}
df_train_test_distribution = pd.DataFrame(
    [frame.label_text.value_counts().rename(split) for split, frame in frames_by_split.items()]
).transpose()

df_train_test_distribution


Overall train size: 25527
Overall test size: 17018


Unnamed: 0,train,test,all
Welfare State Expansion,3465,2310,5775
Technology and Infrastructure: Positive,2075,1383,3458
Education Expansion,1865,1244,3109
Environmental Protection,1771,1181,2952
Law and Order: Positive,1271,847,2118
Equality: Positive,1193,795,1988
Market Regulation,833,556,1389
Agriculture and Farmers: Positive,763,509,1272
Incentives: Positive,700,466,1166
Governmental and Administrative Efficiency,679,453,1132


## Save data

In [None]:
# show the directory that the relative output paths below resolve against
print(os.getcwd())

dataset_name = "manifesto"

# both splits are written as gzip-compressed parquet files
train_path = f"./datasets_standardized/ds_{dataset_name}_train.gzip"
test_path = f"./datasets_standardized/ds_{dataset_name}_test.gzip"
df_train.to_parquet(train_path, compression='gzip')
df_test.to_parquet(test_path, compression='gzip')

/content/drive/MyDrive/PhD/zero-shot-models
