<a href="https://colab.research.google.com/github/MoritzLaurer/zeroshot-classifier/blob/main/1_data_harmonization_capsotu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prepare data from the Comparative Agendas Project (CAP) - State of the Union (SotU) dataset
Dataset details: https://www.comparativeagendas.net/datasets_codebooks

In [None]:
import pandas as pd
import numpy as np
import os

# Single global seed: set once here and reused by every stochastic step below.
SEED_GLOBAL = 42
np.random.seed(seed=SEED_GLOBAL)

## Download data

In [None]:
# Mount Google Drive (Colab only) so the project folder becomes reachable.
from google.colab import drive

drive.mount('/content/drive', force_remount=False)

# Move into the project folder on Drive; the relative paths used when saving
# the data resolve against this directory. Print the cwd before and after.
print(os.getcwd())
project_dir = "/content/drive/My Drive/PhD/zero-shot-models"
os.chdir(project_dir)
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/My Drive/PhD/zero-shot-models


In [None]:
# Background material:
#   overview of CAP data: https://www.comparativeagendas.net/datasets_codebooks
#   overall CAP master codebook: https://www.comparativeagendas.net/pages/master-codebook
#   SOTU codebook 2015: https://comparativeagendas.s3.amazonaws.com/codebookfiles/State_of_the_Union_Address_Codebook.pdf
sotu_csv_url = "https://comparativeagendas.s3.amazonaws.com/datasetfiles/US-Exec_SOTU_2023.csv"
df = pd.read_csv(sotu_csv_url)

# quick look at what was downloaded: column names, then number of rows
print(df.columns, len(df), sep="\n")


Index(['id', 'doc_count', 'filter_PolicySentence', 'date', 'oral_delivery',
       'outgoing', 'congress', 'president', 'pres_party', 'divided',
       'control_house', 'control_senate', 'year', 'month', 'day', 'source',
       'description', 'pap_majortopic', 'pap_subtopic', 'majortopic',
       'subtopic'],
      dtype='object')
24770


## Data cleaning

In [None]:
# The raw file contains two types of topic codes. Based on the codebook, the pap_* columns
# seem to be the older Policy Agendas Project codes from the US, while majortopic/subtopic
# are the newer, more international CAP codes. The CAP codes are used here.
# (! in CAP-us-courts it made more sense to use pap_majortopic)
columns_kept = ["description", "majortopic", "subtopic", "year", "president", "pres_party", "id"]
df_cl = df.loc[:, columns_kept].copy(deep=True)
print(len(df_cl))

# drop rows without any text
df_cl = df_cl[df_cl["description"].notna()]
print(len(df_cl))

# Drop very long descriptions (1200 characters or more), assuming they contain too much
# noise from other types and unrelated language.
is_too_long = df_cl["description"].str.len() >= 1200
df_cl = df_cl[~is_too_long]
print(len(df_cl))

# Filters that are deliberately NOT applied:
# - very short strings (< 30 characters): mostly noise, but also some content like "Amen."
# - unique texts annotated with more than one topic (around 105, e.g. " ", ".",
#   "Thank you very much", "#NAME?", "It's the right thing to do.")
# - duplicated texts (around 170): kept to maintain the sequentiality of the texts

# renumber the "Other" category label from -555 to 99
for topic_col in ("majortopic", "subtopic"):
    df_cl[topic_col] = df_cl[topic_col].replace(-555, 99)

# harmonised column names
df_cl = df_cl.rename(columns={"majortopic": "label_cap2", "subtopic": "label_cap4", "description": "text", "id": "id_original"})

# remove the "Other" class (label_cap4 == 99) and rows whose label_cap2 is 0
is_kept = (df_cl["label_cap4"] != 99) & (df_cl["label_cap2"] != 0)
df_cl = df_cl[is_kept]
print(len(df_cl))

# fresh 0..n-1 index; naming it provides a proper column name in the dataset object downstream
df_cl = df_cl.reset_index(drop=True).rename_axis("idx")


24770
24730
24730
19969


In [None]:
# Attach human-readable label names to the numeric CAP major-topic codes.
# Label names are taken from the master codebook as of Oct. 2021,
# https://www.comparativeagendas.net/pages/master-codebook
label_text_map_cap2 = {
    1: "Macroeconomics",
    2: "Civil Rights",
    3: "Health",
    4: "Agriculture",
    5: "Labor",
    6: "Education",
    7: "Environment",
    8: "Energy",
    9: "Immigration",
    10: "Transportation",
    12: "Law and Crime",
    13: "Social Welfare",
    14: "Housing",
    15: "Domestic Commerce",
    16: "Defense",
    17: "Technology",
    18: "Foreign Trade",
    19: "International Affairs",
    20: "Government Operations",
    21: "Public Lands",
    23: "Culture",
    #99: "Other",  # intentionally unmapped: the "Other" class is removed during cleaning
}

df_cl["label_cap2_text"] = df_cl["label_cap2"].map(label_text_map_cap2)
print(f"Maybe label_cap4 later too. Very fine-grained number of classes: {len(df_cl.label_cap4.unique())}. Makes for interesting data")

# integer label ids follow the alphabetical order of the label text
df_cl["label"] = df_cl["label_cap2_text"].factorize(sort=True)[0]
df_cl["label_text"] = df_cl["label_cap2_text"]

column_order = [
    "label", "label_text", "text",
    "label_cap2", "label_cap2_text", "label_cap4",
    "year", "president", "pres_party", "id_original",
]
df_cl = df_cl[column_order]

# Sanity checks that label_cap2 and label_cap2_text correspond:
# 1) every label_cap2 code could be mapped to a label text, i.e. no label text is missing
assert not df_cl["label_cap2_text"].isna().any()
# 2) the sorted class sizes are the same whether counted by text or by code (printed, not asserted)
class_sizes_by_text = np.sort(df_cl["label_cap2_text"].value_counts().tolist())
class_sizes_by_code = np.sort(df_cl["label_cap2"].value_counts().tolist())
print(class_sizes_by_text == class_sizes_by_code)

df_cl["label_cap2_text"].value_counts()


Maybe label_cap4 later too. Very fine-grained number of classes: 195. Makes for interesting data
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]


Macroeconomics           3413
International Affairs    3293
Defense                  2975
Health                   1249
Government Operations    1158
Education                 963
Law and Crime             947
Labor                     909
Social Welfare            816
Civil Rights              647
Foreign Trade             601
Energy                    507
Domestic Commerce         393
Agriculture               377
Environment               334
Technology                329
Housing                   297
Immigration               279
Transportation            260
Public Lands              205
Culture                    17
Name: label_cap2_text, dtype: int64

In [None]:
# Merge consecutive rows that share the same label into one longer text.
# Concatenation makes the texts longer and is meant to increase data quality.
# Rows are only merged within one (president, year) group, so sentences are never
# merged across groups. Rows that do not fall into a complete same-label window
# are not carried over into the merged data.

def _merge_same_label_windows(texts, labels, window=2):
    """Concatenate non-overlapping windows of consecutive rows sharing one label.

    The sequence is scanned left to right. Whenever `window` consecutive rows
    carry the same label, their texts are joined with a single space and emitted
    once with that label, and the scan continues after the window. Otherwise the
    scan advances by one row and the current row is not emitted.

    Args:
        texts: list of strings, one per row, in original order.
        labels: list of labels aligned with `texts`.
        window: number of consecutive rows merged into one text. 2 merges pairs;
            3 corresponds to the three-sentence variant.

    Returns:
        Tuple `(merged_texts, merged_labels)` of two equally long lists.

    Raises:
        ValueError: if `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be >= 1")

    merged_texts, merged_labels = [], []
    last_start = len(labels) - window  # last position where a full window still fits
    pos = 0
    while pos <= last_start:
        first_label = labels[pos]
        if all(label == first_label for label in labels[pos + 1:pos + window]):
            merged_texts.append(" ".join(texts[pos:pos + window]))
            merged_labels.append(first_label)
            pos += window  # rows inside a merged window are never reused
        else:
            pos += 1
    return merged_texts, merged_labels


text_merged = []
label_text_merged = []

for _, group_df in df_cl.groupby(by=["president", "year"], group_keys=False, as_index=False, sort=False):
    # Fill missing texts on a local copy instead of a chained in-place call on the
    # group slice, which warns in recent pandas and has no effect under Copy-on-Write.
    group_texts = group_df["text"].fillna("").tolist()
    group_labels = group_df["label_text"].tolist()

    texts_part, labels_part = _merge_same_label_windows(group_texts, group_labels, window=2)
    text_merged.extend(texts_part)
    label_text_merged.extend(labels_part)


In [None]:
# Collect the merged texts and labels in one frame and add integer label ids
# that follow the alphabetical order of the label text.
df_merged = pd.DataFrame(
    {"text": text_merged, "label_text": label_text_merged}
).assign(
    label_standard=lambda frame: pd.factorize(frame["label_text"], sort=True)[0]
)

# class distribution after merging
df_merged["label_text"].value_counts()


International Affairs    1468
Macroeconomics           1455
Defense                  1304
Health                    524
Government Operations     480
Education                 416
Law and Crime             398
Social Welfare            338
Labor                     330
Civil Rights              253
Foreign Trade             238
Energy                    209
Domestic Commerce         149
Agriculture               149
Technology                126
Immigration               121
Environment               120
Housing                   116
Transportation            104
Public Lands               83
Culture                     5
Name: label_text, dtype: int64

In [None]:
# Show the merged frame: concatenated text, label name and integer label id per row.
df_merged

Unnamed: 0,text,label_text,label_standard
0,To the Congress of the United States: A quarte...,Macroeconomics,16
1,The Congress has shown its satisfaction with t...,Macroeconomics,16
2,The President bears the responsibility for rec...,Government Operations,9
3,And that program requires consideration in con...,Macroeconomics,16
4,Where increased programs have been recommended...,Government Operations,9
...,...,...,...
8381,And we cannot go on losing 17 veterans a day ...,Health,10
8382,"And fourth, last year, Jill and I re-ignited ...",Health,10
8383,Turn more cancers from death sentences to tre...,Health,10
8384,She was just a year old when she was diagnose...,Health,10


## Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 60/40 train-test split, stratified by label text and seeded with the
# global seed so the split is reproducible.
df_train, df_test = train_test_split(
    df_merged,
    test_size=0.4,
    stratify=df_merged["label_text"],
    shuffle=True,
    random_state=SEED_GLOBAL,
)

print(f"Overall train size: {len(df_train)}")
print(f"Overall test size: {len(df_test)}")

# Label counts per split, side by side, to check that stratification worked.
frames_by_split = {"train": df_train, "test": df_test, "all": df_merged}
df_train_test_distribution = pd.DataFrame(
    [frame.label_text.value_counts().rename(split) for split, frame in frames_by_split.items()]
).T

df_train_test_distribution


Overall train size: 5031
Overall test size: 3355


Unnamed: 0,train,test,all
International Affairs,881,587,1468
Macroeconomics,873,582,1455
Defense,782,522,1304
Health,314,210,524
Government Operations,288,192,480
Education,249,167,416
Law and Crime,239,159,398
Social Welfare,203,135,338
Labor,198,132,330
Civil Rights,152,101,253


## Save data

In [None]:
# Write the train and test splits as gzip-compressed parquet files.
# Paths are relative to the current working directory, which is printed first
# so that it is visible where the files end up.
print(os.getcwd())

dataset_name = "capsotu"
output_dir = "./datasets_standardized"
# Create the target folder if needed; to_parquet fails when it does not exist.
os.makedirs(output_dir, exist_ok=True)

df_train.to_parquet(f"{output_dir}/ds_{dataset_name}_train.gzip", compression='gzip')
df_test.to_parquet(f"{output_dir}/ds_{dataset_name}_test.gzip", compression='gzip')

/content/drive/My Drive/PhD/zero-shot-models
