# Manifesto data retrieval
For US, Japan

## Download Manifesto Data
- https://manifesto-project.wzb.eu/information/documents/api

In [None]:
# For Google Colab
# !pip install dotenv langdetect deep_translator googletrans==4.0.0-rc1

In [2]:
import dotenv
import os
import sys
import pandas as pd

In [None]:
sys.path.append("../")
from src.data.download_manifesto import DownloadManifesto

dotenv.load_dotenv()
dataset_key = "MPDS2024a"
version = '2024-1'
api_key = os.getenv("MANIFESTO_API")
downloader =  DownloadManifesto(dataset_key, version, api_key)

In [None]:
countries = ['United States', 'Japan']

In [None]:
from tqdm import tqdm

df_all_countries = []
for country in tqdm(countries):
    print(country)
    result = downloader.get_country_data(country)
    df, metadata = downloader.get_metadata(result)
    df_country = downloader.get_texts(df)  # Get texts
    df_all_countries.append(df_country)

In [None]:
df_all = pd.concat(df_all_countries, axis='rows')
df_all = df_all.rename(lambda x: pd.to_datetime(x, format = "%Y%m"), axis=0, level=1) # convert date to date time
df_all = df_all.reset_index()
df_all.head(2)

In [None]:
file_name = "../data/processed/manifesto_us_japan.parquet"
df_all.to_parquet(file_name)

## Import generated data

In [4]:
df_all = pd.read_parquet("../data/processed/manifesto_us_japan.parquet")

## Count words across time

### Count by country (only for English speaking countries)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vocab_list = ['ghg', 'greenhouse', 'net-zero', 'carbon']
vectorizer = CountVectorizer(
    stop_words = 'english',     # Remove stop words. Can be a list of stop words or a string from {'english', 'spanish'}.
    lowercase = True,           # Convert text to lowercase.
    ngram_range = (1,1),
    vocabulary = vocab_list

)
counts = vectorizer.fit_transform(df_all['text']).toarray().sum(axis=0)
word_freq = dict(zip(vectorizer.get_feature_names_out(), counts))
word_freq

{'ghg': 0, 'greenhouse': 23, 'net-zero': 0, 'carbon': 42}

### Count by year and date

In [None]:
# add year column
df_all = df_all.reset_index().assign(year = lambda column: column['date'].dt.year)
df_all.head(2)

Unnamed: 0,index,countryname,date,party,partyname,keys,manifesto_id,text,year
0,0,United States,1960-11-01,61320,Democratic Party,61320_196011,61320_196011,"In 1796, in America's first contested national...",1960
1,1,United States,1960-11-01,61620,Republican Party,61620_196011,61620_196011,PREAMBLE The United States is living in an age...,1960


In [None]:
word_freq_by_country = {}
for country_name, country_df in df_all.groupby(['countryname', 'year']):
    vectorizer = CountVectorizer(
        stop_words='english',
        lowercase=True,
        ngram_range=(1, 1),
        vocabulary=vocab_list
    )
    counts = vectorizer.fit_transform(country_df['text']).toarray().sum(axis=0)
    word_freq_by_country[country_name] = dict(zip(vectorizer.get_feature_names_out(), counts))
df_timeseries = pd.DataFrame(word_freq_by_country).transpose()
df_timeseries.head()

Unnamed: 0,Unnamed: 1,ghg,greenhouse,net-zero,carbon
Japan,2014,0,0,0,0
Japan,2017,0,0,0,0
United States,1960,0,0,0,0
United States,1964,0,0,0,0
United States,1968,0,0,0,0


In [None]:
df_long = df_timeseries.stack().to_frame('count').reset_index()
df_long.head(3)

Unnamed: 0,level_0,level_1,level_2,count
0,Japan,2014,ghg,0
1,Japan,2014,greenhouse,0
2,Japan,2014,net-zero,0


In [None]:
df_long = df_long.rename({"level_0": 'country', 'level_1':'year', 'level_2': 'vocab'}, axis='columns')
df_long.head(2)

Unnamed: 0,country,year,vocab,count
0,Japan,2014,ghg,0
1,Japan,2014,greenhouse,0


#### Export

In [None]:
df_long.to_csv("../data/processed/manifesto_us_japan_word_freq.csv", index=False)

## Count with LLM with translation of Japanese
It needs CUDA to run, running this with CPU might be slow

In [None]:
import pandas as pd
import re
from langdetect import detect
from deep_translator import GoogleTranslator
from transformers import pipeline
from datasets import Dataset
from googletrans import Translator
from tqdm import tqdm


# Model loading for GPU usage
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0)
labels = ["climate-related"]

# Translation function using Google Translator by sentence

def translate_japanese_to_english(text):
    translator = Translator()
    sentences = re.split(r'(?<=[。！？])', text)
    translated_sentences = []

    for sentence in sentences:
       sentence = sentence.strip()
       if sentence:
          try:
              translated = translator.translate(sentence, src='ja', dest='en')
              if translated and hasattr(translated, 'text') and translated.text:
                    translated_sentences.append(translated.text)
              else:
                  print(f"翻訳失敗: {sentence}")
          except Exception as e:
              print(f"翻訳エラー: {e}（文: {sentence}）")
    return ' '.join(translated_sentences)

# Multilingual classification function for batch processing
def multilang_classify_climate_sentences_batch(batch):
    texts = batch["text"]
    eng_texts = []
    valid_indices = []

    # Translate each text
    for i, text in enumerate(texts):
        lang = detect(text)
        if lang == 'ja':
            translated = translate_japanese_to_english(text)
            if translated.strip():
                eng_texts.append(translated)
                valid_indices.append(i)
            else:
                eng_texts.append("")  # Add empty string if translation fails
        else:
            if text.strip():
                eng_texts.append(text)
                valid_indices.append(i)
            else:
                eng_texts.append("")  # Add empty string if translation fails

    # Classify the translated texts
    scores = [0.0] * len(texts)
    if any(t.strip() for t in eng_texts):
        non_empty_texts = [t if t.strip() else "empty" for t in eng_texts]
        results = classifier(non_empty_texts, candidate_labels=labels)
        for i, r in enumerate(results):
            scores[i] = r["scores"][0]

    return {"eng_text": eng_texts, "score": scores}


# df_all = pd.DataFrame({'text': [...text in Japanese or English...]})

# DataFrame -> Hugging Face Dataset
dataset = Dataset.from_pandas(df_all)

# Batch processing for translation and multilingual classification
dataset = dataset.map(multilang_classify_climate_sentences_batch, batched=True, batch_size=16)

# Dataset -> DataFrame (reverting back to pandas DataFrame)
df_all_result = dataset.to_pandas()

# Add columns to original DataFrame
df_all["eng_text"] = df_all_result["eng_text"]
df_all["score"] = df_all_result["score"]

df_all.head()


In [6]:
df_all.to_csv("../data/processed/manifesto_us_japan_related_score.csv")