# Manifesto data retrieval
For US, Japan

## Download Manifesto Data
- https://manifesto-project.wzb.eu/information/documents/api

In [5]:
# For Google Colab
!pip install dotenv langdetect deep_translator googletrans==4.0.0-rc1

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.

In [6]:
import dotenv
import os
import sys
import pandas as pd

sys.path.append("../")
from src.data.download_manifesto import DownloadManifesto

dotenv.load_dotenv()
dataset_key = "MPDS2024a"
version = '2024-1'
api_key = os.getenv("MANIFESTO_API")
downloader =  DownloadManifesto(dataset_key, version, api_key)

ModuleNotFoundError: No module named 'src'

In [None]:
countries = ['United States', 'Japan']

In [None]:
from tqdm import tqdm

df_all_countries = []
for country in tqdm(countries):
    print(country)
    result = downloader.get_country_data(country)
    df, metadata = downloader.get_metadata(result)
    df_country = downloader.get_texts(df)  # Get texts
    df_all_countries.append(df_country)

In [None]:
df_all = pd.concat(df_all_countries, axis='rows')
df_all = df_all.rename(lambda x: pd.to_datetime(x, format = "%Y%m"), axis=0, level=1) # convert date to date time
df_all = df_all.reset_index()
df_all.head(2)

In [None]:
file_name = "../data/processed/manifesto_us_japan.parquet"
df_all.to_parquet(file_name)

## Import generated data

In [8]:
df_all = pd.read_parquet("../data/processed/manifesto_us_japan.parquet")

## Count words across time

### Count by country (only for English speaking countries)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vocab_list = ['ghg', 'greenhouse', 'net-zero', 'carbon']
vectorizer = CountVectorizer(
    stop_words = 'english',     # Remove stop words. Can be a list of stop words or a string from {'english', 'spanish'}.
    lowercase = True,           # Convert text to lowercase.
    ngram_range = (1,1),
    vocabulary = vocab_list

)
counts = vectorizer.fit_transform(df_all['text']).toarray().sum(axis=0)
word_freq = dict(zip(vectorizer.get_feature_names_out(), counts))
word_freq

{'ghg': 0, 'greenhouse': 23, 'net-zero': 0, 'carbon': 42}

### Count by year and date

In [None]:
# add year column
df_all = df_all.reset_index().assign(year = lambda column: column['date'].dt.year)
df_all.head(2)

Unnamed: 0,index,countryname,date,party,partyname,keys,manifesto_id,text,year
0,0,United States,1960-11-01,61320,Democratic Party,61320_196011,61320_196011,"In 1796, in America's first contested national...",1960
1,1,United States,1960-11-01,61620,Republican Party,61620_196011,61620_196011,PREAMBLE The United States is living in an age...,1960


In [None]:
word_freq_by_country = {}
for country_name, country_df in df_all.groupby(['countryname', 'year']):
    vectorizer = CountVectorizer(
        stop_words='english',
        lowercase=True,
        ngram_range=(1, 1),
        vocabulary=vocab_list
    )
    counts = vectorizer.fit_transform(country_df['text']).toarray().sum(axis=0)
    word_freq_by_country[country_name] = dict(zip(vectorizer.get_feature_names_out(), counts))
df_timeseries = pd.DataFrame(word_freq_by_country).transpose()
df_timeseries.head()

Unnamed: 0,Unnamed: 1,ghg,greenhouse,net-zero,carbon
Japan,2014,0,0,0,0
Japan,2017,0,0,0,0
United States,1960,0,0,0,0
United States,1964,0,0,0,0
United States,1968,0,0,0,0


In [None]:
df_long = df_timeseries.stack().to_frame('count').reset_index()
df_long.head(3)

Unnamed: 0,level_0,level_1,level_2,count
0,Japan,2014,ghg,0
1,Japan,2014,greenhouse,0
2,Japan,2014,net-zero,0


In [None]:
df_long = df_long.rename({"level_0": 'country', 'level_1':'year', 'level_2': 'vocab'}, axis='columns')
df_long.head(2)

Unnamed: 0,country,year,vocab,count
0,Japan,2014,ghg,0
1,Japan,2014,greenhouse,0


#### Export

In [None]:
df_long.to_csv("../data/processed/manifesto_us_japan_word_freq.csv", index=False)

## Count with LLM with translation of Japanese
It needs CUDA to run, running this with CPU might be slow

In [7]:
from transformers import pipeline
from langdetect import detect
from deep_translator import GoogleTranslator
import re


def translate_japanese_to_english(text, max_chunk_size=500):
    # 文単位で分割（句点「。」「！」「？」などで）
    sentences = re.split(r'(?<=[。！？])', text)

    translated_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence:
            try:
                translated = GoogleTranslator(source='ja', target='en').translate(sentence)
                translated_sentences.append(translated)
            except Exception as e:
                print(f"翻訳エラー: {e}（文: {sentence}）")

    return ' '.join(translated_sentences)

from transformers import pipeline
import re

def classify_climate_sentences(text):
    # Load zero-shot classification pipeline
    classifier = pipeline("zero-shot-classification",
                          model="facebook/bart-large-mnli",
                          device=0)

    labels = ["climate-related"]

    # Devide text into sentences
    sentences = re.split(r'(?<=[.!?]) +', text)

    total_score = 0
    count = 0

    for sentence in sentences:
        result = classifier(sentence, candidate_labels=labels)
        total_score += result["scores"][0]
        count += 1

    # Calculate average score
    average_score = total_score / count if count > 0 else 0
    return average_score


def multilang_classify_climate_sentences(text):
    # Detect language
    lang = detect(text)

    # Translate to English if the text is in Japanese
    if lang == 'ja':
        txt = translate_japanese_to_english(text, max_chunk_size=500)
    else:
        txt = text

    return txt, classify_climate_sentences(txt)

In [9]:
from tqdm import tqdm

tqdm.pandas()
df_all['eng_text'], df_all['score'] = df_all['text'].progress_apply(multilang_classify_climate_sentences)

df_all.head()

  0%|          | 0/41 [00:00<?, ?it/s]Device set to use cuda:0
  5%|▍         | 2/41 [00:21<07:03, 10.86s/it]Device set to use cuda:0
  7%|▋         | 3/41 [00:32<06:58, 11.00s/it]Device set to use cuda:0
 10%|▉         | 4/41 [00:39<05:42,  9.26s/it]Device set to use cuda:0
 12%|█▏        | 5/41 [00:47<05:16,  8.80s/it]Device set to use cuda:0
 15%|█▍        | 6/41 [01:11<08:04, 13.85s/it]Device set to use cuda:0
 17%|█▋        | 7/41 [01:21<07:15, 12.80s/it]Device set to use cuda:0
 20%|█▉        | 8/41 [01:48<09:30, 17.29s/it]Device set to use cuda:0
 22%|██▏       | 9/41 [02:13<10:26, 19.58s/it]Device set to use cuda:0
 24%|██▍       | 10/41 [02:33<10:07, 19.61s/it]Device set to use cuda:0
 27%|██▋       | 11/41 [02:53<09:55, 19.87s/it]Device set to use cuda:0
 29%|██▉       | 12/41 [03:30<12:06, 25.06s/it]Device set to use cuda:0
 32%|███▏      | 13/41 [04:03<12:51, 27.55s/it]Device set to use cuda:0
 34%|███▍      | 14/41 [04:46<14:23, 31.97s/it]Device set to use cuda:0
 37%|███▋

KeyboardInterrupt: 

In [9]:
import pandas as pd
import re
from langdetect import detect
from deep_translator import GoogleTranslator
from transformers import pipeline
from datasets import Dataset
from googletrans import Translator
from tqdm import tqdm


# GPUでモデルをロード（1回だけ）
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0)
labels = ["climate-related"]

# 翻訳関数（文単位で分割して翻訳）

def translate_japanese_to_english(text):
    translator = Translator()
    sentences = re.split(r'(?<=[。！？])', text)
    translated_sentences = []

    for sentence in sentences:
       sentence = sentence.strip()
       if sentence:
          try:
              translated = translator.translate(sentence, src='ja', dest='en')
              if translated and hasattr(translated, 'text') and translated.text:
                    translated_sentences.append(translated.text)
              else:
                  print(f"翻訳失敗: {sentence}")
          except Exception as e:
              print(f"翻訳エラー: {e}（文: {sentence}）")
    return ' '.join(translated_sentences)

# 多言語対応分類関数（翻訳＋分類）
def multilang_classify_climate_sentences_batch(batch):
    texts = batch["text"]
    eng_texts = []
    valid_indices = []

    # 翻訳とフィルタリング
    for i, text in enumerate(texts):
        lang = detect(text)
        if lang == 'ja':
            translated = translate_japanese_to_english(text)
            if translated.strip():
                eng_texts.append(translated)
                valid_indices.append(i)
            else:
                eng_texts.append("")  # 空でも追加
        else:
            if text.strip():
                eng_texts.append(text)
                valid_indices.append(i)
            else:
                eng_texts.append("")  # 空でも追加

    # 分類（空文字列は除外）
    scores = [0.0] * len(texts)
    if any(t.strip() for t in eng_texts):
        non_empty_texts = [t if t.strip() else "empty" for t in eng_texts]
        results = classifier(non_empty_texts, candidate_labels=labels)
        for i, r in enumerate(results):
            scores[i] = r["scores"][0]

    return {"eng_text": eng_texts, "score": scores}


# DataFrameの例（df_all に 'text' カラムがある前提）
# df_all = pd.DataFrame({'text': [...日本語や英語の文章...]})

# DataFrame → Hugging Face Dataset に変換
dataset = Dataset.from_pandas(df_all)

# バッチ処理で翻訳＋分類（GPU活用）
dataset = dataset.map(multilang_classify_climate_sentences_batch, batched=True, batch_size=16)

# Dataset → DataFrame に戻す
df_all_result = dataset.to_pandas()

# 結果を元の DataFrame に統合
df_all["eng_text"] = df_all_result["eng_text"]
df_all["score"] = df_all_result["score"]

# 結果表示
df_all.head()


Device set to use cuda:0


Map:   0%|          | 0/41 [00:00<?, ? examples/s]

翻訳エラー: the JSON object must be str, bytes or bytearray, not NoneType（文: 株式会社の参入促進をはじめとする医療法人、社会福祉法人の制度改革。）
翻訳エラー: The read operation timed out（文: 原発の稼働がなくとも 日本経済は成り立ちます。）
翻訳エラー: the JSON object must be str, bytes or bytearray, not NoneType（文: 現行憲法は、日本の民主主義を進展させ、戦後秩序の基本となりました。）
翻訳エラー: the JSON object must be str, bytes or bytearray, not NoneType（文: 北朝鮮の脅威から、 国民を守り抜きます わが国の上空を飛び越える弾道ミサイルの相次ぐ発射、核実験の強行など、 北朝鮮による挑発行為はエスカレートし、重大かつ差し迫った脅威となっています このような時こそ、世界をリードできる、経験豊かで安定した政権が必要です わが党は平和に向けた外交努力を続け、断固、国民を守り抜きます 北朝鮮に対する国際社会による圧力強化を主導し、完全で検証可能かつ不可逆的な方法で すべての核・弾道ミサイル計画を放棄させることを目指すとともに、拉致問題の解決に全力を尽くします 日米同盟をより一層強固にすることで、わが国の抑止力を高めます ミサイル対処能力の強化をはじめ、国民保護を最優先に対応し、国民の生命と財産を守り抜きます 世界の中心で、動かす外交 アベノミクスの加速で、景気回復・デフレ脱 却を実現します 全力を傾注したアベノミクスの5年間 いま、多くの指標が示す通り、わが国の経済は確実に回復しています この流れを確かなものにするため、「生産性革命」と「人づくり革命」の2つの大改革を断行することによって、 力強い消費を実現し、経済の好循環を完遂します アベノミクス5年間の実績名目GDP 過去最高 50兆円増加493 兆円(2012 年 10-12 月期)➡ 543 兆円(2017 年 4-6 月期) 就業者数 185万人増加6,271 万人(2012 年)➡ 6,456 万人(2016 年)  正社員有

ArrowInvalid: Column 7 named eng_text expected length 9 but got length 8

In [None]:
df_all.to_csv("../data/processed/manifesto_us_japan_related_score.csv")