# Manifesto data retrieval
For US, Japan

## Download Manifesto Data
- https://manifesto-project.wzb.eu/information/documents/api

In [3]:
import dotenv
import os
import sys
import pandas as pd

sys.path.append("../")
from src.data.download_manifesto import DownloadManifesto

dotenv.load_dotenv()
dataset_key = "MPDS2024a"
version = '2024-1'
api_key = os.getenv("MANIFESTO_API")
downloader =  DownloadManifesto(dataset_key, version, api_key)

In [None]:
countries = ['United States', 'Japan']

In [None]:
from tqdm import tqdm

df_all_countries = []
for country in tqdm(countries):
    print(country)
    result = downloader.get_country_data(country)
    df, metadata = downloader.get_metadata(result)
    df_country = downloader.get_texts(df)  # Get texts
    df_all_countries.append(df_country)

In [None]:
df_all = pd.concat(df_all_countries, axis='rows')
df_all = df_all.rename(lambda x: pd.to_datetime(x, format = "%Y%m"), axis=0, level=1) # convert date to date time
df_all = df_all.reset_index()
df_all.head(2)

In [None]:
file_name = "../data/processed/manifesto_us_japan.parquet"
df_all.to_parquet(file_name)

## Import generated data

In [38]:
df_all = pd.read_parquet("../data/processed/manifesto_us_japan.parquet")

## To do: Translate Japanese manifesto into English

In [5]:
df_all[df_all["countryname"] == "Japan"]["text"]

32    安倍政権の暴走ストップ！ 国民の声が生きる新しい政治を 日本共産党の総選挙政策 日本共産党 ...
33    約束１　アベノミクスによる生活破壊を許さず、拡大した格差を是正します （１）景気を悪化させる...
34    日本共産党の総選挙政策 日本共産党 安倍首相は、臨時国会の冒頭解散に打って出ました。 「森友...
35    くらし支えます 1家計を温めボトムアップの経済政策でくらしの再建 憲法１３条の幸福追求権、２...
36    消費増税凍結! 維新ならできる! 増税なしで改革実現! 身を切る改革で財源を生み出し、議員報...
37    1 生活の現場から暮らしを立て直します アベノミクスの成果は上がらず、国民の所得を削り、中間...
38    教育負担の軽減へ。 衆院選で公明党は、「教育負担の軽減へ。」を掲げます。 国づくりの基...
39    北朝鮮の脅威から、 国民を守り抜きます わが国の上空を飛び越える弾道ミサイルの相次ぐ発...
40    私たちが希求するものは、党の利益ではなく、議員の利益でもなく、 国民のため、つまり国民...
Name: text, dtype: object

## Need to fix this code

In [None]:
""" # Translation of Japanese text to English using a pre-trained model
import transformers
from transformers import EncoderDecoderModel
import torch

encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
decoder_model_name = "openai-community/gpt2"
src_tokenizer = transformers.BertJapaneseTokenizer.from_pretrained(encoder_model_name)
trg_tokenizer = transformers.PreTrainedTokenizerFast.from_pretrained(decoder_model_name)

EncoderDecoderModel.from_pretrained(
    "sappho192/jesc-ja-en-translator").save_pretrained("../src/model/jesc-ja-en-translator")
model = EncoderDecoderModel.from_pretrained("../src/model/jesc-ja-en-translator").to("cpu")

MAX_TOKENS = 480

def chunk_text(text, tokenizer, max_tokens):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i + max_tokens]
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        chunks.append(chunk_text)
    return chunks

def translate(text_src):
    chunks = chunk_text(text_src, src_tokenizer, MAX_TOKENS)
    translated_chunks = []
    for chunk in chunks:
        embeddings = src_tokenizer(chunk, return_attention_mask=False, return_token_type_ids=False, return_tensors='pt')
        output = model.generate(**embeddings, max_length=512)[0, 1:-1]
        text_trg = trg_tokenizer.decode(output.cpu(), skip_special_tokens=True)
        translated_chunks.append(text_trg)
    return " ".join(translated_chunks)

# Only Japan data will be processed
tqdm.pandas()
df_all.loc[df_all["countryname"] == "Japan", "text_eng"] = \
    df_all.loc[df_all["countryname"] == "Japan", "text"].progress_apply(translate)

df_all.head()
"""

## Count words across time

### Count by country

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vocab_list = ['ghg', 'greenhouse', 'net-zero', 'carbon']
vectorizer = CountVectorizer(
    stop_words = 'english',     # Remove stop words. Can be a list of stop words or a string from {'english', 'spanish'}.
    lowercase = True,           # Convert text to lowercase.
    ngram_range = (1,1),
    vocabulary = vocab_list

)
counts = vectorizer.fit_transform(df_all['text']).toarray().sum(axis=0)
word_freq = dict(zip(vectorizer.get_feature_names_out(), counts))
word_freq

{'ghg': 0, 'greenhouse': 23, 'net-zero': 0, 'carbon': 42}

### Count by year and date

In [7]:
# add year column
df_all = df_all.reset_index().assign(year = lambda column: column['date'].dt.year)
df_all.head(2)

Unnamed: 0,index,countryname,date,party,partyname,keys,manifesto_id,text,year
0,0,United States,1960-11-01,61320,Democratic Party,61320_196011,61320_196011,"In 1796, in America's first contested national...",1960
1,1,United States,1960-11-01,61620,Republican Party,61620_196011,61620_196011,PREAMBLE The United States is living in an age...,1960


In [9]:
word_freq_by_country = {}
for country_name, country_df in df_all.groupby(['countryname', 'year']):
    vectorizer = CountVectorizer(
        stop_words='english',
        lowercase=True,
        ngram_range=(1, 1),
        vocabulary=vocab_list
    )
    counts = vectorizer.fit_transform(country_df['text']).toarray().sum(axis=0)
    word_freq_by_country[country_name] = dict(zip(vectorizer.get_feature_names_out(), counts))
df_timeseries = pd.DataFrame(word_freq_by_country).transpose()
df_timeseries.head()

Unnamed: 0,Unnamed: 1,ghg,greenhouse,net-zero,carbon
Japan,2014,0,0,0,0
Japan,2017,0,0,0,0
United States,1960,0,0,0,0
United States,1964,0,0,0,0
United States,1968,0,0,0,0


In [10]:
df_long = df_timeseries.stack().to_frame('count').reset_index()
df_long.head(3)

Unnamed: 0,level_0,level_1,level_2,count
0,Japan,2014,ghg,0
1,Japan,2014,greenhouse,0
2,Japan,2014,net-zero,0


In [11]:
df_long = df_long.rename({"level_0": 'country', 'level_1':'year', 'level_2': 'vocab'}, axis='columns')
df_long.head(2)

Unnamed: 0,country,year,vocab,count
0,Japan,2014,ghg,0
1,Japan,2014,greenhouse,0


#### Export

In [12]:
df_long.to_csv("../data/processed/manifesto_us_japan_word_freq.csv", index=False)

## Count with LLM

In [None]:
from transformers import pipeline
from langdetect import detect
from deep_translator import GoogleTranslator
import re


def translate_japanese_to_english(text, max_chunk_size=500):
    # 文単位で分割（句点「。」「！」「？」などで）
    sentences = re.split(r'(?<=[。！？])', text)

    translated_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence:
            try:
                translated = GoogleTranslator(source='ja', target='en').translate(sentence)
                translated_sentences.append(translated)
            except Exception as e:
                print(f"翻訳エラー: {e}（文: {sentence}）")

    return ' '.join(translated_sentences)

from transformers import pipeline
import re

def classify_climate_sentences(text):
    # Load zero-shot classification pipeline
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

    labels = ["climate-related"]

    # Devide text into sentences
    sentences = re.split(r'(?<=[.!?]) +', text)

    total_score = 0
    count = 0

    for sentence in sentences:
        result = classifier(sentence, candidate_labels=labels)
        total_score += result["scores"][0]
        count += 1

    # Calculate average score
    average_score = total_score / count if count > 0 else 0
    return average_score


def multilang_classify_climate_sentences(text):
    # Detect language
    lang = detect(text)

    # Translate to English if the text is in Japanese
    if lang == 'ja':
        txt = translate_japanese_to_english(text, max_chunk_size=500)
    else:
        txt = text

    return classify_climate_sentences(txt)

In [None]:
import swifter

df_all['score'] = df_all['text'].swifter.apply(
    multilang_classify_climate_sentences)

df_all.head()

  0%|          | 0/41 [00:00<?, ?it/s]Python(19768) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19769) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19770) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19771) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19772) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19773) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19774) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19775) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19776) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be ava

RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

In [120]:
df_all

Unnamed: 0,countryname,date,party,partyname,keys,manifesto_id,text
0,United States,1960-11-01,61320,Democratic Party,61320_196011,61320_196011,"In 1796, in America's first contested national..."
1,United States,1960-11-01,61620,Republican Party,61620_196011,61620_196011,PREAMBLE The United States is living in an age...
2,United States,1964-11-01,61320,Democratic Party,61320_196411,61320_196411,"ONE NATION, ONE PEOPLE. America is One Nation,..."
3,United States,1964-11-01,61620,Republican Party,61620_196411,61620_196411,"""FOR THE PEOPLE"" SECTION ONE For a Free People..."
4,United States,1968-11-01,61320,Democratic Party,61320_196811,61320_196811,THE TERMS OF OUR DUTY America belongs to the p...
5,United States,1968-11-01,61620,Republican Party,61620_196811,61620_196811,"PREAMBLE, PURPOSES AND PLEDGES Twice before, o..."
6,United States,1972-11-01,61320,Democratic Party,61320_197211,61320_197211,New Directions: '72-'76 Skepticism and cynicis...
7,United States,1972-11-01,61620,Republican Party,61620_197211,61620_197211,PREAMBLE This year our Republican Party has gr...
8,United States,1976-11-01,61320,Democratic Party,61320_197611,61320_197611,PREAMBLE We meet to adopt a Democratic platfor...
9,United States,1976-11-01,61620,Republican Party,61620_197611,61620_197611,"Preamble To you, an American citizen: You are ..."
