# Manifesto data retrieval
For US, Japan

## Download Manifesto Data
- https://manifesto-project.wzb.eu/information/documents/api

In [4]:
# For Google Colab
# !pip install dotenv langdetect deep_translator



In [5]:
import dotenv
import os
import sys
import pandas as pd

sys.path.append("../")
from src.data.download_manifesto import DownloadManifesto

dotenv.load_dotenv()
dataset_key = "MPDS2024a"
version = '2024-1'
api_key = os.getenv("MANIFESTO_API")
downloader =  DownloadManifesto(dataset_key, version, api_key)

ModuleNotFoundError: No module named 'src'

In [None]:
countries = ['United States', 'Japan']

In [None]:
from tqdm import tqdm

df_all_countries = []
for country in tqdm(countries):
    print(country)
    result = downloader.get_country_data(country)
    df, metadata = downloader.get_metadata(result)
    df_country = downloader.get_texts(df)  # Get texts
    df_all_countries.append(df_country)

In [None]:
df_all = pd.concat(df_all_countries, axis='rows')
df_all = df_all.rename(lambda x: pd.to_datetime(x, format = "%Y%m"), axis=0, level=1) # convert date to date time
df_all = df_all.reset_index()
df_all.head(2)

In [None]:
file_name = "../data/processed/manifesto_us_japan.parquet"
df_all.to_parquet(file_name)

## Import generated data

In [1]:
df_all = pd.read_parquet("../data/processed/manifesto_us_japan.parquet")

NameError: name 'pd' is not defined

## Count words across time

### Count by country (only for English speaking countries)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vocab_list = ['ghg', 'greenhouse', 'net-zero', 'carbon']
vectorizer = CountVectorizer(
    stop_words = 'english',     # Remove stop words. Can be a list of stop words or a string from {'english', 'spanish'}.
    lowercase = True,           # Convert text to lowercase.
    ngram_range = (1,1),
    vocabulary = vocab_list

)
counts = vectorizer.fit_transform(df_all['text']).toarray().sum(axis=0)
word_freq = dict(zip(vectorizer.get_feature_names_out(), counts))
word_freq

{'ghg': 0, 'greenhouse': 23, 'net-zero': 0, 'carbon': 42}

### Count by year and date

In [None]:
# add year column
df_all = df_all.reset_index().assign(year = lambda column: column['date'].dt.year)
df_all.head(2)

Unnamed: 0,index,countryname,date,party,partyname,keys,manifesto_id,text,year
0,0,United States,1960-11-01,61320,Democratic Party,61320_196011,61320_196011,"In 1796, in America's first contested national...",1960
1,1,United States,1960-11-01,61620,Republican Party,61620_196011,61620_196011,PREAMBLE The United States is living in an age...,1960


In [None]:
word_freq_by_country = {}
for country_name, country_df in df_all.groupby(['countryname', 'year']):
    vectorizer = CountVectorizer(
        stop_words='english',
        lowercase=True,
        ngram_range=(1, 1),
        vocabulary=vocab_list
    )
    counts = vectorizer.fit_transform(country_df['text']).toarray().sum(axis=0)
    word_freq_by_country[country_name] = dict(zip(vectorizer.get_feature_names_out(), counts))
df_timeseries = pd.DataFrame(word_freq_by_country).transpose()
df_timeseries.head()

Unnamed: 0,Unnamed: 1,ghg,greenhouse,net-zero,carbon
Japan,2014,0,0,0,0
Japan,2017,0,0,0,0
United States,1960,0,0,0,0
United States,1964,0,0,0,0
United States,1968,0,0,0,0


In [None]:
df_long = df_timeseries.stack().to_frame('count').reset_index()
df_long.head(3)

Unnamed: 0,level_0,level_1,level_2,count
0,Japan,2014,ghg,0
1,Japan,2014,greenhouse,0
2,Japan,2014,net-zero,0


In [None]:
df_long = df_long.rename({"level_0": 'country', 'level_1':'year', 'level_2': 'vocab'}, axis='columns')
df_long.head(2)

Unnamed: 0,country,year,vocab,count
0,Japan,2014,ghg,0
1,Japan,2014,greenhouse,0


#### Export

In [None]:
df_long.to_csv("../data/processed/manifesto_us_japan_word_freq.csv", index=False)

## Count with LLM with translation of Japanese
It needs CUDA to run, running this with CPU might be slow

In [None]:
from transformers import pipeline
from langdetect import detect
from deep_translator import GoogleTranslator
import re


def translate_japanese_to_english(text, max_chunk_size=500):
    # 文単位で分割（句点「。」「！」「？」などで）
    sentences = re.split(r'(?<=[。！？])', text)

    translated_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence:
            try:
                translated = GoogleTranslator(source='ja', target='en').translate(sentence)
                translated_sentences.append(translated)
            except Exception as e:
                print(f"翻訳エラー: {e}（文: {sentence}）")

    return ' '.join(translated_sentences)

from transformers import pipeline
import re

def classify_climate_sentences(text):
    # Load zero-shot classification pipeline
    classifier = pipeline("zero-shot-classification",
                          model="facebook/bart-large-mnli",
                          device=0)

    labels = ["climate-related"]

    # Devide text into sentences
    sentences = re.split(r'(?<=[.!?]) +', text)

    total_score = 0
    count = 0

    for sentence in sentences:
        result = classifier(sentence, candidate_labels=labels)
        total_score += result["scores"][0]
        count += 1

    # Calculate average score
    average_score = total_score / count if count > 0 else 0
    return average_score


def multilang_classify_climate_sentences(text):
    # Detect language
    lang = detect(text)

    # Translate to English if the text is in Japanese
    if lang == 'ja':
        txt = translate_japanese_to_english(text, max_chunk_size=500)
    else:
        txt = text

    return txt, classify_climate_sentences(txt)

In [None]:
from tqdm import tqdm

df_all['eng_text'], df_all['score'] = df_all['text'].apply(multilang_classify_climate_sentences)

df_all.head()

  0%|          | 0/41 [00:00<?, ?it/s]Device set to use cuda:0
  2%|▏         | 1/41 [00:18<12:11, 18.29s/it]Device set to use cuda:0
  5%|▍         | 2/41 [00:29<09:00, 13.86s/it]Device set to use cuda:0
  7%|▋         | 3/41 [00:34<06:29, 10.24s/it]Device set to use cuda:0
 10%|▉         | 4/41 [00:43<05:50,  9.47s/it]Device set to use cuda:0
 12%|█▏        | 5/41 [00:58<07:00, 11.68s/it]Device set to use cuda:0
 15%|█▍        | 6/41 [01:09<06:35, 11.29s/it]Device set to use cuda:0
 17%|█▋        | 7/41 [01:45<10:59, 19.41s/it]Device set to use cuda:0
 20%|█▉        | 8/41 [02:13<12:06, 22.01s/it]Device set to use cuda:0
 22%|██▏       | 9/41 [02:33<11:27, 21.47s/it]Device set to use cuda:0
 24%|██▍       | 10/41 [02:53<10:51, 21.01s/it]Device set to use cuda:0
 27%|██▋       | 11/41 [03:32<13:15, 26.50s/it]Device set to use cuda:0
 29%|██▉       | 12/41 [04:04<13:39, 28.27s/it]Device set to use cuda:0
 32%|███▏      | 13/41 [04:46<15:07, 32.43s/it]Device set to use cuda:0
 34%|███▍ 

Unnamed: 0,countryname,date,party,partyname,keys,manifesto_id,text,score
0,United States,1960-11-01,61320,Democratic Party,61320_196011,61320_196011,"In 1796, in America's first contested national...",0.068555
1,United States,1960-11-01,61620,Republican Party,61620_196011,61620_196011,PREAMBLE The United States is living in an age...,0.064889
2,United States,1964-11-01,61320,Democratic Party,61320_196411,61320_196411,"ONE NATION, ONE PEOPLE. America is One Nation,...",0.042586
3,United States,1964-11-01,61620,Republican Party,61620_196411,61620_196411,"""FOR THE PEOPLE"" SECTION ONE For a Free People...",0.069937
4,United States,1968-11-01,61320,Democratic Party,61320_196811,61320_196811,THE TERMS OF OUR DUTY America belongs to the p...,0.063394


In [None]:
df_all.to_csv("../data/processed/manifesto_us_japan_related_score.csv")