# NDC data processing and word counting

## Import data 

In [4]:
import PyPDF2
import pandas as pd

pdf_path = {"EU": "../data/raw/ES-2023-10-17 EU submission NDC update.pdf",
            "US": "../data/raw/United States 2035 NDC.pdf"}

dfs = []

for key, path in pdf_path.items():
    with open(path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text_list = [page.extract_text().replace('\n', '') \
                    if page.extract_text() else '' for page in reader.pages]
    df = pd.DataFrame({'country': key, 'text': text_list})
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
df.head()

Unnamed: 0,country,text
0,EU,1 Update of the NDC of the European Unio...
1,EU,2 Update of the NDC of the European Unio...
2,EU,3 Update of the NDC of the European Unio...
3,EU,4 Update of the NDC of the European Unio...
4,EU,5 Update of the NDC of the European Unio...


## Word count

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vocab_list = ['ghg', 'greenhouse', 'net-zero', 'carbon']
vectorizer = CountVectorizer(
    stop_words='english',     # Remove stop words. Can be a list of stop words or a string from {'english', 'spanish'}.
    lowercase=True,           # Convert text to lowercase.
    ngram_range=(1, 1),
    vocabulary=vocab_list
)

dfs_count = []

for country in df['country'].unique():
    counts = vectorizer.fit_transform(df.loc[df["country"] == country, 'text']).toarray().sum(axis=0)
    word_freq = dict(zip(vectorizer.get_feature_names_out(), counts))
    row = word_freq.copy()
    row['country'] = country
    df_count = pd.DataFrame([row])
    dfs_count.append(df_count)

df_count = pd.concat(dfs_count, ignore_index=True)
df_count.head()

Unnamed: 0,ghg,greenhouse,net-zero,carbon,country
0,7,29,0,21,EU
1,3,58,0,33,US


## Export results

In [9]:
df_count.to_csv('../data/processed/ndc_word_counts.csv', index=False)