In [52]:
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk


In [53]:
# Load the sentence table produced by Tom's script, casting text_content to str
# in the same step so the later string checks can index into every cell.
data = pd.read_csv("../Output/20240416_02_NDC_sentences.csv").astype({'text_content': str})

# Topic/keyword table; its "Keyword" column is stemmed further down.
keywords = pd.read_csv("../Data/keywords_for_marine_policy.csv")
keywords

Unnamed: 0,Topic,Keyword
0,climate change,global warming
1,climate change,ocean warming
2,climate change,acidification
3,climate change,sea-level rise
4,climate change,deoxygenation
...,...,...
85,policy,small island developing nations
86,policy,SIDS
87,policy,jurisdiction
88,policy,enforcement


In [54]:
# Row/column count of the sentence table before any filtering.
data.shape

(186837, 11)

In [55]:
# Stem every keyword once and de-duplicate the results. Each keyword is passed
# to the stemmer as a single string; multi-word phrases are not tokenized first.
stemmer = PorterStemmer()
stemmed_keywords = {stemmer.stem(term) for term in keywords["Keyword"]}
stemmed_keywords

{'acidif',
 'alga',
 'algal bloom',
 'antarct',
 'aquacultur',
 'arctic',
 'bentho',
 'blue carbon',
 'bycatch',
 'coast',
 'coastal develop',
 'commercial ship',
 'conserv',
 'contamin',
 'coral',
 'deep ocean',
 'deoxygen',
 'disaster relief',
 'ecotour',
 'endangered speci',
 'enforc',
 'estuari',
 'eutroph',
 'exploit',
 'fertil',
 'fish',
 'frack',
 'fund',
 'global warm',
 'habitat loss',
 'heavy met',
 'hurrican',
 'hypoxia',
 'ice melt',
 'international wat',
 'jurisdict',
 'justic',
 'kelp forest',
 'law',
 'long-line fisheri',
 'mangrov',
 'marin',
 'marine mamm',
 'marine protected area',
 'marine research',
 'maritim',
 'marsh',
 'microplast',
 'natural disast',
 'ocean',
 'ocean warm',
 'offshor',
 'offshore wind',
 'oil',
 'oil dril',
 'oil spil',
 'overfish',
 'oxygen',
 'pelag',
 'persistant organic pollut',
 'plankton',
 'plastic',
 'protected speci',
 'reef',
 'resource manag',
 'sanit',
 'sargassum',
 'sea',
 'sea-level',
 'sea-level ris',
 'seabird',
 'seafood indus

# Issue
The output from Tom's script is good, but there are a couple of data quality issues that need to be addressed:
* Some sentences are **split** across multiple cells when they should not be
* Some text_content is literally just punctuation and numbers

# Solution
* Combine the cells if possible
* Remove a row if its text content is 5 characters or fewer and does not contain a keyword

In [56]:
def contains_stemmed_word(text, keywords=None, min_length=5):
    """Decide whether a ``text_content`` value should be kept.

    Text longer than ``min_length`` characters is always kept. Shorter text is
    kept only when it contains one of the keyword stems as a substring.

    The match is case-insensitive: the stems in ``stemmed_keywords`` are lower
    case, so a case-sensitive check would discard short cells such as "Sea".

    Input:
    * text: the string to test
    * keywords: iterable of keyword stems; defaults to the notebook-level
      ``stemmed_keywords`` set when omitted
    * min_length: text with more than this many characters is kept without
      consulting the keywords

    Output:
    * True if the row should be kept, False if it should be dropped
    """
    if len(text) > min_length:
        return True
    if keywords is None:
        # Resolved at call time so the set may be (re)built after this
        # function is defined.
        keywords = stemmed_keywords
    lowered = text.lower()
    return any(stem.lower() in lowered for stem in keywords)

In [57]:
# Keep a row when its text is longer than 5 characters or contains a keyword
# stem, then renumber the index from 0. This removed approximately 20k rows.
data = data[data['text_content'].map(contains_stemmed_word)].reset_index(drop=True)

In [58]:
# Row/column count after the short-text filter.
data.shape

(165102, 11)

In [59]:
"""
Description: used to combine cells on a dataframe 
Input:
* df: Dataframe object
* rows: list of indices of the rows that should be combined

Output:
* new_row: a Series containing the first row's metadata columns and the combined text_content from the rows.
"""
def cell_merger(df, rows):
    """Build one merged row from several rows of ``df``.

    Input:
    * df: DataFrame holding the metadata columns listed below and 'text_content'
    * rows: list of index labels of the rows to combine, in reading order

    Output:
    * a Series with the metadata columns of the first listed row plus a
      'text_content' entry holding the rows' text joined by single spaces
    """
    metadata_columns = [
        'iso', 'country', 'ndc', 'date', 'html', 'text_type',
        'h1_text', 'h2_text', 'h3_text', 'h4_text',
    ]
    # Metadata comes from the first row of the group only.
    merged = df.loc[rows[0], metadata_columns]
    merged['text_content'] = " ".join(df.loc[label, 'text_content'] for label in rows)
    return merged

In [60]:
"""
Description: Create a list of lists that record the cells that should be combined to make complete sentences in text_content
"""
def incomplete_cells(df):
    """Find runs of rows whose ``text_content`` should be merged into one sentence.

    A row counts as the continuation of a sentence when its text starts with a
    letter that is not upper case and its second character is not "." or ")"
    (presumably so that list markers such as "a)" or "b." are left alone).
    The row immediately before the first continuation is included in the group,
    because it holds the start of the sentence (e.g. "Sustainable technology is"
    followed by "very vital"). A group is closed by the next row that starts
    with an upper-case letter. Rows that are empty or start with a non-letter
    are skipped and do not close an open group.

    Input:
    * df: DataFrame with a string 'text_content' column

    Output:
    * list of lists of index labels; each inner list is one group of rows to
      merge, in row order, and always holds at least two labels
    """
    labels = df.index.tolist()
    texts = df['text_content'].tolist()
    list_markers = {".", ")"}

    groups = []   # finished groups
    current = []  # group being built; non-empty means we are inside a sentence

    for pos, text in enumerate(texts):
        # Empty cells, non-strings and cells starting with a digit/punctuation
        # carry no signal about sentence boundaries.
        if not isinstance(text, str) or not text or not text[0].isalpha():
            continue

        if text[0].isupper():
            # A capitalised cell starts a new sentence, closing any open group.
            if len(current) > 1:
                groups.append(current)
            current = []
            continue

        # Slicing (rather than text[1]) keeps one-character cells from raising.
        if text[1:2] in list_markers:
            continue

        if not current and pos > 0:
            # The preceding row holds the beginning of this sentence. Row 0 has
            # no predecessor, so nothing is prepended there.
            current.append(labels[pos - 1])
        current.append(labels[pos])

    # Data may end in the middle of a sentence; keep that last group too.
    if len(current) > 1:
        groups.append(current)
    return groups
            

In [61]:
def df_cleaner(df):
    """Merge rows whose text was split across several cells.

    Each group reported by ``incomplete_cells`` is collapsed into its first
    row: that row's 'text_content' becomes the combined text from
    ``cell_merger`` and the remaining rows of the group are removed. The input
    frame is not modified, and the index of the result is not renumbered.

    Input:
    * df: DataFrame with the columns ``cell_merger`` expects

    Output:
    * a new DataFrame with the split sentences merged
    """
    df = df.copy()
    # Analyse the frame that was passed in, not the notebook-level `data`.
    groups = incomplete_cells(df)

    absorbed = []  # labels of rows folded into an earlier row
    for group in groups:
        merged = cell_merger(df, group)
        # Only the text changes; the first row's other columns stay as they are.
        df.loc[group[0], 'text_content'] = merged['text_content']
        absorbed.extend(group[1:])

    # Drop once at the end instead of copying the frame on every iteration.
    return df.drop(absorbed)

In [62]:
# Merge the split sentences, then check how many rows remain.
cleaned_df = df_cleaner(data)
cleaned_df.shape

(161031, 11)

We removed approximately 4,000 rows (165,102 → 161,031) by merging cells that had been split apart back into complete sentences.

In [63]:
# df_cleaner drops rows without renumbering, so rebuild a 0..n-1 index before
# previewing the result.
cleaned_df = cleaned_df.reset_index(drop = True)
cleaned_df.head(10)

Unnamed: 0,iso,country,ndc,date,html,text_type,text_content,h1_text,h2_text,h3_text,h4_text
0,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,p,21 September 2015,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,
1,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,p,The Islamic Republic of Afghanistan hereby com...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,
2,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,table,Base Year: 2005 Target Years: 2020 to 2030 Con...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,
3,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,table,Financial Needs: Total: USD 17.405 billionAdap...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,
4,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,ul,Adaptation: USD 10.785 billion Mitigation: USD...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,
5,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,p,Figure 1.,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,
6,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,p,Greenhouse Gas Emissions for Afghanistan showi...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,,
7,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,p,Afghanistan has extensive development and clim...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,1. Afghanistan’s National Circumstances and Co...,Introduction
8,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,p,Afghanistan remains one of the poorest countri...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,1. Afghanistan’s National Circumstances and Co...,Introduction
9,AFG,Afghanistan,1,11/23/16,AFG-first_ndc-EN.html,p,Afghanistan is highly prone to natural disaste...,Intended Nationally Determined Contribution,Submission to the United Nations Framework Con...,1. Afghanistan’s National Circumstances and Co...,Introduction


In [64]:
# Write the quality-checked sentences to disk, omitting the index column.
cleaned_df.to_csv("../Output/20240419_CN_NDC_sentences_qc.csv", index=False)