## Extract GRI paragraphs from report (JSONL)

Pre-processing phase where the outpu is a dataframe with format:
Concept: Paragraph

In [10]:
import pandas as pd
import os

In [None]:
def process(fn):
    
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    try:
        STOPWORDS = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        STOPWORDS = set(stopwords.words('english'))
    
    def lower(text):
        """
        Transforms given text to lower case.
        Example:
        Input: 'I really like New York city'
        Output: 'i really like new your city'
        """
    
        return text.lower()
    
    def replace_special_characters(text):
        """
        Replaces special characters, such as paranthesis,
        with spacing character
        """
    
        return REPLACE_BY_SPACE_RE.sub(' ', text)
    
    def filter_out_uncommon_symbols(text):
        """
        Removes any special character that is not in the
        good symbols list (check regular expression)
        """
    
        return GOOD_SYMBOLS_RE.sub('', text)
    
    def remove_stopwords(text):
        return ' '.join([x for x in text.split() if x and x not in STOPWORDS])
    
    
    def strip_text(text):
        """
        Removes any left or right spacing (including carriage return) from text.
        Example:
        Input: '  This assignment is cool\n'
        Output: 'This assignment is cool'
        """
    
        return text.strip()
    
    PREPROCESSING_PIPELINE = [
                              lower,
                              replace_special_characters,
                              filter_out_uncommon_symbols,
                              remove_stopwords,
                              strip_text
                              ]
        # Anchor method
    
    def text_prepare(text, filter_methods=None):
        """
        Applies a list of pre-processing functions in sequence (reduce).
        Note that the order is important here!
        """
    
        filter_methods = filter_methods if filter_methods is not None else PREPROCESSING_PIPELINE
    
        return reduce(lambda txt, f: f(txt), filter_methods, text)
    

In [11]:
# Read the jsonl file
fn = '2019_sicredi_orig.jsonl'
print(os.getcwd())

df = pd.read_json(fn, lines=True)
df.head()

/Users/nick/Documents/01_Unibo/tese/programs/xtract_pdf


Unnamed: 0,Size,Text
0,38,1
1,38,Sustainability Report 2019
2,10,Sustainability Report 2019 | Sicredi
3,10,2
4,15,"Sicredi believed in my dream.” The member, who..."


In [12]:
# Lets convert the df['Text'] into numpy array to look for our GRI concepts
# concepts from https://standards.sinzer.org/gri/sector#103
org_profile_GRI= {"102-1","102-2","102-3","102-4","102-5","102-6","102-7","102-8","102-9","102-10",
              "102-11","102-12","102-13"}
strategy_GRI={"102-14","102-15"}
ethics_GRI={"102-16","102-17"}
governance_GRI= {"102-18","102-19","102-20","102-21","102-22","102-23","102-24","102-25","102-26","102-27",
              "102-28","102-29","102-30","102-31","102-32","102-33","102-34","102-35","102-36","102-37",
                "102-38","102-39"}
stakeholder_GRI= {"102-40","102-41","102-42","102-43","102-44"}
reporting_GRI= {"102-45","102-46","102-47","102-48","102-49","102-50","102-51","102-52","102-53","102-54","102-55","102-56"}
management_GRI= {"103-1","103-2","103-3"}
# 200
economic_GRI={"201-1","201-2","201-3","201-4"}
market_GRI={"202-1","202-2"}
indirect_economic_GRI={"203-1","203-2"}
procurement_GRI={"204-1"}
anti_corruption_GRI={"205-1","205-2","205-3"}
anti_comptetive_GRI={"206-1"}
# 300
materials_GRI={"301-1","301-2","301-3"}
energy_GRI={"302-1","302-2","302-3","302-4","302-5"}
water_GRI={"303-1","303-2","303-3"}
bio_diversity_GRI={"304-1","304-2","304-3","304-4"}
emissions_GRI={"305-1","305-2","305-3","305-4","305-5","305-6","305-7"}
waste_GRI={"306-1","306-2","306-3","306-4","306-5"}
environment_GRI={"307-1"}
supplier_GRI={"308-1","308-2"}
#400
employment_GRI={"401-1","401-2","401-3"}
labor_relations_GRI={"402-1"}
health_GRI={"403-1","403-2","403-3","403-4"}
education_GRI={"404-1","404-2","404-3"}
diversity_GRI={"405-1","405-2"}
non_discrimination_GRI={"406-1"}
association_GRI={"407-1"}
child_labor_GRI={"408-1"}
forced_labor_GRI={"409-1"}
security_GRI={"410-1"}
indigenous_GRI={"411-1"}
human_rights_GRI={"412-1","412-2","412-3"}
locals_GRI={"413-1","413-2"}
supplier_social_GRI={"414-1","414-2"}
public_policy_GRI={"415-1"}
customer_health_GRI={"416-1","416-2"}
marketing_GRI={"417-1","417-2","417-3"}
privacy_GRI={"418-1"}
socioeconomic_GRI={"419-1"}
###

all_GRIs=[org_profile_GRI,strategy_GRI,ethics_GRI,governance_GRI,stakeholder_GRI,reporting_GRI,management_GRI,
# 200
economic_GRI,market_GRI,indirect_economic_GRI,procurement_GRI,anti_corruption_GRI,anti_comptetive_GRI,
# 300
materials_GRI,energy_GRI,water_GRI,bio_diversity_GRI,emissions_GRI,waste_GRI,environment_GRI,supplier_GRI,
#400
employment_GRI,labor_relations_GRI,health_GRI,education_GRI,diversity_GRI,non_discrimination_GRI,association_GRI,
child_labor_GRI,forced_labor_GRI,security_GRI,indigenous_GRI,human_rights_GRI,locals_GRI,
supplier_social_GRI,public_policy_GRI,customer_health_GRI,marketing_GRI,privacy_GRI,socioeconomic_GRI
]

for i in all_GRIs:
    print(set(i))


{'102-12', '102-6', '102-7', '102-9', '102-5', '102-1', '102-2', '102-3', '102-11', '102-4', '102-8', '102-13', '102-10'}
{'102-14', '102-15'}
{'102-16', '102-17'}
{'102-34', '102-32', '102-20', '102-33', '102-31', '102-27', '102-37', '102-28', '102-19', '102-26', '102-30', '102-18', '102-35', '102-25', '102-39', '102-22', '102-23', '102-38', '102-29', '102-36', '102-21', '102-24'}
{'102-44', '102-40', '102-42', '102-41', '102-43'}
{'102-45', '102-56', '102-54', '102-53', '102-51', '102-55', '102-46', '102-47', '102-48', '102-52', '102-49', '102-50'}
{'103-3', '103-2', '103-1'}
{'201-2', '201-4', '201-3', '201-1'}
{'202-1', '202-2'}
{'203-2', '203-1'}
{'204-1'}
{'205-3', '205-2', '205-1'}
{'206-1'}
{'301-3', '301-2', '301-1'}
{'302-2', '302-5', '302-3', '302-1', '302-4'}
{'303-3', '303-1', '303-2'}
{'304-4', '304-2', '304-1', '304-3'}
{'305-3', '305-1', '305-5', '305-6', '305-7', '305-2', '305-4'}
{'306-1', '306-3', '306-4', '306-2', '306-5'}
{'307-1'}
{'308-1', '308-2'}
{'401-1', '401

In [61]:
import re
list_classified=[]
for concept in all_GRIs:
    for concept in bio_diversity_GRI:
        p_line = ""
        for line in df['Text']:
            if concept in p_line:
                text = re.sub('[0-9]+\-[0-9]+', '', p_line) # Antecipating a cleaning we 
                list_classified.append([concept, text.strip() + line])

            p_line = line
output_df = pd.DataFrame(list_classified, columns=['GRI', 'Text'])
output_df.head



<bound method NDFrame.head of Empty DataFrame
Columns: [GRI, Text]
Index: []>

In [67]:
# the real deal, treat all the files
import glob
import os
# Read the jsonl file
list_classified=[]
for file_name in glob.iglob('/Users/nick/Documents/01_Unibo/tese/programs/xtract_pdf/Docs/In/*.jsonl'):
    df = pd.read_json(fn, lines=True)
    for concepts in all_GRIs:
        for concept in concepts:
            p_line = ""
            for line in df['Text']:
                if concept in p_line:
                    text = re.sub('[0-9]+\-[0-9]+', '', p_line) # Antecipating a cleaning we 
                    list_classified.append([os.path.basename(file_name),concept, text.strip() + line])
                p_line = line
            output_df = pd.DataFrame(list_classified, columns=['Filename','GRI', 'Text'])
    #output_df.head

output_df.loc[(output_df['GRI'].str.contains('10[0-9]|30[0-9]|40[0-9]'))]



Unnamed: 0,Filename,GRI,Text
0,2018_Votorantim_EN.jsonl,102-12,"GRIIn 2019, we started the process to particip..."
1,2018_Votorantim_EN.jsonl,102-12,"Page 93 Charters, principles and other extern..."
2,2018_Votorantim_EN.jsonl,102-6,"30 GRI / / / FS13 / DL01Throughout 2019, we..."
3,2018_Votorantim_EN.jsonl,102-6,GRI 102: General Disclosures - 2016 Name of t...
4,2018_Votorantim_EN.jsonl,102-7,GRI / / /Sicredi
...,...,...,...
8810,2019_Fitesa_EN.jsonl,405-2,GRI / / / / / Sicredi RC01We consider it ...
8811,2019_Fitesa_EN.jsonl,405-2,Ratio of basic salary and compensation of wome...
8812,2019_Fitesa_EN.jsonl,405-1,GRI / / / / / Sicredi RC01We consider it ...
8813,2019_Fitesa_EN.jsonl,405-1,Explanation of the effect of any restatements ...


In [70]:
len(output_df['Filename'][(output_df['GRI'].str.contains('30[0-9]|40[0-9]'))].unique())

31

In [71]:
output_file = 'summary.jsonl'
output_df.to_json(output_file, orient='records', lines=True) 

In [75]:
output_df['GRI'].describe()

count      8835
unique       85
top       103-3
freq        775
Name: GRI, dtype: object

In [74]:
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8835 entries, 0 to 8834
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Filename  8835 non-null   object
 1   GRI       8835 non-null   object
 2   Text      8835 non-null   object
dtypes: object(3)
memory usage: 207.2+ KB


In [76]:
output_df.value_counts()

Filename                  GRI     Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
2016_Natura_EN.jsonl      102-1   10 GRIMessage from leadership                                                                                                                                                                                                                                                                                                               

In [77]:
output_df.describe()

Unnamed: 0,Filename,GRI,Text
count,8835,8835,8835
unique,31,85,126
top,2019_Copagaz_EN.jsonl,103-3,GRI / / / / / / / SDG relatedSustainab...
freq,285,775,341
