In [1]:
import os
os.chdir("..")

In [2]:
import pandas as pd
import numpy as np

from src.data.scrape_reports import scrape_reports

In [3]:
pd.set_option("display.max_rows", 5)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# 1. Scrape unlabeled reports

In [4]:
# Load data
df_unlabeled = pd.read_excel('data/interim/reports_unlabeled.xlsx', engine='openpyxl')
df_unlabeled

Unnamed: 0,Identifier,Company_Name,CSR_Period_Relative,CSR_URL
0,BCE_py.TO,BCE Inc,FY0,https://www.bce.ca/responsibility/overview/2020-cr-report.pdf
1,ENB_pa.TO,Enbridge Inc,FY0,https://www.enbridge.com/~/media/Enb/Documents/Reports/Sustainability%20Report%202020/Enbridge_SR_2020.pdf
2,POW_pb.TO,Power Corporation of Canada,FY0,https://www.powercorporationcsr.com/media/uploads/reports/power-corporation-2019-data-supplement-final.pdf
3,PWF_pf.TO,Power Financial Corp,FY0,https://www.powerfinancialcsr.com/media/uploads/reports/pfc_-_csr_website_-_2018_update.pdf
4,WN_pa.TO,George Weston Ltd,FY0,http://www.weston.ca/en/Environment.aspx


In [5]:
# Create new column to track scraping progress
df_unlabeled['CSR_Filename'] = ''

# Specify path to store the reports 
path_output = 'data/pdf_reports/' 

In [20]:
%%capture output 

# Scrape reports
scrape_reports(df_unlabeled, path_output)

In [25]:
# Store updated df
df_unlabeled.to_excel('data/interim/reports_unlabeled.xlsx', index=False)

In [9]:
# Only keep rows for which a pdf report could be scraped
df_unlabeled_clean = df_unlabeled[df_unlabeled['CSR_Filename'] != 'Error']
# Sort
df_unlabeled_clean.sort_values(by=['CSR_Period_Relative', 'Identifier'], ascending=True, inplace=True)
df_unlabeled_clean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,Identifier,Company_Name,CSR_Period_Relative,CSR_URL,CSR_Filename
4580,3IN.L,3I Infrastructure PLC,FY-1,https://www.3i-infrastructure.com/media/4665/3i-infrastructure-annual-report-2020.pdf,FY-1_3IN.L_.pdf
4394,888.L,888 Holdings PLC,FY-1,https://888holdings.html.investis.com/8/888Holdings/pdf/ar-2019-v1.pdf,FY-1_888.L_.pdf
3158,A.N,Agilent Technologies Inc,FY-1,https://www.agilent.com/about/companyinfo/sustainability/Agilent_Report_CSR_2019.pdf,FY-1_A.N_.pdf
3670,AA.N,Alcoa Corp,FY-1,https://www.alcoa.com/sustainability/en/pdf/2019-Sustainability-Report.pdf,FY-1_AA.N_.pdf
4339,AAL.L,Anglo American PLC,FY-1,https://www.angloamerican.com/~/media/Files/A/Anglo-American-Group/PLC/investors/annual-reporting/2020/aa-sustainability-report-2019-v1.pdf,FY-1_AAL.L_.pdf


In [11]:
# Store as new df
df_unlabeled_clean.to_excel('data/interim/reports_unlabeled_clean.xlsx', index=False)

# 2. Update labeled reports with filenames

In [10]:
# Load data
df_labeled = pd.read_excel('data/interim/reports_labeled.xlsx', engine='openpyxl')
df_labeled

Unnamed: 0,Identifier,Company_Name,CSR_Period_Relative,CSR_URL,SDG1,SDG2,SDG3,SDG4,SDG5,SDG6,SDG7,SDG8,SDG9,SDG10,SDG11,SDG12,SDG13,SDG14,SDG15,SDG16,SDG17
0,BCE_py.TO,BCE Inc,FY0,https://www.bce.ca/responsibility/overview/2020-cr-report.pdf,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,ENB_pa.TO,Enbridge Inc,FY0,https://www.enbridge.com/~/media/Enb/Documents/Reports/Sustainability%20Report%202020/Enbridge_SR_2020.pdf,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1
2,POW_pb.TO,Power Corporation of Canada,FY0,https://www.powercorporationcsr.com/media/uploads/reports/power-corporation-2019-data-supplement-final.pdf,1,0,1,1,1,0,1,1,0,0,1,1,1,0,0,1,0
3,GWO_pf.TO,Great-West Lifeco Inc,FY0,https://www.greatwestlifeco.com/content/dam/gwlco/documents/misc/canada-life-2019-public-accountability-statement.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,MFC_pb.TO,Manulife Financial Corp,FY0,https://www.manulife.com/content/dam/corporate/global/en/documents/pas/MFC_SR_PAS_2019.pdf,0,0,1,0,1,0,1,1,0,1,0,0,1,0,0,0,0


In [32]:
# Store as new df
df_labeled_clean.to_excel('data/interim/reports_labeled_clean.xlsx', index=False)

# 3. Update labeled pages with filenames

In [35]:
# Load data
df_labeled_pages = pd.read_excel('data/interim/reports_labeled_pages.xlsx', engine='openpyxl')
df_labeled_pages

Unnamed: 0,Identifier,Company_Name,CSR_Period_Relative,CSR_URL,Page,SDG1,SDG2,SDG3,SDG4,SDG5,SDG6,SDG7,SDG8,SDG9,SDG10,SDG11,SDG12,SDG13,SDG14,SDG15,SDG16,SDG17
0,3IN.L,3I Infrastructure PLC,FY0,https://www.3i-infrastructure.com/media/4665/3i-infrastructure-annual-report-2020.pdf,45,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0
1,3IN.L,3I Infrastructure PLC,FY0,https://www.3i-infrastructure.com/media/4665/3i-infrastructure-annual-report-2020.pdf,46,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0
2,A.N,Agilent Technologies Inc,FY-1,https://www.agilent.com/about/companyinfo/sustainability/Agilent_Report_CSR_2019.pdf,14,0,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,0
3,A.N,Agilent Technologies Inc,FY-1,https://www.agilent.com/about/companyinfo/sustainability/Agilent_Report_CSR_2019.pdf,16,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
4,A.N,Agilent Technologies Inc,FY-1,https://www.agilent.com/about/companyinfo/sustainability/Agilent_Report_CSR_2019.pdf,39,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
df_labeled_pages_clean.to_excel('data/interim/reports_labeled_pages_clean.xlsx', index=False)