In [2]:
import os
os.chdir("..")

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore') 

from src.data.scrape_reports import scrape_pdf_reports, scrape_urls_responsibilityreports_website

In [14]:
pd.set_option("display.max_rows", 5)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## 1. Scrape all CSR URLs from responsibilityreports.com

In [None]:
# Scrape URLs
df_responsibilityreports_website = scrape_urls_responsibilityreports_website()
df_responsibilityreports_website

In [32]:
# Only keep rows for which a URL could be scraped and remove duplicates based on URL
df_responsibilityreports_website = df_responsibilityreports_website[df_responsibilityreports_website['URL'] != '']
df_responsibilityreports_website.drop_duplicates(subset=['URL'], keep='first', inplace=True)
df_responsibilityreports_website

Unnamed: 0,Link,Name,Ticker,Year,URL
0,https://www.responsibilityreports.com/Company/3i-group-plc,3i Group plc,III,2021.0,https://www.responsibilityreports.com/Click/1194
2,https://www.responsibilityreports.com/Company/3i-group-plc,3i Group plc,III,2019.0,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/3/LSE_III_2019.pdf
...,...,...,...,...,...
8752,https://www.responsibilityreports.com/Company/orsted,Ørsted,DOGEF,2018.0,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/o/OTC_DOGEF_2018.pdf
8753,https://www.responsibilityreports.com/Company/orsted,Ørsted,DOGEF,2017.0,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/o/OTC_DOGEF_2017.pdf


In [None]:
# Reformat columns


In [33]:
# Store df
df_responsibilityreports_website.to_excel('data/interim/reports_responsibilityreports_website.xlsx', index=False)

## 2. Add scraped URLs to *reports_labeled* where CSR_URL is empty

In [5]:
# Load refinitiv dataset
df_labeled = pd.read_excel('data/interim/reports_labeled.xlsx', engine='openpyxl')
df_labeled

Unnamed: 0,ID,Identifier,Company_Name,ISIN,Ticker,Country_of_Exchange,Financial_Period_Absolute,Financial_Period_Relative,CSR_URL,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16,SDG_17
0,0,888.L,888 Holdings PLC,GI000A0F6407,888,United Kingdom,2020,FY0,https://corporate.888.com/wp-content/uploads/2021/04/2020-Annual-Report.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,False,True,False
1,1,A.N,Agilent Technologies Inc,US00846U1016,A,United States of America,2020,FY0,https://www.agilent.com/about/companyinfo/sustainability/Agilent-Report-CSR-2020.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10856,11450,YRI.TO,Yamana Gold Inc,CA98462Y1007,YRI,Canada,2010,FY-10,http://www.yamana.com/Theme/NewYamana/files/YAMANA%20CSR-E%2017-08-11b.pdf,False,False,True,True,False,True,True,False,False,,False,True,True,,True,True,False
10857,11451,YUM.N,Yum! Brands Inc,US9884981013,YUM,United States of America,2010,FY-10,http://www.yum.com/responsibility/,False,False,True,True,True,True,True,False,False,,False,True,False,,False,True,False


In [16]:
# Extract rows without CSR_URL
df_labeled_na = df_labeled[df_labeled['CSR_URL'].isna()]
# Check for duplicates before merging
df_labeled_na[df_labeled_na[['Ticker', 'Financial_Period_Absolute']].duplicated(keep='last')]

Unnamed: 0,ID,Identifier,Company_Name,ISIN,Ticker,Country_of_Exchange,Financial_Period_Absolute,Financial_Period_Relative,CSR_URL,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16,SDG_17
49,49,AEP.OQ,American Electric Power Company Inc,US0255371017,AEP,United States of America,2020,FY0,,False,False,True,True,True,True,True,True,False,,False,True,True,,True,True,False
371,373,CARR.N,Carrier Global Corp,US14448C1045,CARR,United States of America,2020,FY0,,False,False,True,True,True,True,True,True,False,,False,True,True,,False,True,False


In [None]:
# Drop rows without URL

## 3. Store remaining URLs from responsibilityreports.com in *reports_unlabeled*

## 4. Scrape reports for refinitiv dataset

In [None]:
# Create new column to track scraping progress
df_labeled['CSR_Filename'] = ''

# Specify path to store the reports 
path_output = 'D:/master-thesis/data/' # external harddrive

In [None]:
# Scrape reports
scrape_pdf_reports(df_labeled, path_output)
df_labeled

In [None]:
# TEMP: Store updated df
df_labeled.to_excel('data/interim/reports_labeled_filenames.xlsx', index=False)

In [None]:
# Check the results
df_labeled_clean = df_labeled[df_unlabeled['CSR_Filename'] != 'Error']
df_labeled_clean

## 5. Scrape remaining unlabeled reports from responsibilityreports.com

# OLD BELOW

# 3. Merge scraped URLs to Eikon dataframe

In [40]:
# Merge on ticker and CSR_Period_Absolute
df_merged = df_unlabeled_errors.merge(df_responsibilityreports_website, how='left', left_on=['Ticker_Proxy', 'CSR_Period_Absolute'], right_on=['Ticker', 'Year'])
df_merged

Unnamed: 0,ID,Identifier,Company_Name,CSR_Period_Relative,CSR_Period_Absolute,CSR_URL,CSR_Filename,Ticker_Proxy,Link,Name,Ticker,Year,URL
0,4,WN_pa.TO,George Weston Ltd,FY0,2019,http://www.weston.ca/en/Environment.aspx,Error,WN,,,,,
1,7,BAM_pb.TO,Brookfield,FY0,2020,https://www.brookfield.com/sites/default/files/2021-06/2020_ESG_Report.pdf,Error,BAM,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7913,13335,VOD.L,Vodafone Group PLC,FY0,2021,https://www.vodafone.com/sustainable-business/our-contribution-to-un-sdgs/SDG-7,Error,VOD,,,,,
7914,13336,VOD.L,Vodafone Group PLC,FY0,2021,https://www.vodafone.com/sustainable-business/our-contribution-to-un-sdgs/SDG-8,Error,VOD,,,,,


In [41]:
# Only keep rows for which merge was successful
df_merged = df_merged[~df_merged['URL'].isna()]
df_merged

Unnamed: 0,ID,Identifier,Company_Name,CSR_Period_Relative,CSR_Period_Absolute,CSR_URL,CSR_Filename,Ticker_Proxy,Link,Name,Ticker,Year,URL
2,10,TRP_pa.TO,TC Energy Corp,FY0,2019,https://www.tcenergy.com/siteassets/pdfs/sustainability/report/ESG_2020.pdf,Error,TRP,https://www.responsibilityreports.com/Company/transcanada-corporation,TransCanada Corporation,TRP,2019.0,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/t/TSX_TRP_2019.pdf
3,18,AQN_pa.TO,Algonquin Power & Utilities Corp,FY0,2019,http://algonquinpower.com/docs/APUC-Sustainability-Report-2020.pdf,Error,AQN,https://www.responsibilityreports.com/Company/algonquin-power-utilities-corp,Algonquin Power & Utilities Corp.,AQN,2019.0,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/a/TSX_AQN_2019.pdf
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7903,13324,VET.TO,Vermilion Energy Inc,FY-1,2019,http://sustainability.vermilionenergy.com/hse/environment/environment-dashboard.cfm,Error,VET,https://www.responsibilityreports.com/Company/vermilion-energy-inc,Vermilion Energy Inc.,VET,2019.0,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/v/TSX_VET_2019.pdf
7904,13325,VET.TO,Vermilion Energy Inc,FY-1,2019,http://sustainability.vermilionenergy.com/people/our-people-overview.cfm,Error,VET,https://www.responsibilityreports.com/Company/vermilion-energy-inc,Vermilion Energy Inc.,VET,2019.0,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/v/TSX_VET_2019.pdf


In [42]:
# Overwrite CSR_URL with the new URL and drop columns
df_merged_clean = df_merged
df_merged_clean['CSR_URL'] = df_merged_clean['URL']
df_merged_clean.drop(['Ticker_Proxy', 'Link', 'Name', 'Ticker', 'Year', 'URL'], axis=1, inplace=True)
df_merged_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,ID,Identifier,Company_Name,CSR_Period_Relative,CSR_Period_Absolute,CSR_URL,CSR_Filename
2,10,TRP_pa.TO,TC Energy Corp,FY0,2019,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/t/TSX_TRP_2019.pdf,Error
3,18,AQN_pa.TO,Algonquin Power & Utilities Corp,FY0,2019,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/a/TSX_AQN_2019.pdf,Error
...,...,...,...,...,...,...,...
7903,13324,VET.TO,Vermilion Energy Inc,FY-1,2019,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/v/TSX_VET_2019.pdf,Error
7904,13325,VET.TO,Vermilion Energy Inc,FY-1,2019,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/v/TSX_VET_2019.pdf,Error


# 6. Manually add the correct URLs to the remaining rows of the labeled dataset and scrape the reports

In [6]:
# Extract rows that could not have been scraped so far from df_labeled where the URL leads to a pdf file
df_labeled_errors = df_labeled[df_labeled['CSR_Filename'] == 'Error']
# Keep rows for which the URL leads to a pdf file
df_labeled_errors = df_labeled_errors[df_labeled_errors['CSR_URL'].str.contains('pdf') | df_labeled_errors['CSR_URL'].str.contains('ashx') | df_labeled_errors['CSR_URL'].str.contains('file') | df_labeled_errors['CSR_URL'].str.contains('report') | df_labeled_errors['CSR_URL'].str.contains('download')]
df_labeled_errors

Unnamed: 0,ID,Identifier,Company_Name,CSR_Period_Relative,CSR_Period_Absolute,CSR_URL,SDG1,SDG2,SDG3,SDG4,SDG5,SDG6,SDG7,SDG8,SDG9,SDG10,SDG11,SDG12,SDG13,SDG14,SDG15,SDG16,SDG17,CSR_Filename
4,7,BAM_pb.TO,Brookfield Asset Management Inc,FY0,FY2020,https://www.brookfield.com/sites/default/files/2021-06/2020_ESG_Report.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error
33,41,ALG.N,Alamo Group Inc,FY0,FY2020,https://www.alamo-group.com/assets/files/Our_Company/Our%20Commitment/2020_Sustainability_Report_Final_03-12-21_Rev2.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,13134,ESKN.L,Esken Ltd,FY-1,FY2019,https://www.stobartgroup.co.uk/wp-content/uploads/2019/06/31087_Stobart_AR_Web.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error
4309,13141,AZRE.N,Azure Power Global Ltd,FY-1,FY2019,https://www.azurepower.com/newsroom_pdf/421415825459572841.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error


In [7]:
# Add column 'Search' to make the manual Google Search easier
df_labeled_errors['Search'] = df_labeled_errors['Company_Name'] + ' ' + df_labeled_errors['CSR_Period_Absolute'].str.replace('FY', '') + ' ' + 'sustainability report filetype:pdf'
# Add column 'CSR_URL_New' to enter a working URL (if it exists)
df_labeled_errors['CSR_URL_New'] = ''
df_labeled_errors

Unnamed: 0,ID,Identifier,Company_Name,CSR_Period_Relative,CSR_Period_Absolute,CSR_URL,SDG1,SDG2,SDG3,SDG4,SDG5,SDG6,SDG7,SDG8,SDG9,SDG10,SDG11,SDG12,SDG13,SDG14,SDG15,SDG16,SDG17,CSR_Filename,Search,CSR_URL_New
4,7,BAM_pb.TO,Brookfield Asset Management Inc,FY0,FY2020,https://www.brookfield.com/sites/default/files/2021-06/2020_ESG_Report.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error,Brookfield Asset Management Inc 2020 sustainability report filetype:pdf,
33,41,ALG.N,Alamo Group Inc,FY0,FY2020,https://www.alamo-group.com/assets/files/Our_Company/Our%20Commitment/2020_Sustainability_Report_Final_03-12-21_Rev2.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error,Alamo Group Inc 2020 sustainability report filetype:pdf,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,13134,ESKN.L,Esken Ltd,FY-1,FY2019,https://www.stobartgroup.co.uk/wp-content/uploads/2019/06/31087_Stobart_AR_Web.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error,Esken Ltd 2019 sustainability report filetype:pdf,
4309,13141,AZRE.N,Azure Power Global Ltd,FY-1,FY2019,https://www.azurepower.com/newsroom_pdf/421415825459572841.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error,Azure Power Global Ltd 2019 sustainability report filetype:pdf,


In [8]:
# Store df and process it manually
df_labeled_errors.to_excel('data/interim/reports_labeled_errors.xlsx', index=False)

In [4]:
# Load processed df
df_labeled_errors = pd.read_excel('data/interim/reports_labeled_errors.xlsx')
df_labeled_errors

Unnamed: 0,ID,Identifier,Company_Name,CSR_Period_Relative,CSR_Period_Absolute,CSR_URL,SDG1,SDG2,SDG3,SDG4,SDG5,SDG6,SDG7,SDG8,SDG9,SDG10,SDG11,SDG12,SDG13,SDG14,SDG15,SDG16,SDG17,CSR_Filename,Search,CSR_URL_New
0,7,BAM_pb.TO,Brookfield Asset Management Inc,FY0,FY2020,https://www.brookfield.com/sites/default/files/2021-06/2020_ESG_Report.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error,Brookfield Asset Management Inc 2020 sustainability report filetype:pdf,https://www.brookfield.com/sites/default/files/2020-09/Brookfield%202019%20ESG%20Report.pdf
1,41,ALG.N,Alamo Group Inc,FY0,FY2020,https://www.alamo-group.com/assets/files/Our_Company/Our%20Commitment/2020_Sustainability_Report_Final_03-12-21_Rev2.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error,Alamo Group Inc 2020 sustainability report filetype:pdf,https://www.alamo-group.com/assets/files/Our_Company/Our%20Commitment/2020_Sustainability_Report_Final_03-12-21_Rev2.pdf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,13134,ESKN.L,Esken Ltd,FY-1,FY2019,https://www.stobartgroup.co.uk/wp-content/uploads/2019/06/31087_Stobart_AR_Web.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error,Esken Ltd 2019 sustainability report filetype:pdf,
376,13141,AZRE.N,Azure Power Global Ltd,FY-1,FY2019,https://www.azurepower.com/newsroom_pdf/421415825459572841.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error,Azure Power Global Ltd 2019 sustainability report filetype:pdf,https://www.azurepower.com/sites/default/files/inline-files/Report2018-19.pdf


In [6]:
# Keep only rows for which a new URL could be found
df_labeled_errors = df_labeled_errors[~df_labeled_errors['CSR_URL_New'].isna()]
# Replace URL and remove additional columns that are not needed anymore
df_labeled_errors['CSR_URL'] = df_labeled_errors['CSR_URL_New']
df_labeled_errors.drop(['Search', 'CSR_URL_New'], axis=1, inplace=True)
df_labeled_errors

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,ID,Identifier,Company_Name,CSR_Period_Relative,CSR_Period_Absolute,CSR_URL,SDG1,SDG2,SDG3,SDG4,SDG5,SDG6,SDG7,SDG8,SDG9,SDG10,SDG11,SDG12,SDG13,SDG14,SDG15,SDG16,SDG17,CSR_Filename
0,7,BAM_pb.TO,Brookfield Asset Management Inc,FY0,FY2020,https://www.brookfield.com/sites/default/files/2020-09/Brookfield%202019%20ESG%20Report.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error
1,41,ALG.N,Alamo Group Inc,FY0,FY2020,https://www.alamo-group.com/assets/files/Our_Company/Our%20Commitment/2020_Sustainability_Report_Final_03-12-21_Rev2.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,13128,WEIR.L,Weir Group PLC,FY-1,FY2018,https://www.global.weir/assets/files/sustainability/weirgroup-cdp-climate-change-questionnaire-2019.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error
376,13141,AZRE.N,Azure Power Global Ltd,FY-1,FY2019,https://www.azurepower.com/sites/default/files/inline-files/Report2018-19.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Error


In [7]:
%%capture output 

# Specify path to store the reports 
path_output = 'data/pdf_reports/'

# Scrape additional reports
scrape_reports(df_labeled_errors, path_output)

In [9]:
# Only keep rows that could have been scraped
df_labeled_errors = df_labeled_errors[df_labeled_errors['CSR_Filename'] != 'Error']
df_labeled_errors

Unnamed: 0,ID,Identifier,Company_Name,CSR_Period_Relative,CSR_Period_Absolute,CSR_URL,SDG1,SDG2,SDG3,SDG4,SDG5,SDG6,SDG7,SDG8,SDG9,SDG10,SDG11,SDG12,SDG13,SDG14,SDG15,SDG16,SDG17,CSR_Filename
0,7,BAM_pb.TO,Brookfield Asset Management Inc,FY0,FY2020,https://www.brookfield.com/sites/default/files/2020-09/Brookfield%202019%20ESG%20Report.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7_FY0_BAM_pb.TO.pdf
2,65,AIV.N,Apartment Investment and Management Co,FY0,FY2019,https://www.aircommunities.com/content/dam/aimco/corporate-responsibility/AimcoCorporateResponsibilityReport_2019.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,65_FY0_AIV.N.pdf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,13127,SBRY.L,J Sainsbury PLC,FY-1,FY2019,https://www.about.sainsburys.co.uk/~/media/Files/S/Sainsburys/documents/reports-and-presentations/annual-reports/2020/Sainsburys_Sustainability_Update_1920.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13127_FY-1_SBRY.L.pdf
376,13141,AZRE.N,Azure Power Global Ltd,FY-1,FY2019,https://www.azurepower.com/sites/default/files/inline-files/Report2018-19.pdf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13141_FY-1_AZRE.N.pdf


In [31]:
df_labeled = pd.read_excel('data/interim/reports_labeled.xlsx')
df_labeled_pages = pd.read_excel('data/interim/reports_labeled_pages.xlsx')
df_unlabeled_clean = pd.read_excel('data/interim/reports_unlabeled_clean.xlsx')

In [32]:
# Merge filenames to df_labeled and df_labeled_pages and add filenames to df_unlabeled_clean
df_labeled = df_labeled.merge(df_labeled_errors[['ID', 'CSR_Filename']], how='left', on='ID')
df_labeled['CSR_Filename_x']= np.where(df_labeled['CSR_Filename_y'].notna(), df_labeled['CSR_Filename_y'], df_labeled['CSR_Filename_x'])
df_labeled.drop('CSR_Filename_y', axis=1, inplace=True)
df_labeled.rename(columns={'CSR_Filename_x': 'CSR_Filename'}, inplace=True)

df_labeled_pages = df_labeled_pages.merge(df_labeled_errors[['ID', 'CSR_Filename']], how='left', on='ID')
df_labeled_pages['CSR_Filename_x']= np.where(df_labeled_pages['CSR_Filename_y'].notna(), df_labeled_pages['CSR_Filename_y'], df_labeled_pages['CSR_Filename_x'])
df_labeled_pages.drop('CSR_Filename_y', axis=1, inplace=True)
df_labeled_pages.rename(columns={'CSR_Filename_x': 'CSR_Filename'}, inplace=True)

temp = df_labeled_errors[['ID', 'Identifier', 'Company_Name', 'CSR_Period_Relative', 'CSR_Period_Absolute', 'CSR_URL', 'CSR_Filename']]
df_unlabeled_clean = df_unlabeled_clean.append(temp)
df_unlabeled_clean.sort_values(by='ID', ascending=True, inplace=True)

In [33]:
# Update Excel files
df_labeled.to_excel('data/interim/reports_labeled.xlsx', index = False)
df_labeled_pages.to_excel('data/interim/reports_labeled_pages.xlsx', index = False)
df_unlabeled_clean.to_excel('data/interim/reports_unlabeled_clean.xlsx', index = False)

In [34]:
# Keep only rows for which a URL could be scraped and update Excel files
df_labeled_clean = df_labeled[df_labeled['CSR_Filename'] != 'Error']
df_labeled_pages_clean = df_labeled_pages[df_labeled_pages['CSR_Filename'] != 'Error']
df_labeled_clean.to_excel('data/interim/reports_labeled_clean.xlsx', index = False)
df_labeled_pages_clean.to_excel('data/interim/reports_labeled_pages_clean.xlsx', index = False)

# 7. Manually adapt the page numbers in df_labeled_pages_clean

In [39]:
# Create new column for correct page number and store to Excel
df_labeled_pages_clean['Page_New'] = ''
df_labeled_pages_clean.to_excel('data/interim/reports_labeled_pages_clean_corrected_page_numbers.xlsx', index = False)

In [None]:
# Load manually corrected df and replace page numbers

# 8. Scrape remaining unlabeled reports from responsibilityreports.com and add (append) them to df_unlabeled_clean

In [None]:
# Get URLs that have not yet been added to df_unlabeled_clean


# 9. Produce final files 

In [None]:
# pickle -> in processed folder