In [21]:
import requests
import time

import pandas as pd

from tqdm import tqdm
from bs4 import BeautifulSoup

In [30]:
n_pages = 441
browser = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36"

In [27]:
companies = list()

In [31]:
for page_number in tqdm(range(50, n_pages + 1)):
    url = f"https://www.sustainalytics.com/esg-ratings/?currentpage={page_number}"
    response = requests.get(url, headers={'user-agent': browser})
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    blocks = soup.findAll(class_="company-row d-flex")

    for block in blocks:
        company_name = block.find(class_="primary-color d-block").text
        company_tag = block.find("small").text
        company_rating = block.find(class_="col-2").text
        companies.append({
            'name': company_name,
            'tag': company_tag,
            'rating': float(company_rating)
        })
    
    time.sleep(5)

100%|██████████| 392/392 [42:44<00:00,  6.54s/it]


In [32]:
df = pd.DataFrame(companies)
df.head()

Unnamed: 0,name,tag,rating
0,1&1 Drillisch AG,ETR:DRI,21.1
1,2i Rete Gas SpA,-,37.2
2,"2U, Inc.",NAS:TWOU,15.7
3,"360 Security Technology, Inc.",SHG:601360,21.2
4,3i Group PLC,LON:III,12.6


In [35]:
df.to_pickle("../data/raw/sustainalytics.pkl")

In [37]:
import wikipedia

In [78]:
wikipedia_pages = list()

for name in tqdm(df['name']):
    try:
        wikipedia_pages.append(wikipedia.page(name))
    except:
        wikipedia_pages.append(None)

100%|██████████| 4419/4419 [51:06<00:00,  1.44it/s]


In [79]:
df['wikipedia_page'] = wikipedia_pages
df.head()

Unnamed: 0,name,tag,rating,wikipedia_page
0,1&1 Drillisch AG,ETR:DRI,21.1,<WikipediaPage '1&1 Drillisch'>
1,2i Rete Gas SpA,-,37.2,<WikipediaPage '2i Rete Gas'>
2,"2U, Inc.",NAS:TWOU,15.7,<WikipediaPage '2U (company)'>
3,"360 Security Technology, Inc.",SHG:601360,21.2,<WikipediaPage 'Norton 360'>
4,3i Group PLC,LON:III,12.6,<WikipediaPage '3i'>


In [80]:
df.to_pickle('../data/raw/sutainalytics_with_wiki.pkl')

In [81]:
sum([1 for p in wikipedia_pages if p is not None])

3775

In [82]:
len(wikipedia_pages)

4419

In [108]:
company_keywords = ['company', 'corporation', 'provider', 'operator']

In [113]:
seems_valid = list()

for page in tqdm(wikipedia_pages):
    seems_valid.append(page is not None and any(keyword in page.summary.lower() for keyword in company_keywords))

100%|██████████| 4419/4419 [12:16<00:00,  6.00it/s]


In [114]:
sum(seems_valid)

3064

In [116]:
summaries = [page.summary if page is not None else None for page in wikipedia_pages]
contents = [page.content if page is not None else None for page in wikipedia_pages]

df['seems_valid'] = seems_valid
df['summary'] = summaries
df['content'] = contents

df.head()

Unnamed: 0,name,tag,rating,wikipedia_page,seems_valid,summary,content
0,1&1 Drillisch AG,ETR:DRI,21.1,<WikipediaPage '1&1 Drillisch'>,True,1&1 Drillisch AG (formerly known as: Drillisch...,1&1 Drillisch AG (formerly known as: Drillisch...
1,2i Rete Gas SpA,-,37.2,<WikipediaPage '2i Rete Gas'>,True,"Headquartered in Milan, 2i Rete Gas is the se...","Headquartered in Milan, 2i Rete Gas is the se..."
2,"2U, Inc.",NAS:TWOU,15.7,<WikipediaPage '2U (company)'>,True,"2U, Inc. (formerly 2tor Inc.) is an American e...","2U, Inc. (formerly 2tor Inc.) is an American e..."
3,"360 Security Technology, Inc.",SHG:601360,21.2,<WikipediaPage 'Norton 360'>,False,"Norton 360, developed by Symantec, is an “all-...","Norton 360, developed by Symantec, is an “all-..."
4,3i Group PLC,LON:III,12.6,<WikipediaPage '3i'>,True,3i Group plc is a British multinational privat...,3i Group plc is a British multinational privat...


In [117]:
df.to_pickle('../data/raw/sustainalytics_with_content.pkl')

In [118]:
df_valid = df[df['seems_valid']]
df_valid.drop(columns=['tag', 'wikipedia_page', 'seems_valid', 'summary'], inplace=True)
df_valid.head()

Unnamed: 0,name,rating,content
0,1&1 Drillisch AG,21.1,1&1 Drillisch AG (formerly known as: Drillisch...
1,2i Rete Gas SpA,37.2,"Headquartered in Milan, 2i Rete Gas is the se..."
2,"2U, Inc.",15.7,"2U, Inc. (formerly 2tor Inc.) is an American e..."
4,3i Group PLC,12.6,3i Group plc is a British multinational privat...
5,3i Infrastructure PLC,22.1,3i Infrastructure plc (LSE: 3IN) is an investm...


In [119]:
df_valid.to_pickle('../data/processed/sustainalytics_dataset.pkl')