In [2]:
import os
import warnings
import requests
import urllib3
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from pypdf import PdfReader
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

warnings.filterwarnings("ignore")
urllib3.disable_warnings()

### Ссылки на ESG рейтинг RAEX

* https://www.raexpert.eu/esg_corporate_ranking/#conf-tab-1
* https://raex-rr.com/ESG/ESG_companies/ESG_rating_companies/2023.4/

## Сбор данных RAEX

### Парсинг сайтов компаний из рейтинга

In [2]:
stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
    "accept": "application/json"
}

timeout = (5, 25)

raex_list = pd.read_excel("RAEX list.xlsx").dropna(subset=["url_sustainability"]).reset_index(drop=True)
raex_list

Unnamed: 0,№,Название,Код MOEX,Подотрасль,ESG-рейтинг,E Rank,E-рейтинг,S Rank,S-рейтинг,G Rank,G-рейтинг,Год последней оцененной отчетности,url,url_sustainability
0,1,НЛМК,NLMK,Чёрная металлургия,AA,2,AA,2,AA,21,A,2021,https://nlmk.com/ru/,https://nlmk.com/ru/sustainability/
1,2,«Полюс»,PLZL,Драгоценные металлы,AA,1,AAA,14,A,27,A,2021,https://polyus.com/ru/,https://sustainability.polyus.com/ru/
2,3,«Уралкалий»,-,Агрохимикаты,A,6,BBB,1,AA,6,AA,2021,https://www.uralkali.com/ru/,https://www.uralkali.com/ru/sustainability/
3,4,«ЭЛ5-Энерго»,ELFV,Электроэнергетика,A,9,BBB,9,A,2,AAA,2021,https://www.el5-energo.ru,https://www.el5-energo.ru/sustainability/
4,5,«Полиметалл»,POLY,Драгоценные металлы,A,4,A,5,A,12,AA,2021,https://www.polymetalinternational.com/ru/,https://www.polymetalinternational.com/ru/sust...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,153,Кордиант,-,Производство шин,C,154,C,154,C,142,CC,2020,https://cordiant.ru,https://www.cordiant-tyre.ru/values/
137,155,«Промышленно-металлургический холдинг» (ПМХ),-,Чёрная металлургия,C,156,C,148,CC,149,CC,2020,https://www.metholding.ru,https://www.metholding.ru/development/
138,156,"""Титан"", группа компаний (деревообработка)",-,Деревообработка,C,148,C,152,C,152-154,CC,2020,https://titan-group.ru,https://titan-group.ru/about/development/
139,157,«Минудобрения»,-,Агрохимикаты,C,157,C,157,C,157-160,C,2021,https://minudo.ru,https://minudo.ru/?cid=28&parent_id=5


In [3]:
def parsing_pdf(company, pdf):
    reader = PdfReader(f"pdf/{pdf}")
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"

    text = text.replace('-\n', '').lower()
    text = re.sub(r'[`!@#$%^&*()_+\-=\[\]{};\':"\\|,.<>\/?~©«»—]', r'', text)
    text = ''.join([i for i in text if not i.isdigit()])

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)

    text = list(np.concatenate([sent_tokenize(i.strip()) for i in text.split('\n')]).flat)

    text_lemmatized = []
    for line in [t.split() for t in text]:
        line_lemmatized = ' '.join([morph.normal_forms(l)[0] for l in line if l not in stopwords_ru])
        text_lemmatized.append(line_lemmatized)

    data = pd.DataFrame(text_lemmatized, columns=['text']).drop_duplicates()
    data['company'] = company['Название']
    data['rating'] = company['№']
    data['url'] = pdf
    data = data[['rating', 'company', 'url', 'text']]

    return data

In [4]:
def parsing_site(company, url, depth, max_depth=0):
#     print(url)
    response = requests.get(url=url, timeout=timeout, headers=headers, verify=False)
    soup = BeautifulSoup(response.content, 'lxml')

    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk).replace('-\n', '').lower()
    text = re.sub(r'[`!@#$%^&*()_+\-=\[\]{};\':"\\|,.<>\/?~©«»—]', r'', text)
    text = ''.join([i for i in text if not i.isdigit()])

    text = list(np.concatenate([sent_tokenize(i.strip()) for i in text.split('\n')]).flat)

    text_lemmatized = []
    for line in [t.split() for t in text]:
        line_lemmatized = ' '.join([morph.normal_forms(l)[0] for l in line if l not in stopwords_ru])
        text_lemmatized.append(line_lemmatized)

    spans = list(filter(None, [span.string for span in soup.find_all('span')]))

    if len(spans):
        spans = '\n'.join(span for span in spans if span).replace('-\n', '').lower()
        spans = re.sub(r'[`!@#$%^&*()_+\-=\[\]{};\':"\\|,.<>\/?~©«»—]', r'', spans)
        spans = ''.join([i for i in spans if not i.isdigit()])

        spans = list(np.concatenate([sent_tokenize(i.strip()) for i in spans.split('\n')]).flat)
    else:
        spans = []

    spans_lemmatized = []
    for line in [s.split() for s in spans]:
        line_lemmatized = ' '.join([morph.normal_forms(l)[0] for l in line if l not in stopwords_ru])
        spans_lemmatized.append(line_lemmatized)

    df1 = pd.DataFrame(text_lemmatized, columns=['text'])
    df2 = pd.DataFrame(spans_lemmatized, columns=['text'])
    data = pd.concat([df1, df2]).drop_duplicates().reset_index(drop=True)
    data['company'] = company['Название']
    data['rating'] = company['№']
    data['url'] = url
    data = data[['rating', 'company', 'url', 'text']]

#     print(depth)
    if depth < max_depth:
        internalLinks = [
            a.get('href') for a in soup.find_all('a')
            if a.get('href') and a.get('href').startswith('/')
        ]
        internalLinks = [link[1:] for link in internalLinks]
        internalLinks = [*set(internalLinks)]
        internalLinks = list(filter(None, internalLinks))

        for link in internalLinks:
            if ".pdf" not in link and ".PDF" not in link:
                new_url = os.path.join(url[:url.rfind('www')], urlparse(url).netloc, link)
                try:
                    df = parsing_site(company, new_url, depth + 1, max_depth)
                    data = pd.concat([data, df]).drop_duplicates()
                except Exception as err:
                    print('ERROR', new_url, err)
                    continue

    return data

In [5]:
%%time
max_depth = 1
parsed_data = pd.DataFrame()

# for i, company in raex_list[140:141].iterrows():
# for i, company in raex_list[7:8].iterrows():
for i, company in raex_list.iterrows():
    rating = company['№']
    name = company['Название']
    url = company['url_sustainability']
    
    try:
        if ".pdf" in url:
            df = parsing_pdf(company, url)
        else:
            df = parsing_site(company, url, 0, max_depth)
        parsed_data = pd.concat([parsed_data, df])
        print('DONE', i, name)
    except Exception as err:
        print('ERROR', i, name, err)
        continue

parsed_data = parsed_data.drop_duplicates()
parsed_data = parsed_data.reset_index(drop=True)
parsed_data.to_csv('parsed_data.csv', encoding='utf-8')

DONE 0 НЛМК
DONE 1 «Полюс»
ERROR /pda.uralkali.com/ru/ Invalid URL '/pda.uralkali.com/ru/': No scheme supplied. Perhaps you meant https:///pda.uralkali.com/ru/?
DONE 2 «Уралкалий»
DONE 3 «ЭЛ5-Энерго»
DONE 4 «Полиметалл»
ERROR /www.phosagro.com/pt/ Invalid URL '/www.phosagro.com/pt/': No scheme supplied. Perhaps you meant https:///www.phosagro.com/pt/?
ERROR /www.phosagro.fr/ Invalid URL '/www.phosagro.fr/': No scheme supplied. Perhaps you meant https:///www.phosagro.fr/?
ERROR /www.phosagro.es/ Invalid URL '/www.phosagro.es/': No scheme supplied. Perhaps you meant https:///www.phosagro.es/?
ERROR /www.phosagro.com/it/ Invalid URL '/www.phosagro.com/it/': No scheme supplied. Perhaps you meant https:///www.phosagro.com/it/?
ERROR /www.phosagro.de/ Invalid URL '/www.phosagro.de/': No scheme supplied. Perhaps you meant https:///www.phosagro.de/?
ERROR /www.phosagro.com/sustainability/ Invalid URL '/www.phosagro.com/sustainability/': No scheme supplied. Perhaps you meant https:///www.phosag

ERROR https://www.akbars.ru/news/expert-ra-prisvoil-ak-bars-banku-reiting-na-urovne-esg-iii/ HTTPSConnectionPool(host='www.akbars.ru', port=443): Max retries exceeded with url: /news/expert-ra-prisvoil-ak-bars-banku-reiting-na-urovne-esg-iii/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7feee3a1f700>, 'Connection to www.akbars.ru timed out. (connect timeout=5)'))
ERROR https://www.akbars.ru/news/ak-bars-bank-stal-laureatom-premii-retail-finance-awards-2021/ HTTPSConnectionPool(host='www.akbars.ru', port=443): Max retries exceeded with url: /news/ak-bars-bank-stal-laureatom-premii-retail-finance-awards-2021/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fef016062b0>, 'Connection to www.akbars.ru timed out. (connect timeout=5)'))
ERROR https://www.akbars.ru/small-business/cards/ HTTPSConnectionPool(host='www.akbars.ru', port=443): Max retries exceeded with url: /small-business/cards/ (Caused by ConnectTimeoutError(<urlli

ERROR /www.soda.ru/ru/pages/detail/contacts/ Invalid URL '/www.soda.ru/ru/pages/detail/contacts/': No scheme supplied. Perhaps you meant https:///www.soda.ru/ru/pages/detail/contacts/?
DONE 101 «Башкирская содовая компания»
DONE 102 «Сибантрацит», группа
DONE 103 «Ашинский металлургический завод», группа
DONE 104 «Квадра»
DONE 105 ЧФМК
DONE 106 Пигмент (КРАТА)
DONE 107 «РуссНефть», нефтегазовая компания
DONE 108 «Мечел»
DONE 109 Казаньоргсинтез
DONE 110 Новикомбанк
DONE 111 «Ямал СПГ»
DONE 112 «Русвинил»
DONE 113 «Русская медная компания»
DONE 114 «Т Плюс»
DONE 115 «Титан», группа компаний (нефтехимия)
DONE 116 «Азбука Вкуса» (сеть супермаркетов)
DONE 117 ТГК-2
DONE 118 «Славнефть», группа
DONE 119 «Русский уголь»
DONE 120 «Стройсервис»
DONE 121 Сусуманзолото
DONE 122 «УГМК»
ERROR /zakupki.gazprom-neft.ru/ Invalid URL '/zakupki.gazprom-neft.ru/': No scheme supplied. Perhaps you meant https:///zakupki.gazprom-neft.ru/?
ERROR /www.gpnbonus.ru/ Invalid URL '/www.gpnbonus.ru/': No scheme s

In [7]:
parsed_data

Unnamed: 0,rating,company,url,text
0,1,НЛМК,https://nlmk.com/ru/sustainability/,устойчивый развитие подход политика приоритет
1,1,НЛМК,https://nlmk.com/ru/sustainability/,просмотр содержимое разрешить работа кук
2,1,НЛМК,https://nlmk.com/ru/sustainability/,pinsymbols
3,1,НЛМК,https://nlmk.com/ru/sustainability/,pinhamburger
4,1,НЛМК,https://nlmk.com/ru/sustainability/,pinq
...,...,...,...,...
760446,158-160,"«Петропавловск», группа компаний",POG-SR2020_RUS_080921.pdf,рубцов переулок далее
760447,158-160,"«Петропавловск», группа компаний",POG-SR2020_RUS_080921.pdf,teamirpetropavlovskplccom
760448,158-160,"«Петропавловск», группа компаний",POG-SR2020_RUS_080921.pdf,contactpetropavlovskplccom
760449,158-160,"«Петропавловск», группа компаний",POG-SR2020_RUS_080921.pdf,wwwpetropavlovskplccomпубличный компания


In [8]:
parsed_data.to_csv('parsed_data.csv', encoding='utf-8')

### Тесты и проверки

In [18]:
company = raex_list.loc[7]
# company = raex_list.loc[140]
# company = raex_list.loc[12]
# company = raex_list.loc[18]
# company = raex_list.loc[31]
# company = raex_list.loc[2]
company = raex_list.loc[27]
name = company['Название']
url = company['url_sustainability']
company

№                                                                         28
Название                                                           «Газпром»
Код MOEX                                                                GAZP
Подотрасль                             Интегрированные нефтегазовые компании
ESG-рейтинг                                                              BBB
E Rank                                                                    15
E-рейтинг                                                                BBB
S Rank                                                                    42
S-рейтинг                                                                 BB
G Rank                                                                    72
G-рейтинг                                                                BBB
Год последней оцененной отчетности                                      2021
url                                                   https://www.gazprom.ru

In [19]:
response = requests.get(url=url, timeout=timeout, headers=headers, verify=False)
soup = BeautifulSoup(response.content, 'lxml')

for script in soup(["script", "style"]):
    script.extract()

internalLinks = [
    a.get('href') for a in soup.find_all('a')
    if a.get('href') and a.get('href').startswith('/')
]
internalLinks = [link[1:] for link in internalLinks]
internalLinks = [*set(internalLinks)]
internalLinks = list(filter(None, internalLinks))

print(url)

internalLinks

https://www.gazprom.ru/sustainability/


['sustainability/people/konkurs/',
 'sustainability/environmental-protection/environmental-reports/',
 'investors/',
 'about/ms/quality-management-system/',
 'tenders/',
 'about/legal/policy-personal-data/cookie-notice/',
 'sustainability/environmental-protection/energy-conservation/',
 'about/production/gas-infrastructure-expansion/',
 'sustainability/environmental-protection/hydrogen/',
 'about/legal/policy-personal-data/cookie-notice/#cookies-settings',
 'sustainability/sustainability-management/',
 'about/strategy/innovation/one-window/',
 'about/production/safety/',
 'bulletin-board/',
 '/www.gazprom.de/sustainability/',
 'sustainability/sustainability-management/reports/',
 'sustainability/ratings/openness-rating/',
 'sustainability/people/about-people/',
 'sustainability/local-communities/supporting-sports/',
 'sustainability/local-communities/supporting-cultural-projects/',
 'sustainability/environmental-protection/',
 'sustainability/people/statistics/',
 'sustainability/peopl

In [20]:
for link in internalLinks:
    if ".pdf" not in link:
        new_url = os.path.join(url[:url.rfind('www')], urlparse(url).netloc, link)
        print(new_url)

https://www.gazprom.ru/sustainability/people/konkurs/
https://www.gazprom.ru/sustainability/environmental-protection/environmental-reports/
https://www.gazprom.ru/investors/
https://www.gazprom.ru/about/ms/quality-management-system/
https://www.gazprom.ru/tenders/
https://www.gazprom.ru/about/legal/policy-personal-data/cookie-notice/
https://www.gazprom.ru/sustainability/environmental-protection/energy-conservation/
https://www.gazprom.ru/about/production/gas-infrastructure-expansion/
https://www.gazprom.ru/sustainability/environmental-protection/hydrogen/
https://www.gazprom.ru/about/legal/policy-personal-data/cookie-notice/#cookies-settings
https://www.gazprom.ru/sustainability/sustainability-management/
https://www.gazprom.ru/about/strategy/innovation/one-window/
https://www.gazprom.ru/about/production/safety/
https://www.gazprom.ru/bulletin-board/
/www.gazprom.de/sustainability/
https://www.gazprom.ru/sustainability/sustainability-management/reports/
https://www.gazprom.ru/sustaina

In [37]:
max_depth = 1
parsing_site(company, url, 0, max_depth)

Unnamed: 0,rating,company,url,text
0,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/sustainability/,устойчивый развитие норникель
1,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/sustainability/,карьера
2,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/sustainability/,поставщик
3,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/sustainability/,реализация актив
4,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/sustainability/,сайт группа
...,...,...,...,...
26,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/other-sites/,норникель лидер горнометаллургический промышле...
27,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/other-sites/,пао гмк норильский никель
28,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/other-sites/,раскрытие информация
29,8,"«Норильский никель», горно-металлургическая ко...",https://www.nornickel.ru/other-sites/,страница ооо интерфаксцрка


In [160]:
parsing_pdf(company, url)