In [177]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [178]:
from pytesseract import image_to_string
from PIL import Image

from tqdm import tqdm

from pdf2image import convert_from_path, convert_from_bytes
import PyPDF2
import matplotlib.pyplot as plt

In [209]:
# Target company: `company` becomes the prefix of every output file stem and
# `comp_link` is the site root that report-page and PDF URLs are built from.
company = "gazprom"
comp_link = "https://www.gazprom.ru"
# Year bounds handed to link_generator, which iterates range(start, end),
# so `end` itself is not included.
start = 2010
end = 2022

# Regex patterns; a page is accepted only if ALL of them match its lower-cased text.
# NOTE(review): `[бухгалтерский]+` is a character class (any run of those letters),
# not the literal word.
# NOTE(review): the last pattern contains an empty alternative (`||`), which matches
# any text, so this keyword never filters anything out — confirm intent.
keywords = ["онсолидированный\ {1,3}[бухгалтерский]+\ {1,3}балан", "актив", "обязательств", "капитал", "итог", "([г]{0,1}лавны[й]{0,1}||правлени[я]{0,1}|[п]{0,1}редседател[ь]{0,1})"]

In [180]:
def link_generator(comp_link, company, start, end, out_dir="ifrs_gazprom"):
    """Build the report-page URL and the output file stem for each year.

    Args:
        comp_link: Site root without a trailing slash, e.g. "https://www.gazprom.ru".
        company: Company name; used as the prefix of the file stem.
        start: First year, inclusive.
        end: Last year, exclusive (years come from ``range(start, end)``).
        out_dir: Directory part of the file stem. The default keeps the
            directory name that used to be hard-coded.

    Returns:
        dict mapping each year to ``[page_url, file_stem]``. The stem carries no
        extension; callers append ".pdf", ".png" or ".csv".
    """
    return {
        year: [
            f"{comp_link}/investors/disclosure/reports/{year}/",
            f"{out_dir}/{company}_{year}",
        ]
        for year in range(start, end)
    }

In [181]:
def link_extracktor(link):
    """Find the href of the consolidated financial statements on a report page.

    Fetches ``link``, looks at every ``<a class="with-icon" href=...>`` element
    and returns the regex matches from the first anchor whose markup contains an
    href followed (within 5-30 characters) by "Консолидированная финансовая".

    Args:
        link: URL of the yearly report page.

    Returns:
        A non-empty list of matched href strings (callers use element 0), or the
        empty string "" when no anchor matches.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    response = requests.get(link, timeout=30)
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
    soup = BeautifulSoup(response.text, "html.parser")
    # Raw string: the old pattern relied on invalid escapes such as "\/" and "\(".
    # `A-Z` replaces the `A-z` typo, which also admitted "[", "\", "]", "^" and "`".
    pattern = re.compile(
        r'href="([0-9/a-zA-Z.\-_()]{10,})".{5,30}Консолидированная финансовая'
    )
    for anchor in soup.find_all('a', attrs={'class': 'with-icon'}, href=True):
        matches = pattern.findall(str(anchor))
        if matches:
            return matches
    return ""

In [182]:
def download_pdf(url, filename):
    """Download ``url`` and store the response body as ``<filename>.pdf``.

    Network and HTTP errors are reported with a printed message and the function
    returns without writing anything. Filesystem errors while writing are not
    caught and propagate to the caller.

    Args:
        url: Absolute URL of the PDF.
        filename: Output path without extension; ".pdf" is appended.
    """
    filename = filename + ".pdf"
    try:
        # A timeout keeps a stalled download from blocking the whole loop.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Ошибка при скачивании PDF: {e}")
        return

    # Create the target directory on first use; open() would otherwise fail
    # with FileNotFoundError on a fresh checkout.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"PDF успешно сохранен в {filename}")

In [183]:
# Process the PDF and pull out the page that holds the consolidated balance
# sheet ("Консолидированный бухгалтерский баланс").
def extract_text_from_image(image, filename, keywords, first_page=2, last_page=13):
    """OCR rendered pages and return the text of the first page matching all keywords.

    Args:
        image: Sequence of page images; each item is passed to ``image_to_string``
            and must provide ``.save(path)``.
        filename: Output stem; the matching page is saved as "<filename>.png".
        keywords: Regex patterns that must all match the lower-cased OCR text.
        first_page: Start index (0-based) of the slice of ``image`` to scan.
        last_page: End index (exclusive) of that slice. The defaults reproduce
            the previously hard-coded ``image[2:13]`` window.

    Returns:
        The OCR text of the first matching page (original case), or "" if no
        page in the window matches.
    """
    for page_image in image[first_page:last_page]:
        text = image_to_string(page_image, lang='rus')
        lowered = text.lower()
        if all(re.search(keyword, lowered) for keyword in keywords):
            page_image.save(filename + ".png")
            return text
    return ""

def extract_text_from_pdf(pdf_path, filename, keywords):
    """Return the text layer of the first PDF page that matches all keywords.

    Pages are read with PyPDF2 in order. When a page matches, that single page is
    rendered with ``convert_from_path`` and saved as "<filename>.png".

    Args:
        pdf_path: Path of the PDF file to scan.
        filename: Output stem for the PNG snapshot of the matching page.
        keywords: Regex patterns that must all match the lower-cased page text.

    Returns:
        The extracted text of the matching page, or "" when no page matches or
        any error occurs (the error is printed, not raised).
    """
    try:
        # `with` guarantees the handle is closed. The previous try/finally called
        # pdf_file.close() even when open() itself had failed, which raised
        # UnboundLocalError in place of the handled error.
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page_num in range(len(pdf_reader.pages)):
                text = pdf_reader.pages[page_num].extract_text()
                lowered = text.lower()
                if all(re.search(keyword, lowered) for keyword in keywords):
                    # pdf2image page numbers are 1-based.
                    image = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
                    image[0].save(filename + ".png")
                    return text
    except Exception as e:
        print(f"An error occurred: {e}")
    return ""

In [184]:
def nomination_cheker(text):
    """Return the multiplier for the reporting unit stated in the page text.

    Looks for a parenthesised phrase of the form "(в <20-30 lower-case Cyrillic
    letters/spaces>)" and inspects it:

    * a phrase containing a "million" form  -> 1
    * a phrase containing a "billion" form  -> 1000

    Args:
        text: Page text in its original case.

    Returns:
        int multiplier. Falls back to 1 when the phrase is absent or names a unit
        that is neither millions nor billions (previously these cases raised
        IndexError / TypeError).
    """
    match = re.search(r"\(в\ ([а-я ]{20,30})\)", text or "")
    if match is None:
        return 1

    unit_phrase = match.group(1)
    if re.search("(мил[а-я]{1,3}она)", unit_phrase):
        return 1
    if re.search("(мил[а-я]{1,3}арда)", unit_phrase):
        return 1000
    return 1

In [185]:
# Trims the text down to the relevant lines; everything else is thrown away.
def lines_extractor(text):
    """Cut the page text down to the balance-sheet body and detect its unit.

    The text is lower-cased and split on newlines. Everything before the first
    line matching "актив[ы]+" is dropped, and everything after the last line
    starting (after at most two spaces) with "итого" is dropped. If either marker
    is missing the resulting list is empty.

    Args:
        text: Page text in its original case (the unit is detected before lower-casing).

    Returns:
        ``(lines, unit)`` where ``unit`` comes from ``nomination_cheker``.
        For empty text returns ``([], [])``.
    """
    # Guard first: the unit lookup used to run before this check, so empty text
    # crashed instead of reaching the empty return.
    if not text:
        return [], []

    unit = nomination_cheker(text)
    lines = text.lower().split('\n')

    # Drop everything before the first "активы" line (or everything if none).
    first = next(
        (idx for idx, line in enumerate(lines) if re.search("[Аа]ктив[ы]+", line)),
        len(lines),
    )
    lines = lines[first:]

    # Drop everything after the last "итого" line (or everything if none).
    last = next(
        (idx for idx in range(len(lines) - 1, -1, -1) if re.search("^ {0,2}итого", lines[idx])),
        -1,
    )
    lines = lines[:last + 1]

    return lines, unit

In [186]:
from itertools import combinations

def find_combinations(s, unit):
    """Split the numeric tail of a balance-sheet row into column values.

    Heuristics, tried in order:

    1. No whitespace at all: one number, returned as ``[0, value]``.
    2. Parentheses present: spaces are removed, the string is split on ")(" and
       every part is returned negated.
    3. A hyphen present: split on "-"; empty parts become 0.
    4. Otherwise the digit string is cut at every position, and the first cut
       whose two halves are within a factor of 10 of each other wins. If no cut
       qualifies, the very first cut is returned.

    Every returned value is multiplied by ``unit``. The fallback in step 4 used
    to skip that scaling, unlike all other paths.

    Known limitation: a single token that is not a plain integer (e.g. "(123)"
    or "-") still raises ValueError in step 1.

    Args:
        s: String made of digits, spaces, hyphens and parentheses.
        unit: Integer multiplier applied to each value.

    Returns:
        list of ints.
    """
    s = s.strip()
    if not any(c.isspace() for c in s):
        return [0, int(s) * unit]

    s = s.replace(" ", "")

    # Negative numbers are written in parentheses.
    if '(' in s and ')' in s:
        parts = s.split(')(')
        return [-int(part.replace('(', '').replace(')', '')) * unit for part in parts]

    # A hyphen marks an empty column.
    if '-' in s:
        return [int(part) * unit if part else 0 for part in s.split("-")]

    # Two numbers glued together once spaces are removed: search for a plausible cut.
    first_split = None
    for i in range(1, len(s)):
        first_num = int(s[:i])
        second_num = int(s[i:])
        if max(first_num, second_num) <= 10 * min(first_num, second_num):
            return [first_num * unit, second_num * unit]
        if first_split is None:
            first_split = [first_num * unit, second_num * unit]

    return first_split if first_split is not None else [int(s) * unit]

In [187]:
def remove_leading_symbols(s):
    """Strip note markers and quotes from the front of a row label.

    After trimming whitespace, removes a leading group of one or two digits
    followed by a space, then a leading group of one or two letters followed by
    a space, and finally trims whitespace and double quotes from both ends.
    """
    cleaned = s.strip()
    # Digit prefix first, then letter prefix; each is removed at most once.
    for prefix_pattern in (r'^\d{1,2}\ ', r'^[a-zA-Zа-яА-Я]{1,2}\ '):
        cleaned = re.sub(prefix_pattern, '', cleaned)
    return cleaned.strip().strip("\"")

def dict_packer(lines, unit):
    """Turn balance-sheet lines into a ``{label: [values]}`` dictionary.

    Each line is classified as one of:

    * section header: only letters/whitespace and contains one of the section
      keywords; remembered as the current subsection.
    * data row: contains both letters and digits; split into a label and a
      numeric tail, and the tail is parsed with ``find_combinations``. Rows the
      split pattern cannot handle are skipped silently.
    * bare totals row: only digits, whitespace and hyphens; stored under
      "итого <current subsection>".
    * anything else is printed for manual inspection.

    Args:
        lines: Lower-cased text lines of the balance sheet.
        unit: Integer multiplier forwarded to ``find_combinations``.

    Returns:
        dict mapping row label to the list of parsed values.
    """
    keywords = ["актив", "обязательств", "капита"]

    result = {}
    current_subsection = None

    for line in lines:
        # Blank lines carry no data. Without this guard they fell into the
        # totals branch (all() over no characters is True) and crashed in int("").
        if not line.strip():
            continue

        if all(c.isalpha() or c.isspace() for c in line) and any(keyword in line for keyword in keywords):
            current_subsection = line.strip()

        elif any(c.isalpha() for c in line) and any(c.isdigit() for c in line):
            # Lazy label, then a tail made only of digits, spaces, hyphens and parentheses.
            parts = re.findall(r"^(.{1,}?) ([0-9\ \-\(\)]+)$", line)
            if parts:
                key, value = parts[0]
                result[remove_leading_symbols(key)] = find_combinations(value, unit)

        elif all(c.isdigit() or c.isspace() or c == '-' for c in line):
            result[f'итого {current_subsection}'] = find_combinations(line, unit)

        else:
            print(line)
    return result

In [188]:
def csv_writer(result, filename, years):
    """Write the parsed balance sheet to ``<filename>.csv``.

    ``result`` maps row labels to value lists. The frame is transposed so that
    labels become the index, and ``years`` become the column headers.
    """
    table = pd.DataFrame(result).transpose()
    table.columns = years
    table.to_csv(f"{filename}.csv")
    print(f"csv успешно сохранен в {filename}")

In [211]:
# Report-page URL and output file stem for every configured year.
links_dict = link_generator(comp_link, company, start, end)

# Only the years from 2019 up to (not including) `end` are processed here.
for year in tqdm(range(2019, end)):
    page_url, filename = links_dict[year]

    pdf_hrefs = link_extracktor(page_url)
    if not pdf_hrefs:
        # Report the page that had no link (the old message printed an empty string).
        print(f"No link {page_url}")
        continue

    # The extracted href is site-relative.
    pdf_url = comp_link + pdf_hrefs[0]
    download_pdf(pdf_url, filename)
    pdf_path = filename + ".pdf"

    # Try the PDF text layer first; fall back to OCR of the rendered pages.
    text = extract_text_from_pdf(pdf_path, filename, keywords)
    if not text:
        image = convert_from_path(pdf_path)
        text = extract_text_from_image(image, filename, keywords)
    if not text:
        # No balance-sheet page found: skip this year instead of parsing empty text.
        continue

    lines, unit = lines_extractor(text)
    result = dict_packer(lines, unit)
    # Columns hold the reporting year and the year before it.
    years = [str(year), str(year - 1)]
    if result:
        csv_writer(result, filename, years)

  0%|          | 0/3 [00:00<?, ?it/s]

PDF успешно сохранен в ifrs_gazprom/gazprom_2019.pdf


 33%|███▎      | 1/3 [00:33<01:07, 33.58s/it]

краткосрочные кредиты и займы, векселя к уплате и текущая часть
csv успешно сохранен в ifrs_gazprom/gazprom_2019
PDF успешно сохранен в ifrs_gazprom/gazprom_2020.pdf


 67%|██████▋   | 2/3 [01:08<00:34, 34.30s/it]

краткосрочные кредиты и займы, векселя к уплате и текущая часть
csv успешно сохранен в ifrs_gazprom/gazprom_2020
PDF успешно сохранен в ifrs_gazprom/gazprom_2021.pdf


100%|██████████| 3/3 [01:42<00:00, 34.15s/it]

краткосрочные кредиты и займы, векселя к уплате и текущая часть
csv успешно сохранен в ifrs_gazprom/gazprom_2021



