In [1]:
!pip install PyPDF2
!pip install pdfminer.six
!pip install pdfplumber
!pip install pdf2image
!pip install Pillow
!pip install pytesseract



In [2]:

import PyPDF2
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pdfplumber
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import os

In [3]:
# Create function to extract text

def text_extraction(element):
    # Extracting the text from the in line text element
    line_text = element.get_text()

    # Find the formats of the text
    # Initialize the list with all the formats appeared in the line of text
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            # Iterating through each character in the line of text
            for character in text_line:
                if isinstance(character, LTChar):
                    # Append the font name of the character
                    line_formats.append(character.fontname)
                    # Append the font size of the character
                    line_formats.append(character.size)
    # Find the unique font sizes and names in the line
    format_per_line = list(set(line_formats))

    # Return a tuple with the text in each line along with its format
    return (line_text, format_per_line)

In [4]:
# Extracting tables from the page

def extract_table(pdf_path, page_num, table_num):
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    table_page = pdf.pages[page_num]
    # Extract the appropriate table
    table = table_page.extract_tables()[table_num]

    return table

# Convert table into appropriate fromat
def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaker from the wrapted texts
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the table into a string
        table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
    # Removing the last line break
    table_string = table_string[:-1]
    return table_string

# Create a function to check if the element is in any tables present in the page
def is_element_inside_any_table(element, page ,tables):
    x0, y0up, x1, y1up = element.bbox
    # Change the cordinates because the pdfminer counts from the botton to top of the page
    y0 = page.bbox[3] - y1up
    y1 = page.bbox[3] - y0up
    for table in tables:
        tx0, ty0, tx1, ty1 = table.bbox
        if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
            return True
    return False

# Function to find the table for a given element
def find_table_for_element(element, page ,tables):
    x0, y0up, x1, y1up = element.bbox
    # Change the cordinates because the pdfminer counts from the botton to top of the page
    y0 = page.bbox[3] - y1up
    y1 = page.bbox[3] - y0up
    for i, table in enumerate(tables):
        tx0, ty0, tx1, ty1 = table.bbox
        if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
            return i  # Return the index of the table
    return None

In [5]:
# Create a function to crop the image elements from PDFs
def crop_image(element, pageObj):
    # Get the coordinates to crop the image from PDF
    [image_left, image_top, image_right, image_bottom] = [element.x0,element.y0,element.x1,element.y1]
    # Crop the page using coordinates (left, bottom, right, top)
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
    # Save the cropped page to a new PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    # Save the cropped PDF to a new file
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)

# Create a function to convert the PDF to images
def convert_to_images(input_file,):
    images = convert_from_path(input_file)
    image = images[0]
    output_file = 'PDF_image.png'
    image.save(output_file, 'PNG')

# Create a function to read text from images
def image_to_text(image_path):
    # Read the image
    img = Image.open(image_path)
    # Extract the text from the image
    text = pytesseract.image_to_string(img)
    return text

In [8]:
# # Find the PDF path
# pdf_path = '/Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Методические рекомендации/МЕТОДИЧЕСКИЕ РЕКОМЕНДАЦИИ УТВЕРЖДЕНЫ руководством 8 Центра ФСБ России 31.03.2015 года № 149:7:2:6-432.pdf'
#
# # Create a pdf file object
# pdfFileObj = open(pdf_path, 'rb')
# # Create a pdf reader object
# pdfReaded = PyPDF2.PdfReader(pdfFileObj)

_IncompleteInputError: incomplete input (660873146.py, line 30)

In [212]:
# чтобы работало надо выполнить перед запуском brew install poppler и brew install tesseract
pdf_directory = '/Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Федеральный закон'

# Получите список всех файлов в директории
pdf_files = [os.path.join(pdf_directory, f) for f in os.listdir(pdf_directory) if f.endswith('.pdf')]
full_text = ''

if not pdf_files:
    print("В указанной папке нет PDF-файлов.")
else:
    # Обходим каждый PDF-файл в папке
    for pdf_path in pdf_files:
        print(f"Обработка файла: {pdf_path}")


        try:

            pdfFileObj = open(pdf_path, 'rb')
            pdfReaded = PyPDF2.PdfReader(pdfFileObj)

            # Create the dictionary to extract text from each image
            text_per_page = {}
            # Create a boolean variable for image detection
            image_flag = False

            # We extract the pages from the PDF
            for pagenum, page in enumerate(extract_pages(pdf_path)):

                # Initialize the variables needed for the text extraction from the page
                pageObj = pdfReaded.pages[pagenum]
                page_text = []
                line_format = []
                text_from_images = []
                text_from_tables = []
                page_content = []
                # Initialize the number of the examined tables
                table_in_page= -1
                # Open the pdf file
                pdf = pdfplumber.open(pdf_path)
                # Find the examined page
                page_tables = pdf.pages[pagenum]
                # Find the number of tables in the page
                tables = page_tables.find_tables()
                if len(tables)!=0:
                    table_in_page = 0

                # Extracting the tables of the page
                for table_num in range(len(tables)):
                    # Extract the information of the table
                    table = extract_table(pdf_path, pagenum, table_num)
                    # Convert the table information in structured string format
                    table_string = table_converter(table)
                    # Append the table string into a list
                    text_from_tables.append(table_string)

                # Find all the elements
                page_elements = [(element.y1, element) for element in page._objs]
                # Sort all the element as they appear in the page
                page_elements.sort(key=lambda a: a[0], reverse=True)


                # Find the elements that composed a page
                for i,component in enumerate(page_elements):
                    # Extract the element of the page layout
                    element = component[1]

                    # Check the elements for tables
                    if table_in_page == -1:
                        pass
                    else:
                        if is_element_inside_any_table(element, page ,tables):
                            table_found = find_table_for_element(element,page ,tables)
                            if table_found == table_in_page and table_found != None:
                                page_content.append(text_from_tables[table_in_page])
                                page_text.append('table')
                                line_format.append('table')
                                table_in_page+=1
                            # Pass this iteration because the content of this element was extracted from the tables
                            continue

                    if not is_element_inside_any_table(element,page,tables):

                        # Check if the element is text element
                        if isinstance(element, LTTextContainer):
                            # Use the function to extract the text and format for each text element
                            (line_text, format_per_line) = text_extraction(element)
                            # Append the text of each line to the page text
                            page_text.append(line_text)
                            # Append the format for each line containing text
                            line_format.append(format_per_line)
                            page_content.append(line_text)


                        # Check the elements for images
                        if isinstance(element, LTFigure):
                            # Crop the image from PDF
                            crop_image(element, pageObj)
                            # Convert the croped pdf to image
                            convert_to_images('cropped_image.pdf')
                            # Extract the text from image
                            image_text = image_to_text('PDF_image.png')
                            text_from_images.append(image_text)
                            page_content.append(image_text)
                            # Add a placeholder in the text and format lists
                            page_text.append('image')
                            line_format.append('image')
                            # Update the flag for image detection
                            image_flag = True


                # Create the key of the dictionary
                dctkey = 'Page_'+str(pagenum)
                # Add the list of list as value of the page key
                text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
                pass
            # Close the pdf file object
            pdfFileObj.close()
            if image_flag:
                os.remove('cropped_image.pdf')
                os.remove('PDF_image.png')

            full_text += '\n'.join([''.join(text_per_page[page][4]) for page in text_per_page]) + '\n'


            print(f"Обработка файла {pdf_path} завершена.")

        except Exception as e:
            print(f"Ошибка при обработке файла {pdf_path}: {e}")


Обработка файла: /Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Федеральный закон/Федеральный закон от 29.06.2004 № 98-ФЗ об коммерчиской тайне.pdf
Обработка файла /Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Федеральный закон/Федеральный закон от 29.06.2004 № 98-ФЗ об коммерчиской тайне.pdf завершена.
Обработка файла: /Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Федеральный закон/Федеральный закон от 26.07.2017 N 187-ФЗ.pdf
Обработка файла /Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Федеральный закон/Федеральный закон от 26.07.2017 N 187-ФЗ.pdf завершена.
Обработка файла: /Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Федеральный закон/Федеральный закон от 26.07.2017 № 194-ФЗ.pdf
Обработка файла /Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Федеральный закон/Федеральный закон от 26.07.2017 № 194-ФЗ.pdf завершена.
Обработка файла: /

In [14]:
# # Close the pdf file object
# pdfFileObj.close()

In [15]:
# # Delete the additional files created if image is detected
# if image_flag:
#     os.remove('cropped_image.pdf')
#     os.remove('PDF_image.png')

In [213]:
# Объединение текста со всех страниц
# full_text = '\n'.join([''.join(text_per_page[page][4]) for page in text_per_page])
#
# Вывод результата
print(full_text)

29 июля 2004 года 
№ 98-ФЗ
 
 
РОССИЙСКАЯ ФЕДЕРАЦИЯ 
 
ФЕДЕРАЛЬНЫЙ ЗАКОН 
 
О КОММЕРЧЕСКОЙ ТАЙНЕ 
 
Принят 
Государственной Думой 
9 июля 2004 года 
 
Одобрен 
Советом Федерации 
15 июля 2004 года 
  (в ред. Федеральных законов от 02.02.2006 № 19-ФЗ, 
от 18.12.2006 № 231-ФЗ, от 24.07.2007 № 214-ФЗ, 
от 11.07.2011 № 200-ФЗ, от 12.03.2014 № 35-ФЗ) 
 
Статья 1. Цели и сфера действия настоящего Федерального закона 
 
1.  Настоящий  Федеральный  закон  регулирует  отношения,  связанные  с 
установлением,  изменением  и  прекращением  режима  коммерческой  тайны  в 
отношении  информации,  которая  имеет  действительную  или  потенциальную 
коммерческую ценность в силу неизвестности ее третьим лицам. 
(часть 1 в ред. Федерального закона от 12.03.2014 № 35-ФЗ) 
2.  Положения  настоящего  Федерального 
закона  распространяются  на 
информацию, составляющую коммерческую тайну, независимо от вида носителя, на 
котором она зафиксирована. 
3.  Положения  настоящего  Федерального  закона  не  распр

In [8]:
!pip install deepeval
!pip install langchain_community
!pip install -qU pypdf
!pip install gigachain-community gigachain-chroma
!pip install sentence-transformers
!pip install langsmith
!pip install pymupdf

Collecting tokenizers<=0.20.3,>=0.13.2 (from chromadb!=0.5.4,!=0.5.5,<0.6.0,>=0.4.0->gigachain-chroma)
  Using cached tokenizers-0.20.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.7 kB)
Using cached tokenizers-0.20.3-cp313-cp313-macosx_11_0_arm64.whl (2.6 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.0
    Uninstalling tokenizers-0.21.0:
      Successfully uninstalled tokenizers-0.21.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.48.3 requires tokenizers<0.22,>=0.21, but you have tokenizers 0.20.3 which is incompatible.[0m[31m
[0mSuccessfully installed tokenizers-0.20.3
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 k

In [9]:
import fitz
import os
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings.gigachat import GigaChatEmbeddings
from langchain.schema import Document
from gigachat.exceptions import ResponseError

In [214]:
folder_path = "/Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/Архив/Федеральный закон"
persist_directory = "/Users/nikitacesev/PycharmProjects/altirix_bot/LitPom_bot-main/chromadb_chunk_size_1200_for_federal_law"
sber = "YjcxNDNkZmMtZmMwZS00ZTUzLWFlZTgtMDk1ZDg2NjUyOTFjOmFiZDViZDk4LTc2ZjAtNDY1ZC1iZGY3LTU2YmQ4MDZhMjAxZg=="

In [215]:
# Инициализация ChromaDB и эмбеддинга
embedding_function_gigachat = GigaChatEmbeddings(
    credentials=sber,
    verify_ssl_certs=False,
    scope="GIGACHAT_API_PERS"
)

In [216]:
chromadb_chunk_size_1200_for_federal_law = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_function_gigachat
)

In [217]:
db_info = chromadb_chunk_size_1200_for_federal_law._collection.get(include=["documents", "metadatas", "embeddings"])

In [218]:
def clean_text(text):
    if text is None:
        return ""  # Return an empty string if input is None
    text = re.sub(r"\s+", " ", text)  # Убираем лишние пробелы
    text = re.sub(r"http[s]?://\S+|www\.\S+", "", text)  # Убираем ссылки
    text = re.sub(r"[\n\t\u200b\uff0c\u3001\uff0e\u3002\uf02d]", " ", text)
    text = re.sub(r"\bNone\b", "", text, flags=re.IGNORECASE)# Убираем символы-разделители
    return text

In [219]:
existing_ids = set(db_info['ids']) if db_info else set()

In [220]:
documents = []
metadatas = []
doc_counter = len(existing_ids)
clean_text_content = clean_text(full_text)

# Разбиваем на чанки
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=70, length_function=len)
text_chunks = text_splitter.split_text(clean_text_content)

for chunk in text_chunks:
    doc_id = f"doc{doc_counter}"
    source_path = pdf_files if isinstance(pdf_files, str) else pdf_files[0]
    documents.append(Document(page_content=chunk, metadata={"source": source_path}))
    metadatas.append({"source": source_path})
    doc_counter += 1

In [229]:
print(len(chunk))

449


In [222]:
ids_to_remove = [
    doc["source"] for doc in metadatas
    if isinstance(doc["source"], (str, int, float, tuple)) and doc["source"] in existing_ids
]

if ids_to_remove:
    chromadb_chunk_size_1200_for_federal_law._collection.delete(ids=ids_to_remove)


In [225]:
!pip install time

[31mERROR: Could not find a version that satisfies the requirement time (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for time[0m[31m
[0m

In [228]:
import time
from requests.exceptions import ReadTimeout

valid_docs = []
valid_metadatas = []
valid_embs = []
doc_idx_cant_process = []

for i, doc in enumerate(documents):
    start_time = time.time()
    while time.time() - start_time < 3600:
        try:
            emb = embedding_function_gigachat.embed_documents([doc.page_content])
            valid_docs.append(doc)
            valid_metadatas.append(metadatas[i])
            valid_embs.append(emb[0])
            break
        except ReadTimeout:
            time.sleep(10)
        except ResponseError:
            doc_idx_cant_process.append(i)
            break
    else:
        doc_idx_cant_process.append(i)  # Если за час не обработалось, добавляем в ошибки

In [231]:
ids = [f"doc{i}" for i in range(len(existing_ids), len(existing_ids) + len(valid_docs))]

In [232]:
chromadb_chunk_size_1200_for_federal_law._collection.add(
    documents=[d.page_content for d in valid_docs],
    embeddings=valid_embs,
    ids=ids,
    metadatas=valid_metadatas
)

print(f"Добавлено {len(valid_docs)} новых документов")


Добавлено 536 новых документов
