### Second task
Extract the quantity of products from the specified files and create an Excel file with the product serial numbers and their conversion.

In [16]:
import os
from collections import defaultdict
def get_files_by_format(folder_path):
    files_by_format = defaultdict(list)

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            
            _, extension = os.path.splitext(file)
            format_name = extension.lower() if extension else '<empty>'
            
            files_by_format[format_name].append(file_path)
    
    return dict(files_by_format)

In [17]:
files_by_format = get_files_by_format("./files")

In [33]:
from docx import Document
import pandas as pd

def docx_to_dataframe(file_path):
    doc = Document(file_path)
    data = []

    for table in doc.tables:
        for row in table.rows:
            row_data = [cell.text.strip() for cell in row.cells]
            data.append(row_data)
    
    return pd.DataFrame(data[1:], columns=data[:1])

In [34]:
files_by_format

{'.docx': ['./files/5.docx',
  './files/2.docx',
  './files/1.docx',
  './files/3.docx'],
 '.pdf': ['./files/4.pdf']}

In [93]:
data_to_extract = []
for docx_path in files_by_format[".docx"]:
    docx_df = docx_to_dataframe(docx_path)
    docx_df["document_from"] = docx_path
    docx_df.columns = [col[0].replace('\n', ' ') if isinstance(col, tuple) else col for col in docx_df.columns]
    data_to_extract.append(docx_df.to_dict("records"))

In [96]:
def convert_to_text(data):
    headers = [key for key in data[0].keys()]
    result = '; '.join(headers) + '\n'
    
    for i, item in enumerate(data, 1):
        row = ""
        for header in headers:
            row += item[header].replace('\n', ' ') + "; "
        result += row + '\n'
    
    return result

In [117]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

def extract_columns_info_from_text(text, column_names: list[str]):
    required_fields = "\n- ".join(column_names)
    example_element = {column: f"data{i}" for i, column in enumerate(column_names)}
    llm = ChatOpenAI(model="gpt-4o-mini")
    prompt = PromptTemplate(template="""Analyze the provided data and extract information for the following fields for each item:

**Required Fields:**{required_fields}

**Processing Rules:**
1. Extract data strictly from the provided table.
2. For technical specifications, highlight only the most important parameters (power, voltage, protection rating, dimensions, etc.).
3. Preserve the original wording from the product name.
4. Ignore empty lines and separators.

**Required Output Format:**
Return the result as a JSON array, where each element is an object with the fields:{required_fields}
This fields you never change!

Example element:
{example_element}

**Data to process:**
{text}""", input_variables=["required_fields", "example_element", "text"])
    
    chain = prompt | llm | JsonOutputParser()

    response = chain.invoke({"required_fields": required_fields, "example_element": example_element, "text": text})
    
    return response

In [118]:
data_for_df = []

In [119]:
for data in data_to_extract:
    document_from = data[0]["document_from"]
    
    text = convert_to_text([
        {key: value for key, value in item.items() if key != 'document_from'}
        for item in data
    ])
    columns = ["Количество", "Единица измерения"]
    result = extract_columns_info_from_text(text, columns)
    for i, count_and_metric in enumerate(result):
        data_for_df.append([i+1, count_and_metric["Количество"], count_and_metric["Единица измерения"], document_from])


In [120]:
data_for_df

[[1, '22', 'шт', './files/5.docx'],
 [2, '15', 'шт', './files/5.docx'],
 [3, '53', 'шт', './files/5.docx'],
 [4, '111', 'шт', './files/5.docx'],
 [5, '20', 'шт', './files/5.docx'],
 [6, '60', 'шт', './files/5.docx'],
 [7, '250', 'шт', './files/5.docx'],
 [8, '52', 'м', './files/5.docx'],
 [9, '5', 'шт', './files/5.docx'],
 [10, '27', 'шт', './files/5.docx'],
 [1, '2073', 'шт.', './files/2.docx'],
 [2, '103', 'шт.', './files/2.docx'],
 [3, '2115', 'шт.', './files/2.docx'],
 [4, '319', 'шт.', './files/2.docx'],
 [5, '15', 'шт.', './files/2.docx'],
 [6, '423', 'шт.', './files/2.docx'],
 [7, '49', 'шт.', './files/2.docx'],
 [8, '381', 'шт.', './files/2.docx'],
 [9, '21', 'шт.', './files/2.docx'],
 [10, '429', 'шт.', './files/2.docx'],
 [11, '116', 'шт.', './files/2.docx'],
 [12, '1266', 'шт.', './files/2.docx'],
 [13, '181', 'шт.', './files/2.docx'],
 [1, '5 шт', 'шт', './files/1.docx'],
 [2, '10 шт', 'шт', './files/1.docx'],
 [3, '5 шт', 'шт', './files/1.docx'],
 [4, '55 шт', 'шт', './fil

For pdf using llm

In [121]:
import pymupdf4llm
def convert_pdf_to_md_text(file_path):
    md_text = pymupdf4llm.to_markdown(file_path)
    return md_text

In [122]:
md_text = convert_pdf_to_md_text(files_by_format[".pdf"][0])

In [123]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser

def extract_column_info_from_md_text(md_text, column_names: list[str]):
    required_fields = "; ".join(column_names)
    example_element = {column: f"data{i}" for i, column in enumerate(column_names)}
    llm = ChatOpenAI(model="gpt-4o-mini")

    prompt = f"""Analyze the provided table in Markdown format. Your task is to accurately extract the values from the specified field.

    **Processing Rules:**
    1.  Carefully match the table column headers with the requested field.
    2.  Ignore line breaks within cells (such as `<br>` tags or line feeds). Treat the cell content as a single, continuous text.
    3.  If the requested field is absent from the table or the value is not specified, return "Data is unavailable".

    **Table for analysis:**
    {md_text}
    **Task:**
    Extract the values from the **{required_fields}** columns for all products.

    **Required Output Format:**
Return the result as a JSON array, where each element is an object with the fields:{required_fields}

    **Example Response:**{example_element}"""

    response = llm.invoke(prompt)
    parser = JsonOutputParser()
    result = parser.parse(response.content if isinstance(response.content, str) else "[]")
    return result

In [None]:
result = extract_column_info_from_md_text(md_text, ["Количество", "Единица измерения"])

In [None]:
for i, count_and_metric in enumerate(result):
    data_for_df.append([i+1, count_and_metric["Количество"], count_and_metric["Единица измерения"], files_by_format[".pdf"][0]])

In [127]:
df = pd.DataFrame(data_for_df, columns=["№", "Количество", "Единица измерения", "document_from"])

In [129]:
df.to_excel("./results/extract2.xlsx", index=False)