### First task
Extract product names from the specified files and generate an Excel file with serial numbers and names.

In [4]:
import os
from collections import defaultdict
def get_files_by_format(folder_path):
    files_by_format = defaultdict(list)

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            
            _, extension = os.path.splitext(file)
            format_name = extension.lower() if extension else '<empty>'
            
            files_by_format[format_name].append(file_path)
    
    return dict(files_by_format)

In [5]:
files_by_format = get_files_by_format("./files")

In [9]:
from docx import Document
import pandas as pd

def docx_to_dataframe(file_path):
    doc = Document(file_path)
    data = []

    for table in doc.tables:
        for row in table.rows:
            row_data = [cell.text.strip() for cell in row.cells]
            data.append(row_data)
    
    return pd.DataFrame(data[1:], columns=data[:1])

In [21]:
files_by_format

{'.docx': ['./files/5.docx',
  './files/2.docx',
  './files/1.docx',
  './files/3.docx'],
 '.pdf': ['./files/4.pdf']}

In [None]:
data = []
for docx_path in files_by_format[".docx"]:
    docx_df = docx_to_dataframe(docx_path)

    product_names = docx_df.iloc[:,1]
    for i, product_name in enumerate(product_names):
        data.append([i+1, product_name, docx_path])

For pdf using llm

In [57]:
import pymupdf4llm
def convert_pdf_to_md_text(file_path):
    md_text = pymupdf4llm.to_markdown(file_path)
    return md_text

In [58]:
md_text = convert_pdf_to_md_text(files_by_format[".pdf"][0])

In [59]:
from langchain_openai import ChatOpenAI

def extract_column_info_from_md_text(md_text, column_name):
    llm = ChatOpenAI(model="gpt-4o-mini")
    prompt = f"""Analyze the provided table in Markdown format. Your task is to accurately extract the value from the specified field.

    **Processing Rules:**
    1.  Carefully match the table column headers with the requested field.
    2.  Ignore line breaks within cells (such as `<br>` tags or line feeds). Treat the cell content as a single, continuous text.
    3.  If the requested field is absent from the table or the value is not specified, return "Data is unavailable".

    **Table for analysis:**
    {md_text}
    **Task:**
    Extract the value from the **{column_name}** column for all products.

    **Response Format:**
    Return ONLY the extracted values as a string, separated by ';'.

    **Example Response:**
    Name1; Name2; Name3"""

    response = llm.invoke(prompt)
    result = response.content.split(";") if isinstance(response.content,str) and response.content != "Data is unavailable" else []
    return result

In [61]:
product_names = extract_column_info_from_md_text(md_text, "Наименование")

In [63]:
for i, product_name in enumerate(product_names):
    data.append([i+1, product_name, files_by_format[".pdf"][0]])

In [64]:
df = pd.DataFrame(data, columns=["№", "Наименование", "document_from"])

In [70]:
df.to_excel("./results/extract1.xlsx", index=False)