## IMPORT

In [1]:
import yaml
import sys
import os
import re
import logging
import chromadb
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text
from llama_cpp import Llama
import wordninja

  from .autonotebook import tqdm as notebook_tqdm





## Config

In [66]:
def get_files(directory):
    """Return the names of all entries found directly inside `directory`.

    The result is whatever `os.listdir` yields: bare names (no path prefix)
    of both files and sub-directories, in arbitrary order.
    """
    entry_names = os.listdir(directory)
    return entry_names

# Example usage: list the available embedding configs so one can be chosen below.
files_and_dirs = get_files("config/embedding/")
files_and_dirs

['all-MiniLM-L6-v2.yaml',
 'all-mpnet-base-v2.yaml',
 'distiluse-base-multilingual-cased-v1.yaml',
 'e5-large-v2.yaml',
 'gtr-t5-large.yaml',
 'LaBSE.yaml',
 'msmarco-distilbert-base-v4.yaml',
 'multi-qa-MiniLM-L6-cos-v1.yaml',
 'multi-qa-mpnet-base-dot-v1.yaml',
 'multilingual-e5-large.yaml',
 'paraphrase-multilingual-MiniLM-L12-v2.yaml',
 'paraphrase-multilingual-mpnet-base-v2.yaml',
 'questions_gen.yaml',
 'stsb-xlm-r-multilingual.yaml']

In [81]:
# Select the embedding config used by the rest of the notebook.
# Exactly one CONFIG_PATH assignment should be active; the commented lines are
# the alternative configs kept here for quick switching.
# CONFIG_PATH = "config/embedding/questions_gen.yaml"
# CONFIG_PATH = "config/embedding/all-MiniLM-L6-v2.yaml"
# CONFIG_PATH = "config/embedding/all-mpnet-base-v2.yaml"
# CONFIG_PATH = "config/embedding/paraphrase-multilingual-MiniLM-L12-v2.yaml"
# CONFIG_PATH = "config/embedding/multi-qa-mpnet-base-dot-v1.yaml"
# CONFIG_PATH = "config/embedding/LaBSE.yaml"
# CONFIG_PATH = "config/embedding/distiluse-base-multilingual-cased-v1.yaml"
# CONFIG_PATH = "config/embedding/msmarco-distilbert-base-v4.yaml"
# CONFIG_PATH = "config/embedding/multi-qa-MiniLM-L6-cos-v1.yaml"
# CONFIG_PATH = "config/embedding/paraphrase-multilingual-mpnet-base-v2.yaml"
# CONFIG_PATH = "config/embedding/stsb-xlm-r-multilingual.yaml"
# CONFIG_PATH = "config/embedding/gtr-t5-large.yaml"
# CONFIG_PATH = "config/embedding/e5-large-v2.yaml"
CONFIG_PATH = "config/embedding/multilingual-e5-large.yaml"

In [82]:
# === Logging setup ===
# The log file is opened as soon as the handler is created, so the directory
# has to exist beforehand; otherwise basicConfig raises FileNotFoundError.
os.makedirs("log", exist_ok=True)
logging.basicConfig(
    filename="log/error_arxiv.txt",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# === Load the selected embedding config ===
# Explicit UTF-8 so parsing does not depend on the platform default encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Required keys: a missing one raises KeyError here rather than later.
CHROMA_PATH = config["chroma_path"]
COLLECTION_NAME = config["collection_name"]
# Optional key; stays None when the config does not define it.
MODEL_NAME = config.get("model_name", None)
# MODEL_NAME = "all-MiniLM-L6-v2"

# NOTE(review): the fallback below is an absolute path on one developer's
# machine; set "model_path" in the config to run anywhere else.
MODEL_PATH = config.get("model_path", r"C:\Users\Igorexy\.lmstudio\models\MaziyarPanahi\Qwen2.5-7B-Instruct-GGUF\Qwen2.5-7B-Instruct.Q4_K_S.gguf")
# Documents whose token count exceeds this value are treated as too long.
TOKEN_TRESHOLD = config.get("token_treshold", 32768)

# Root folder of the PDF dataset and location of the topic list.
DOCUMENTS_FOLDER = 'dataset'
TOPIC_PATH = 'config/topic.yaml'

In [83]:
# === Load the embedding model ===
# MODEL_NAME comes from the config's optional "model_name" key and is None
# when that key is absent.
model = SentenceTransformer(MODEL_NAME)

In [84]:
def load_from_yaml(file_path: str):
    """Parse a YAML file and return its content.

    Args:
        file_path: Path to the YAML file.

    Returns:
        The parsed document exactly as PyYAML produces it. The whole document
        is returned; callers pick the keys they need (e.g. ``['topics']``).
    """
    # Explicit UTF-8: the platform default encoding (e.g. cp1251/cp1252 on
    # Windows) would mis-decode non-ASCII text stored in the file.
    with open(file_path, 'r', encoding='utf-8') as file:
        # safe_load replaces yaml.load(..., Loader=yaml.FullLoader): it only
        # builds plain YAML types and matches how the embedding config is read.
        data = yaml.safe_load(file)
    return data



def replace_ligatures(text):
    """Expand typographic ligatures in `text` into their plain-letter spelling.

    Every character listed in the table below is substituted by the
    corresponding multi-letter string; all other characters are left as is.
    """
    expansions = {
        "ﬀ": "ff", "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬅ": "ft", "ﬆ": "st",
        "Æ": "AE", "Œ": "OE", "Ǆ": "DZ", "ǅ": "Dz",
        "Ϝ": "W", "Ϟ": "KS",
        "Ꜳ": "AA", "ꜳ": "aa", "Ꜵ": "AO",
    }

    # One alternation that matches any of the ligature characters.
    alternation = "|".join(map(re.escape, expansions))

    def _expand(match):
        return expansions[match.group()]

    return re.sub(alternation, _expand, text)


def remove_after_last_references(text):
    """Cut off the bibliography that follows the last "REFERENCES" heading.

    The word is matched case-insensitively as a whole word. The text is
    truncated right before the last match only when the part after the match
    is strictly shorter than the part before it; otherwise, or when the word
    does not occur at all, `text` is returned unchanged.
    """
    last_hit = None
    for last_hit in re.finditer(r'\bREFERENCES\b', text, re.IGNORECASE):
        pass  # keep iterating; only the final match is of interest

    if last_hit is None:
        return text

    head_length = last_hit.start()
    tail_length = len(text) - last_hit.end()

    # A tail at least as long as the head is not treated as a reference list.
    return text[:head_length] if tail_length < head_length else text


def clean_text(text):
    """Normalize text extracted from a PDF.

    Applies, in order: ligature expansion, form-feed removal, bibliography
    truncation, and a series of regex clean-ups for line breaks, hyphenated
    word wraps, numeric noise and table-like rows. The result is stripped of
    leading and trailing whitespace.

    Args:
        text: Raw text as extracted from the PDF.

    Returns:
        The cleaned text.
    """
    text = replace_ligatures(text)  # Expand ligatures into plain letters
    text = re.sub(r'\f', '', text)  # Drop form-feed characters
    text = remove_after_last_references(text)   # Cut the trailing reference list
    text = re.sub(r'(?m)^.$', '', text)  # Blank out lines made of a single character
    # Re-join words hyphenated across a line break. This must happen before
    # single newlines are replaced by spaces below; in the opposite order
    # "exam-\nple" has already become "exam- ple" and is never repaired.
    text = re.sub(r'(?<=\w)-\n', '', text)
    text = re.sub(r'(?<![.!?])\n(?!\n)', ' ', text)  # Join lines broken mid-sentence
    text = re.sub(r'\n{2,}', '\n', text)  # Collapse runs of newlines into one
    # NOTE(review): removes everything from the first run of 4+ digits to the
    # end of the line, which also hits years such as "2019 ..." - confirm intended.
    text = re.sub(r'\d{4,}.*', '', text)  # Drop noisy numeric tails
    text = re.sub(r'(?m)^\s*\d+\.?\s*$', '', text)  # Blank out lines holding only a number
    text = re.sub(r'(?m)^([A-Za-z]+\s*){1,3}\d+$', '', text)  # Blank out table-like rows (1-3 words + number)
    return text.strip()


def extract_text_from_pdf(file_path: str):
    """Extract the text of a PDF file and return it cleaned.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The text produced by pdfminer's `extract_text`, passed through
        `clean_text`.
    """
    raw_text = extract_text(file_path)
    # The previous version also computed `" ".join(wordninja.split(text))`
    # ("space restoration") here but never used the result, so the returned
    # text was always built from the raw extraction. The dead call is dropped
    # because it only cost time on every document.
    return clean_text(raw_text)

# Llama instance used by count_tokens_llama to tokenize text.
llm_token_check = Llama(model_path=MODEL_PATH, n_ctx=32768, verbose=False)


def count_tokens_llama(text):
    """Return how many tokens `text` is split into by `llm_token_check`.

    The text is UTF-8 encoded before tokenization and no BOS token is added.
    """
    encoded = text.encode("utf-8")
    token_ids = llm_token_check.tokenize(encoded, add_bos=False)
    return len(token_ids)


def embed_texts(texts):
    """Embed a list of texts with the module-level `model`.

    Returns the output of `model.encode(texts)` converted with `.tolist()`.
    """
    vectors = model.encode(texts)
    return vectors.tolist()

def upload_to_chromadb(documents, collection_name, db_path="./chroma_storage"):
    """Embed and store documents in a persistent ChromaDB collection.

    Each item of `documents` is a dict with the keys "ids", "documents" and
    "metadata". Items are embedded and added one at a time to the collection
    `collection_name` (created if missing) in the database at `db_path`.
    """
    storage = chromadb.PersistentClient(path=db_path)
    target = storage.get_or_create_collection(name=collection_name)

    for record in documents:
        record_id = record["ids"]
        record_text = record["documents"]
        record_meta = record["metadata"]
        # embed_texts works on lists, so wrap the text and unwrap the vector.
        record_vector = embed_texts([record_text])[0]
        target.add(
            ids=[record_id],
            documents=[record_text],
            metadatas=[record_meta],
            embeddings=[record_vector],
        )

    print(f"Uploaded {len(documents)} documents to collection '{collection_name}'.")


def upload_unique_to_chromadb(documents, collection_name, db_path="./chroma_storage"):
    """Add to ChromaDB only the documents whose id is not stored yet.

    Args:
        documents: Iterable of dicts with the keys "ids", "documents" and
            "metadata".
        collection_name: Name of the collection (created if missing).
        db_path: Directory of the persistent ChromaDB storage.

    A document is skipped when its id already exists in the collection or
    when an earlier item of `documents` carries the same id (the first
    occurrence wins). All remaining documents are embedded and added in a
    single call.
    """
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(name=collection_name)

    # Only the ids are needed; include=[] avoids loading every stored
    # document and metadata entry.
    seen_ids = set(collection.get(include=[])['ids'])

    # Also de-duplicate within the input: a single add() call with repeated
    # ids is rejected by ChromaDB.
    new_documents = []
    for doc in documents:
        doc_id = doc["ids"]
        if doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)
        new_documents.append(doc)

    if not new_documents:
        print("No new documents to upload.")
        return

    texts = [doc["documents"] for doc in new_documents]
    collection.add(
        ids=[doc["ids"] for doc in new_documents],
        documents=texts,
        metadatas=[doc["metadata"] for doc in new_documents],
        embeddings=embed_texts(texts),
    )

    print(f"Uploaded {len(new_documents)} new documents to collection '{collection_name}'.")

def process_topic(topic, check_token=False, token_treshold=TOKEN_TRESHOLD):
    """Extract the text of every PDF of one topic and upload it to ChromaDB.

    Expected layout: ``DOCUMENTS_FOLDER/<topic name>/<keyword>/*.pdf``.

    Args:
        topic: Mapping with at least the key 'name'.
        check_token: When True, documents with more than `token_treshold`
            tokens (per `count_tokens_llama`) are not uploaded; their paths
            are collected in `files_to_delete` instead.
        token_treshold: Maximum token count accepted when `check_token` is set.

    PDFs whose extraction raises are logged, reported on stdout and skipped.
    """
    topic_name = topic['name']
    # os.path.join instead of a hard-coded "\" keeps this working off Windows.
    folder = os.path.join(DOCUMENTS_FOLDER, topic_name)
    if not os.path.isdir(folder):
        print(f"Папка {folder} не существует")
        return

    all_documents = []
    files_to_delete = []

    for keyword in os.listdir(folder):
        folder_keywords = os.path.join(folder, keyword)
        if not os.path.isdir(folder_keywords):
            # Stray files next to the keyword folders would crash os.listdir.
            continue
        print(folder_keywords)

        for file_name in os.listdir(folder_keywords):
            if not file_name.endswith('.pdf'):
                continue

            file_path = os.path.join(folder_keywords, file_name)
            try:
                document_text = extract_text_from_pdf(file_path)

                if check_token and count_tokens_llama(document_text) > token_treshold:
                    # Store the full path (a bare file name cannot be removed
                    # later) and keep the oversized document out of the upload.
                    files_to_delete.append(file_path)
                    continue

            except Exception as e:
                logging.error(f"Ошибка при считывании текста для темы '{topic_name}', ключевого слова '{keyword}': {e}")
                print(f"Ошибка: {e} - при обработке темы '{topic_name}', ключевого слова '{keyword}'")
                continue

            # Build the record for this document
            all_documents.append({
                "ids": file_name.split('.pdf')[0],
                "documents": document_text,
                "metadata": {
                    "topic": topic_name,
                    "keyword": keyword,  # Keep the keyword in the metadata
                    "filename": file_name,
                }
            })

    if files_to_delete:
        print(f"Skipped {len(files_to_delete)} documents over the token threshold")
    # Deleting the oversized files is intentionally left disabled:
    # for file_path in files_to_delete:
    #     try:
    #         os.remove(file_path)
    #         print(f"Удалён: {file_path}")
    #     except Exception as e:
    #         print(f"Ошибка при удалении {file_path}: {e}")

    print(f"Extracted text from {len(all_documents)} documents for topic '{topic_name}'")
    upload_to_chromadb(all_documents, collection_name=COLLECTION_NAME, db_path=CHROMA_PATH)


## Запуск без сохранения

In [15]:
# topics = load_from_yaml(TOPIC_PATH)['topics']
# print(f"Loaded topics: {[topic['name'] for topic in topics]}")

# for topic in tqdm(topics):
#     process_topic(topic)

## Вариант с process_topic без функции, с разделением на считывание текста и загрузку в векторную базу

In [None]:
# Read every topic's PDFs into `all_documents`; the upload to the vector
# store happens in a separate cell. Documents over the token threshold are
# not collected; their paths go to `files_to_delete`.
topics = load_from_yaml(TOPIC_PATH)['topics']
print(f"Loaded topics: {[topic['name'] for topic in topics]}")

DOCUMENTS_FOLDER = 'dataset'

all_documents = []
files_to_delete = []
check_token = True
token_treshold = TOKEN_TRESHOLD

for topic in tqdm(topics):

    topic_name = topic['name']
    # os.path.join instead of a hard-coded "\" keeps this working off Windows.
    folder = os.path.join(DOCUMENTS_FOLDER, topic_name)
    if not os.path.isdir(folder):
        print(f"Папка {folder} не существует")
        continue

    for keyword in os.listdir(folder):
        folder_keywords = os.path.join(folder, keyword)
        if not os.path.isdir(folder_keywords):
            # Stray files next to the keyword folders would crash os.listdir.
            continue
        print(folder_keywords)

        for file_name in os.listdir(folder_keywords):
            if file_name.endswith('.pdf'):
                file_path = os.path.join(folder_keywords, file_name)
                try:
                    document_text = extract_text_from_pdf(file_path)

                    if check_token:     # Do not keep a document that exceeds the threshold
                        if count_tokens_llama(document_text) > token_treshold:
                            files_to_delete.append(file_path)
                            continue

                except Exception as e:
                    logging.error(f"Ошибка при считывании текста для темы '{topic_name}', ключевого слова '{keyword}': {e}")
                    print(f"Ошибка: {e} - при обработке темы '{topic_name}', ключевого слова '{keyword}'")
                    continue

                # Build the record for this document
                all_documents.append({
                    "ids": file_name.split('.pdf')[0],
                    "documents": document_text,
                    "metadata": {
                        "topic": topic_name,
                        "keyword": keyword,  # Keep the keyword in the metadata
                        "filename": file_name,
                    }
                })
    # all_documents accumulates across topics, so this is a running total.
    print(f"Extracted text from {len(all_documents)} documents for topic '{topic_name}'")

print("files_to_delete", len(files_to_delete))
# Deleting the files (disabled here; see the dedicated cell below)
# for file_path in files_to_delete:
#     try:
#         os.remove(file_path)
#         print(f"Удалён: {file_path}")
#     except Exception as e:
#         print(f"Ошибка при удалении {file_path}: {e}")



Loaded topics: ['Machine Learning', 'Data Analysis']


  0%|          | 0/2 [00:00<?, ?it/s]

dataset\Machine Learning\active learning
dataset\Machine Learning\autoML
dataset\Machine Learning\clustering
dataset\Machine Learning\decision trees
dataset\Machine Learning\deep learning
dataset\Machine Learning\early stopping
dataset\Machine Learning\ensemble methods
dataset\Machine Learning\gradient boosting
dataset\Machine Learning\hyperparameter tuning
dataset\Machine Learning\learning rate
dataset\Machine Learning\loss functions
dataset\Machine Learning\model interpretability
dataset\Machine Learning\model selection
dataset\Machine Learning\neural networks
dataset\Machine Learning\overfitting
dataset\Machine Learning\reinforcement learning
dataset\Machine Learning\supervised learning
dataset\Machine Learning\SVM
dataset\Machine Learning\transfer learning
dataset\Machine Learning\underfitting
dataset\Machine Learning\unsupervised learning


 50%|█████     | 1/2 [03:41<03:41, 221.58s/it]

Extracted text from 220 documents for topic 'Machine Learning'
dataset\Data Analysis\anova
dataset\Data Analysis\correlation analysis
dataset\Data Analysis\data aggregation
dataset\Data Analysis\data preprocessing
dataset\Data Analysis\data transformation
dataset\Data Analysis\data visualization
dataset\Data Analysis\descriptive statistics
dataset\Data Analysis\dimensionality reduction
dataset\Data Analysis\EDA
dataset\Data Analysis\feature engineering
dataset\Data Analysis\hypothesis testing
dataset\Data Analysis\missing data imputation
dataset\Data Analysis\normality tests
dataset\Data Analysis\outlier detection
Ошибка: ('Unhandled', 14) - при обработке темы 'Data Analysis', ключевого слова 'outlier detection'
dataset\Data Analysis\regression analysis
dataset\Data Analysis\sampling techniques
dataset\Data Analysis\scaling and normalization
dataset\Data Analysis\statistical analysis
dataset\Data Analysis\t-test
dataset\Data Analysis\time series analysis
dataset\Data Analysis\z-test


100%|██████████| 2/2 [06:24<00:00, 192.44s/it]

Extracted text from 436 documents for topic 'Data Analysis'
[]





In [11]:
# Remove every file queued in files_to_delete, reporting each outcome.
for file_path in files_to_delete:
    try:
        os.remove(file_path)
    except Exception as e:
        print(f"Ошибка при удалении {file_path}: {e}")
    else:
        print(f"Удалён: {file_path}")

Удалён: dataset\Machine Learning\active learning\1906.05194v1.pdf
Удалён: dataset\Machine Learning\active learning\2201.09433v2.pdf
Удалён: dataset\Machine Learning\active learning\2211.14819v2.pdf
Удалён: dataset\Machine Learning\active learning\2408.07364v2.pdf
Удалён: dataset\Machine Learning\autoML\2007.04074v3.pdf
Удалён: dataset\Machine Learning\autoML\2012.05390v3.pdf
Удалён: dataset\Machine Learning\autoML\2302.10827v3.pdf
Удалён: dataset\Machine Learning\autoML\2401.00379v1.pdf
Удалён: dataset\Machine Learning\clustering\1004.0694v1.pdf
Удалён: dataset\Machine Learning\clustering\1505.07872v1.pdf
Удалён: dataset\Machine Learning\clustering\1506.01942v1.pdf
Удалён: dataset\Machine Learning\clustering\1808.08317v1.pdf
Удалён: dataset\Machine Learning\clustering\2011.03720v2.pdf
Удалён: dataset\Machine Learning\clustering\2103.09329v1.pdf
Удалён: dataset\Machine Learning\clustering\2105.08348v1.pdf
Удалён: dataset\Machine Learning\cross-validation\1909.05299v5.pdf
Удалён: dataset

In [8]:
# Number of paths currently queued in files_to_delete.
len(files_to_delete)

0

In [24]:
# Inspect the paths currently queued in files_to_delete.
files_to_delete

['dataset\\Machine Learning\\cross-validation\\1909.05299v5.pdf',
 'dataset\\Machine Learning\\cross-validation\\1912.13132v1.pdf',
 'dataset\\Machine Learning\\cross-validation\\2003.00617v2.pdf',
 'dataset\\Machine Learning\\cross-validation\\2104.00673v4.pdf',
 'dataset\\Machine Learning\\cross-validation\\2206.08841v1.pdf',
 'dataset\\Machine Learning\\cross-validation\\2306.06591v2.pdf',
 'dataset\\Machine Learning\\cross-validation\\2406.01950v1.pdf']

In [9]:
# Number of document records collected in all_documents.
len(all_documents)

436

### upload в БД

Загрузка

In [85]:
# Embed every collected document and store it in the configured collection.
upload_to_chromadb(all_documents, collection_name=COLLECTION_NAME, db_path=CHROMA_PATH)

Uploaded 436 documents to collection 'multilingual-e5-large-embedding'.


## ---