## IMPORT

In [1]:
import yaml
import sys
import os
import re
import logging
import chromadb
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text
from llama_cpp import Llama

  from .autonotebook import tqdm as notebook_tqdm





## Config

In [2]:
def get_files(directory):
    return os.listdir(directory)

# Пример использования:
files_and_dirs = get_files("config/embedding/")
files_and_dirs

['all-MiniLM-L6-v2.yaml',
 'all-mpnet-base-v2.yaml',
 'multi-qa-mpnet-base-dot-v1.yaml',
 'paraphrase-multilingual-MiniLM-L12-v2.yaml',
 'questions_gen.yaml']

In [57]:
# CONFIG_PATH = "config/embedding/questions_gen.yaml"
CONFIG_PATH = "config/embedding/all-MiniLM-L6-v2.yaml"
# CONFIG_PATH = "config/embedding/all-mpnet-base-v2.yaml"
# CONFIG_PATH = "config/embedding/multi-qa-mpnet-base-dot-v1.yaml"
# CONFIG_PATH = "config/embedding/paraphrase-multilingual-MiniLM-L12-v2.yaml"

In [58]:
# === Настройка логирования ===
logging.basicConfig(
    filename="log/error_arxiv.txt",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

with open(f"{CONFIG_PATH}", "r") as file:
    config = yaml.safe_load(file)

CHROMA_PATH = config["chroma_path"]
COLLECTION_NAME = config["collection_name"]
MODEL_NAME = config.get("model_name", None)
# MODEL_NAME = "all-MiniLM-L6-v2"

MODEL_PATH = config.get("model_path", r"C:\Users\Igorexy\.lmstudio\models\MaziyarPanahi\Qwen2.5-7B-Instruct-GGUF\Qwen2.5-7B-Instruct.Q4_K_S.gguf")
TOKEN_TRESHOLD = config.get("token_treshold", 32768)

DOCUMENTS_FOLDER = 'dataset'
TOPIC_PATH = 'config/topics_short.yaml'

In [59]:
# === Загрузка модели эмбеддингов ===
model = SentenceTransformer(MODEL_NAME)

In [60]:
def load_from_yaml(file_path: str):
    """Считывает YAML файл и возвращает список тем."""
    with open(file_path, 'r') as file:
        data = yaml.load(file, Loader=yaml.FullLoader)
    return data



def replace_ligatures(text):
    ligatures = {
    "ﬀ": "ff", "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬅ": "ft", "ﬆ": "st",
    "Æ": "AE", "Œ": "OE", "Ǆ": "DZ", "ǅ": "Dz",
    "Ϝ": "W", "Ϟ": "KS",
    "Ꜳ": "AA", "ꜳ": "aa", "Ꜵ": "AO"
}

    pattern = re.compile("|".join(re.escape(k) for k in ligatures))
    return pattern.sub(lambda m: ligatures[m.group()], text)


def remove_after_last_references(text):
    matches = list(re.finditer(r'\bREFERENCES\b', text, re.IGNORECASE))
    
    if matches:
        last_match = matches[-1]  # Берём последнее вхождение REFERENCES
        before_text = text[:last_match.start()]  # Текст до последнего REFERENCES
        after_text = text[last_match.end():]  # Текст после последнего REFERENCES
        
        # Условие: удаляем текст после, если его меньше, чем до
        if len(after_text) < len(before_text):
            return before_text  # Возвращаем только текст до последнего REFERENCES
        else:
            return text  # Если после REFERENCES текста больше или равно, ничего не удаляем
    
    return text  # Если REFERENCES нет, возвращаем исходный текст


def clean_text(text):
    text = replace_ligatures(text)  # Удаляем лигатуры
    text = re.sub(r'\f', '', text)  # Удаляем символы \f
    text = remove_after_last_references(text)   # Удаление ссылок на литературу
    text = re.sub(r'(?m)^.$', '', text)  # Удаляем строки с одним символом
    text = re.sub(r'(?<![.!?])\n(?!\n)', ' ', text)  # Убираем лишние переносы строк
    text = re.sub(r'(?<=\w)-\n', '', text)  # Убираем переносы слов
    text = re.sub(r'\n{2,}', '\n', text)  # Сводим подряд идущие переносы строк к одному
    text = re.sub(r'\d{4,}.*', '', text)  # Удаляем непонятные числовые строки
    text = re.sub(r'(?m)^\s*\d+\.?\s*$', '', text)  # Удаляем строки с номерами
    text = re.sub(r'(?m)^([A-Za-z]+\s*){1,3}\d+$', '', text)  # Удаляем табличные данные
    return text.strip()


def extract_text_from_pdf(file_path: str):
    """Извлекает текст из PDF файла с очисткой."""
    text = extract_text(file_path)
    cleaned_text = clean_text(text)
    return cleaned_text

llm_token_check = Llama(model_path=MODEL_PATH, n_ctx=32768, verbose=False)
def count_tokens_llama(text):
    return len(llm_token_check.tokenize(text.encode("utf-8"), add_bos=False))


def embed_texts(texts):
    """Создает эмбеддинги для списка текстов."""
    return model.encode(texts).tolist()

def upload_to_chromadb(documents, collection_name, db_path="./chroma_storage"):
    """Загружает документы в ChromaDB."""
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(name=collection_name)
    
    for doc in documents:
        data = {
            "ids": [doc["ids"]],
            "documents": [doc["documents"]],
            "metadatas": [doc["metadata"]],
            "embeddings": [embed_texts([doc["documents"]])[0]]
        }
        collection.add(**data)
    
    print(f"Uploaded {len(documents)} documents to collection '{collection_name}'.")


def upload_unique_to_chromadb(documents, collection_name, db_path="./chroma_storage"):
    """Добавляет в ChromaDB только новые документы, которых нет в базе."""
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(name=collection_name)
    
    # Получаем список уже существующих ids в коллекции
    existing_ids = set(collection.get()['ids'])
    
    new_documents = [doc for doc in documents if doc["ids"] not in existing_ids]
    
    if not new_documents:
        print("No new documents to upload.")
        return
    
    data = {
        "ids": [doc["ids"] for doc in new_documents],
        "documents": [doc["documents"] for doc in new_documents],
        "metadatas": [doc["metadata"] for doc in new_documents],
        "embeddings": embed_texts([doc["documents"] for doc in new_documents])
    }
    
    collection.add(**data)
    
    print(f"Uploaded {len(new_documents)} new documents to collection '{collection_name}'.")

def process_topic(topic, check_token=False, token_treshold=TOKEN_TRESHOLD):
    """Обрабатывает все документы по данной теме и загружает в ChromaDB."""
    topic_name = topic['name']
    folder = rf"{DOCUMENTS_FOLDER}\{topic_name}"
    all_documents = []
    files_to_delete = []
    
    for keyword in os.listdir(folder):
        folder_keywords = os.path.join(folder, keyword)
        print(folder_keywords)
        
        for file_name in os.listdir(folder_keywords):
            if file_name.endswith('.pdf'):
                file_path = os.path.join(folder_keywords, file_name)
                try:
                    document_text = extract_text_from_pdf(file_path)

                    if check_token:
                        if count_tokens_llama(document_text) > token_treshold:
                            files_to_delete.append(file_name)
                    
                except Exception as e:
                    logging.error(f"Ошибка при считывании текста для темы '{topic_name}', ключевого слова '{keyword}': {e}")
                    print(f"Ошибка: {e} - при обработке темы '{topic_name}', ключевого слова '{keyword}'")
                    continue
                
                # Формируем записи для документа
                all_documents.append({
                    "ids": file_name.split('.pdf')[0],
                    "documents": document_text,
                    "metadata": {
                        "topic": topic_name,
                        "keyword": keyword,  # Добавляем ключевое слово в метаданные
                        "filename": file_name,
                    }
                })
    # Удаление файлов
    # for file_path in files_to_delete:
    #     try:
    #         os.remove(file_path)
    #         print(f"Удалён: {file_path}")
    #     except Exception as e:
    #         print(f"Ошибка при удалении {file_path}: {e}")


    print(f"Extracted text from {len(all_documents)} documents for topic '{topic_name}'")
    upload_to_chromadb(all_documents, collection_name=COLLECTION_NAME, db_path=CHROMA_PATH)


In [8]:
text = extract_text_from_pdf(r"E:\ImportantFiles\Documents\University\Magic App\dataset\Machine Learning\clustering\0105522v1.pdf")

count_tokens_llama(text)

4908

## Запуск без сохранения

In [7]:
topics = load_from_yaml(TOPIC_PATH)['topics']
print(f"Loaded topics: {[topic['name'] for topic in topics]}")

for topic in tqdm(topics):
    process_topic(topic)

Loaded topics: ['Machine Learning', 'Data Analysis', 'Optimization Techniques', 'Natural Language Processing', 'Computer Vision', 'Theoretical Foundations', 'Applied AI']


  0%|          | 0/7 [00:00<?, ?it/s]

dataset_short\Machine Learning\clustering


  0%|          | 0/7 [00:04<?, ?it/s]


KeyboardInterrupt: 

## Вариант с process_topic без функции, с разделением на считывание текста и загрузку в векторную базу

In [7]:
topics = load_from_yaml(TOPIC_PATH)['topics']
print(f"Loaded topics: {[topic['name'] for topic in topics]}")

DOCUMENTS_FOLDER = 'dataset'
TOPIC_PATH = 'config/topics_short.yaml'

all_documents = []
files_to_delete = []
check_token = True
token_treshold = TOKEN_TRESHOLD

for topic in tqdm(topics):

    topic_name = topic['name']
    folder = rf"{DOCUMENTS_FOLDER}\{topic_name}"
    if os.path.isdir(folder) == False:
        print(f"Папка {folder} не существует")
        continue

    for keyword in os.listdir(folder):
        folder_keywords = os.path.join(folder, keyword)
        print(folder_keywords)
        
        for file_name in os.listdir(folder_keywords):
            if file_name.endswith('.pdf'):
                file_path = os.path.join(folder_keywords, file_name)
                try:
                    document_text = extract_text_from_pdf(file_path)

                    if check_token:     # Не записываем док, если превышает пороговое значение
                        if count_tokens_llama(document_text) > token_treshold:
                            files_to_delete.append(folder_keywords + "\\" + file_name)
                            continue   

                except Exception as e:
                    logging.error(f"Ошибка при считывании текста для темы '{topic_name}', ключевого слова '{keyword}': {e}")
                    print(f"Ошибка: {e} - при обработке темы '{topic_name}', ключевого слова '{keyword}'")
                    continue
                
                # Формируем записи для документа
                all_documents.append({
                    "ids": file_name.split('.pdf')[0],
                    "documents": document_text,
                    "metadata": {
                        "topic": topic_name,
                        "keyword": keyword,  # Добавляем ключевое слово в метаданные
                        "filename": file_name,
                    }
                })
    print(f"Extracted text from {len(all_documents)} documents for topic '{topic_name}'")

print(files_to_delete)
# Удаление файлов
# for file_path in files_to_delete:
#     try:
#         os.remove(file_path)
#         print(f"Удалён: {file_path}")
#     except Exception as e:
#         print(f"Ошибка при удалении {file_path}: {e}")



Loaded topics: ['Machine Learning', 'Data Analysis', 'Optimization Techniques', 'Natural Language Processing', 'Computer Vision', 'Theoretical Foundations', 'Applied AI']


  0%|          | 0/7 [00:00<?, ?it/s]

dataset\Machine Learning\clustering
dataset\Machine Learning\decision trees
dataset\Machine Learning\deep learning
dataset\Machine Learning\ensemble methods
dataset\Machine Learning\neural networks
dataset\Machine Learning\reinforcement learning
dataset\Machine Learning\supervised learning
dataset\Machine Learning\SVM
dataset\Machine Learning\unsupervised learning


 14%|█▍        | 1/7 [01:10<07:05, 70.88s/it]

Extracted text from 107 documents for topic 'Machine Learning'
dataset\Data Analysis\data preprocessing
dataset\Data Analysis\data visualization
dataset\Data Analysis\dimensionality reduction
dataset\Data Analysis\feature engineering
dataset\Data Analysis\regression analysis
dataset\Data Analysis\statistical analysis
dataset\Data Analysis\time series analysis


 29%|██▊       | 2/7 [02:07<05:11, 62.35s/it]

Extracted text from 194 documents for topic 'Data Analysis'
dataset\Optimization Techniques\bayesian optimization
dataset\Optimization Techniques\convex optimization
dataset\Optimization Techniques\evolutionary algorithms
dataset\Optimization Techniques\genetic algorithms
dataset\Optimization Techniques\gradient descent
dataset\Optimization Techniques\linear programming


 43%|████▎     | 3/7 [02:52<03:37, 54.36s/it]

Extracted text from 275 documents for topic 'Optimization Techniques'
dataset\Natural Language Processing\language modeling
dataset\Natural Language Processing\sentiment analysis
dataset\Natural Language Processing\sequence-to-sequence models
dataset\Natural Language Processing\text mining
dataset\Natural Language Processing\topic modeling
dataset\Natural Language Processing\transformers
dataset\Natural Language Processing\word embeddings


 57%|█████▋    | 4/7 [03:25<02:18, 46.26s/it]

Extracted text from 354 documents for topic 'Natural Language Processing'
dataset\Computer Vision\convolutional neural networks
dataset\Computer Vision\generative adversarial networks
dataset\Computer Vision\image classification
dataset\Computer Vision\image segmentation
dataset\Computer Vision\object detection
dataset\Computer Vision\transfer learning


 71%|███████▏  | 5/7 [03:55<01:20, 40.17s/it]

Extracted text from 423 documents for topic 'Computer Vision'
dataset\Theoretical Foundations\Bayesian inference
dataset\Theoretical Foundations\complexity theory
dataset\Theoretical Foundations\information theory
dataset\Theoretical Foundations\probability theory
dataset\Theoretical Foundations\statistical learning theory


 86%|████████▌ | 6/7 [04:27<00:37, 37.54s/it]

Extracted text from 483 documents for topic 'Theoretical Foundations'
dataset\Applied AI\AI ethics
dataset\Applied AI\anomaly detection
dataset\Applied AI\autonomous systems
dataset\Applied AI\forecasting models
dataset\Applied AI\recommender systems
dataset\Applied AI\robotics


100%|██████████| 7/7 [05:24<00:00, 46.29s/it]

Extracted text from 551 documents for topic 'Applied AI'
[]





In [9]:
# Удаление файлов
for file_path in files_to_delete:
    try:
        os.remove(file_path)
        print(f"Удалён: {file_path}")
    except Exception as e:
        print(f"Ошибка при удалении {file_path}: {e}")

Удалён: dataset\Machine Learning\clustering\0306145v1.pdf
Удалён: dataset\Machine Learning\clustering\0906.2145v1.pdf
Удалён: dataset\Machine Learning\clustering\1004.0694v1.pdf
Удалён: dataset\Machine Learning\clustering\1209.4257v1.pdf
Удалён: dataset\Machine Learning\clustering\1307.4838v1.pdf
Удалён: dataset\Machine Learning\clustering\1412.2601v2.pdf
Удалён: dataset\Machine Learning\clustering\1503.02059v1.pdf
Удалён: dataset\Machine Learning\clustering\1506.06327v1.pdf
Удалён: dataset\Machine Learning\decision trees\1004.0436v1.pdf
Удалён: dataset\Machine Learning\decision trees\1206.4620v1.pdf
Удалён: dataset\Machine Learning\decision trees\1805.08328v2.pdf
Удалён: dataset\Machine Learning\decision trees\1909.13488v2.pdf
Удалён: dataset\Machine Learning\decision trees\2003.04952v2.pdf
Удалён: dataset\Machine Learning\decision trees\2006.14118v1.pdf
Удалён: dataset\Machine Learning\decision trees\2010.06631v2.pdf
Удалён: dataset\Machine Learning\decision trees\2012.08735v2.pdf
Уд

In [8]:
len(files_to_delete)

0

In [53]:
len(all_documents)

551

### upload в БД

Загрузка

In [86]:
upload_to_chromadb(all_documents, collection_name=COLLECTION_NAME, db_path=CHROMA_PATH)

Uploaded 551 documents to collection 'all-MiniLM-L6-v2-embedding'.


## ---

In [20]:
lenght_list = [len(doc['documents']) for doc in all_documents]

In [26]:
import numpy as np

np.array(lenght_list).mean()

27606.734831460675

In [33]:
all_documents[0]

{'ids': '0012536v1',
 'documents': 'Draft version April 26, \n OPTICAL AND X-RAY CLUSTERS AS TRACERS OF THE SUPERCLUSTER-VOID NETWORK.\nI SUPERCLUSTERS OF ABELL AND X-RAY CLUSTERS M. Einasto1, J. Einasto 1, E. Tago1, V. M¨uller 2 & H. Andernach3 Draft version April 26, \n ABSTRACT\n We study the distribution of X-ray selected clusters of galaxies with respect to superclusters determined by Abell clusters of galaxies and show that the distribution of X-ray clusters follows the supercluster-void network determined by Abell clusters. We find that in this network X-ray clusters are more strongly clustered than other clusters: the fraction of X-ray clusters is higher in rich superclusters, and the fraction of isolated X-ray clusters is lower than the fraction of isolated Abell clusters. There is no clear correlation between X-ray luminosity of clusters and their host supercluster richness. Poor, non-Abell X-ray clusters follow the supercluster-void network as well: these clusters are embedd

In [11]:
count_tokens_llama(all_documents[-1]['documents'])

5236

In [32]:
all_documents[-1]

{'ids': '2005.07474v1',
 'documents': 'Robot Accident Investigation: a case study in Responsible Robotics\n Alan F.T. Winfield, Katie Winkle, Helena Webb, Ulrik Lyngs, Marina Jirotka and Carl Macrae\n Abstract Robot accidents are inevitable. Although rare, they have been happening since assembly-line robots were first introduced in the \n A Winfield, K Winkle Bristol Robotics Lab, UWE Bristol, UK. e-mail: alan.winfield@brl.ac.uk,katie.\nwinkle@brl.ac.uk\n H Webb, U Lyngs, M Jirotka Department of Computer Science, University of Oxford, UK. e-mail: helena.webb@cs.ox.\nac.uk,ulrik.lyngs@cs.ox.ac.uk,marina.jirotka@cs.ox.ac.uk\n C Macrae Nottingham University Business School, University of Nottingham, UK. e-mail: Carl.Macrae@ nottingham.ac.uk\n Winfield, Winkle, Webb, Lyngs, Jirotka and Macrae\n 1 Introduction\n What could possibly go wrong?\nImagine that your elderly mother, or grandmother, has an assisted living robot to help her live independently at home. The robot is capable of fetchin