# Загрузка документов и сохранение эмбеддингов в базу

In [1]:
import yaml
import arxiv
import os
from langchain_community.document_loaders import PyPDFLoader
import chromadb
from tqdm import tqdm
import logging

In [2]:
# === Считывание тем из YAML ===
def load_topics_from_yaml(file_path: str):
    """Считывает YAML файл и возвращает список тем."""
    with open(file_path, 'r') as file:
        data = yaml.safe_load(file)
    return data['topics']

In [3]:
# === Загрузка статей с arXiv ===
def download_arxiv_papers(query: str, max_results: int, download_folder: str):
    """Загружает статьи с arXiv по запросу."""
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        # sort_by=arxiv.SortCriterion.SubmittedDate
    )

    for result in search.results():
        paper_id = result.entry_id.split('/')[-1]
        pdf_path = os.path.join(download_folder, f'{paper_id}.pdf')
        if not os.path.exists(pdf_path):
            print(f"Downloading {result.title}...")
            result.download_pdf(download_folder, f'{paper_id}.pdf')
            print(f"Saved to {pdf_path}")
        else:
            print(f"{result.title} already downloaded.")

In [None]:
# === Считывание текста из PDF ===
def extract_text_from_pdf(file_path: str, extract_images=True):
    """Извлекает текст из PDF файла."""
    loader = PyPDFLoader(file_path, extract_images=extract_images)
    pages = loader.load()
    document_text = ' '.join([page.page_content for page in pages])
    return document_text, pages

In [5]:
# === Загрузка данных в ChromaDB ===
def upload_to_chromadb(documents: list, collection_name: str, db_path: str = "./chroma_storage"):
    """Добавляет документы в коллекцию ChromaDB."""
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(name=collection_name)

    for doc in documents:
        collection.add(
            ids=[doc["id"]],
            documents=[doc["content"]],
            metadatas=[doc["metadata"]]
        )
    print(f"Uploaded {len(documents)} documents to collection '{collection_name}'.")

In [13]:
# Настройка логирования
logging.basicConfig(
    filename="log/error_arxiv.txt",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s"
)


# Запуск

In [20]:
# Шаг 1. Считываем темы из YAML
topics = load_topics_from_yaml('config/topics.yaml')
print(f"Loaded topics: {[topic['name'] for topic in topics]}")

Loaded topics: ['Machine Learning', 'Data Analysis', 'Optimization Techniques', 'Natural Language Processing', 'Computer Vision', 'Theoretical Foundations', 'Applied AI']


In [None]:
# Шаг 2. Загрузка статей по каждой теме

for topic in tqdm(topics):
    topic_name = topic['name']
    folder = f"dataset/{topic_name}"#.replace(' ', '_')}"

    for keyword in topic['keywords']:
        query = f"all:\"{keyword}\""
        try:
            download_arxiv_papers(query, max_results=20, download_folder=folder + f'/{keyword}')
        except Exception as e:
            # Логирование ошибки
            logging.error(f"Ошибка при загрузке статей для темы '{topic_name}', ключевого слова '{keyword}': {e}")
            # (Необязательно) Вывод сообщения об ошибке в консоль
            print(f"Ошибка: {e} - при обработке темы '{topic_name}', ключевого слова '{keyword}'")

  for result in search.results():


Downloading Quantum Mechanics as an Exotic Probability Theory...
Saved to dataset/Theoretical Foundations/probability theory\9509004v1.pdf
Downloading Extending and Automating Basic Probability Theory with Propositional Computability Logic...
Saved to dataset/Theoretical Foundations/probability theory\1909.07375v3.pdf
Downloading An analogue of Szego's limit theorem in free probability theory...
Saved to dataset/Theoretical Foundations/probability theory\0706.0750v2.pdf
Downloading A non-crossing word cooperad for free homotopy probability theory...
Saved to dataset/Theoretical Foundations/probability theory\1602.08867v1.pdf
Downloading Probability theory and public-key cryptography...
Saved to dataset/Theoretical Foundations/probability theory\2006.01607v1.pdf
Downloading A quantum invitation to probability theory...
Saved to dataset/Theoretical Foundations/probability theory\2012.06355v1.pdf
Downloading Physics with exotic probability theory...
Saved to dataset/Theoretical Foundation

 50%|█████     | 1/2 [01:34<01:34, 94.62s/it]

Saved to dataset/Theoretical Foundations/Bayesian inference\1606.07937v1.pdf
Downloading Overhead-free User-side Recommender Systems...
Saved to dataset/Applied AI/recommender systems\2411.07589v1.pdf
Downloading Towards Principled User-side Recommender Systems...
Saved to dataset/Applied AI/recommender systems\2208.09864v1.pdf
Downloading Matching Theory-based Recommender Systems in Online Dating...
Saved to dataset/Applied AI/recommender systems\2208.11384v1.pdf
Downloading FairRoad: Achieving Fairness for Recommender Systems with Optimized Antidote Data...
Saved to dataset/Applied AI/recommender systems\2212.06750v1.pdf
Downloading Towards Robust Recommendation: A Review and an Adversarial Robustness Evaluation Library...
Saved to dataset/Applied AI/recommender systems\2404.17844v1.pdf
Downloading The Use of Machine Learning Algorithms in Recommender Systems: A Systematic Review...
Saved to dataset/Applied AI/recommender systems\1511.05263v4.pdf
Downloading Leveraging Social Signal 

100%|██████████| 2/2 [05:09<00:00, 154.71s/it]

Saved to dataset/Applied AI/AI ethics\2206.07635v2.pdf





In [14]:
topics

[{'name': 'Machine Learning',
  'keywords': ['supervised learning',
   'unsupervised learning',
   'reinforcement learning',
   'neural networks',
   'decision trees',
   'SVM',
   'ensemble methods',
   'clustering',
   'deep learning']},
 {'name': 'Data Analysis',
  'keywords': ['statistical analysis',
   'data visualization',
   'dimensionality reduction',
   'feature engineering',
   'data preprocessing',
   'time series analysis',
   'regression analysis']},
 {'name': 'Optimization Techniques',
  'keywords': ['gradient descent',
   'convex optimization',
   'evolutionary algorithms',
   'genetic algorithms',
   'bayesian optimization',
   'linear programming']},
 {'name': 'Natural Language Processing',
  'keywords': ['text mining',
   'language modeling',
   'sentiment analysis',
   'topic modeling',
   'transformers',
   'word embeddings',
   'sequence-to-sequence models']},
 {'name': 'Computer Vision',
  'keywords': ['image classification',
   'object detection',
   'image segme

In [30]:
[topics[0]]

[{'name': 'Machine Learning',
  'keywords': ['supervised learning',
   'unsupervised learning',
   'reinforcement learning',
   'neural networks',
   'decision trees',
   'SVM',
   'ensemble methods',
   'clustering',
   'deep learning']}]

In [None]:
# Шаг 3. Извлечение текста из PDF

all_documents = []
for topic in [topics[0]]:
    topic_name = topic['name']
    folder = rf"dataset\{topic_name}"
    for keyword in os.listdir(folder):
        folder_keywords = os.path.join(folder, keyword)
        print(folder_keywords)
        for file_name in os.listdir(folder_keywords):
            if file_name.endswith('.pdf'):
                file_path = os.path.join(folder_keywords, file_name)
                try:
                    document_text, _ = extract_text_from_pdf(file_path, extract_images=False)
                except Exception as e:
                    logging.error(f"Ошибка при считывани текста для темы '{topic_name}', ключевого слова '{keyword}': {e}")
                    print(f"Ошибка: {e} - при обработке темы '{topic_name}', ключевого слова '{keyword}'")
                all_documents.append({
                    "id": file_name.split('.pdf')[0],
                    "content": document_text,
                    "metadata": {
                        "topic": topic_name,
                        "keyword": keyword,  # Добавляем ключевое слово в метаданные
                        "filename": file_name
                    }
                })
print(f"Extracted text from {len(all_documents)} documents.")

dataset\Machine Learning
dataset\Machine Learning\clustering
dataset\Machine Learning\decision trees
dataset\Machine Learning\deep learning
Ошибка: cannot access local variable 'v' where it is not associated with a value - при обработке темы 'Machine Learning', ключевого слова 'deep learning'
dataset\Machine Learning\ensemble methods
dataset\Machine Learning\neural networks
dataset\Machine Learning\reinforcement learning
dataset\Machine Learning\supervised learning
dataset\Machine Learning\SVM
dataset\Machine Learning\unsupervised learning
Extracted text from 180 documents.


In [33]:
# Шаг 4. Загрузка документов в ChromaDB
upload_to_chromadb(all_documents, collection_name="magic_document")

Uploaded 180 documents to collection 'magic_document'.
