In [2]:
import yaml
import arxiv
from tqdm import tqdm
import os
import logging


# Настройка логирования
logging.basicConfig(
    filename="log/error_arxiv.txt",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

In [2]:
# === Считывание тем из YAML ===
def load_topics_from_yaml(file_path: str):
    """Считывает YAML файл и возвращает список тем."""
    with open(file_path, 'r') as file:
        data = yaml.safe_load(file)
    return data['topics']


# === Загрузка статей с arXiv ===
def download_arxiv_papers(query: str, max_results: int, download_folder: str):
    """Загружает статьи с arXiv по запросу."""
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        # sort_by=arxiv.SortCriterion.SubmittedDate
    )

    for result in search.results():
        paper_id = result.entry_id.split('/')[-1]
        pdf_path = os.path.join(download_folder, f'{paper_id}.pdf')
        if not os.path.exists(pdf_path):
            print(f"Downloading {result.title}...")
            result.download_pdf(download_folder, f'{paper_id}.pdf')
            print(f"Saved to {pdf_path}")
        else:
            print(f"{result.title} already downloaded.")

In [3]:
# Шаг 1. Считываем темы из YAML
topics = load_topics_from_yaml('config/topics_short.yaml')
print(f"Loaded topics: {[topic['name'] for topic in topics]}")

Loaded topics: ['Machine Learning', 'Data Analysis', 'Optimization Techniques', 'Natural Language Processing', 'Computer Vision', 'Theoretical Foundations', 'Applied AI']


In [6]:
# Шаг 2. Загрузка статей по каждой теме
max_results = 50


for topic in tqdm(topics[:6]):
    topic_name = topic['name']
    folder = f"dataset/{topic_name}"#.replace(' ', '_')}"

    for keyword in topic['keywords']:
        query = f"all:\"{keyword}\""
        try:
            download_arxiv_papers(query, max_results=max_results, download_folder=folder + f'/{keyword}')
        except Exception as e:
            # Логирование ошибки
            logging.error(f"Ошибка при загрузке статей для темы '{topic_name}', ключевого слова '{keyword}': {e}")
            # (Необязательно) Вывод сообщения об ошибке в консоль
            print(f"Ошибка: {e} - при обработке темы '{topic_name}', ключевого слова '{keyword}'")

  for result in search.results():


Downloading A Review of Semi Supervised Learning Theories and Recent Advances...
Saved to dataset/Machine Learning/supervised learning\1905.11590v1.pdf
Downloading Confidence-Guided Semi-supervised Learning in Land Cover Classification...
Saved to dataset/Machine Learning/supervised learning\2305.10344v2.pdf
Downloading Homomorphic Self-Supervised Learning...
Saved to dataset/Machine Learning/supervised learning\2211.08282v1.pdf
Downloading Interpolation-based semi-supervised learning for object detection...
Saved to dataset/Machine Learning/supervised learning\2006.02158v2.pdf
Downloading A Brief Summary of Interactions Between Meta-Learning and Self-Supervised Learning...
Saved to dataset/Machine Learning/supervised learning\2103.00845v2.pdf
Downloading Investigating a Baseline Of Self Supervised Learning Towards Reducing Labeling Costs For Image Classification...
Saved to dataset/Machine Learning/supervised learning\2108.07464v1.pdf
Downloading Hyperspherical Consistency Regularizat

 17%|█▋        | 1/6 [07:23<36:55, 443.04s/it]

Saved to dataset/Machine Learning/deep learning\2107.13614v1.pdf
Downloading Multivariate Statistical Analysis: A Geometric Perspective...
Saved to dataset/Data Analysis/statistical analysis\0902.0408v1.pdf
Downloading Discussion of: Statistical analysis of an archeological find...
Saved to dataset/Data Analysis/statistical analysis\0804.0088v1.pdf
Downloading Rejoinder of: Statistical analysis of an archeological find...
Saved to dataset/Data Analysis/statistical analysis\0804.0103v1.pdf
Downloading From Clicks to Conversations: Evaluating the Effectiveness of Conversational Agents in Statistical Analysis...
Saved to dataset/Data Analysis/statistical analysis\2502.08114v2.pdf
Downloading NESTLE: a No-Code Tool for Statistical Analysis of Legal Corpus...
Saved to dataset/Data Analysis/statistical analysis\2309.04146v2.pdf
Downloading Discussion of: Statistical analysis of an archeological find...
Saved to dataset/Data Analysis/statistical analysis\0804.0080v1.pdf
Downloading Discussion

 33%|███▎      | 2/6 [12:35<24:24, 366.23s/it]

Saved to dataset/Data Analysis/regression analysis\1912.12354v2.pdf
Downloading Gradient descent in some simple settings...
Saved to dataset/Optimization Techniques/gradient descent\1808.04839v2.pdf
Downloading Scaling transition from momentum stochastic gradient descent to plain stochastic gradient descent...
Saved to dataset/Optimization Techniques/gradient descent\2106.06753v1.pdf
Downloading MBGDT:Robust Mini-Batch Gradient Descent...
Saved to dataset/Optimization Techniques/gradient descent\2206.07139v1.pdf
Downloading Applying Adaptive Gradient Descent to solve matrix factorization...
Saved to dataset/Optimization Techniques/gradient descent\2010.10280v1.pdf
Downloading Occam Gradient Descent...
Saved to dataset/Optimization Techniques/gradient descent\2405.20194v7.pdf
Downloading A Stochastic Gradient Descent Theorem and the Back-Propagation Algorithm...
Saved to dataset/Optimization Techniques/gradient descent\2104.00539v1.pdf
Downloading Elastic Gradient Descent, an Iterative 

 50%|█████     | 3/6 [16:52<15:49, 316.34s/it]

Saved to dataset/Optimization Techniques/linear programming\2211.11343v1.pdf
Downloading Typesafe Modeling in Text Mining...
Saved to dataset/Natural Language Processing/text mining\1108.0363v1.pdf
Downloading Very Large Language Model as a Unified Methodology of Text Mining...
Saved to dataset/Natural Language Processing/text mining\2212.09271v2.pdf
Downloading Cross-institution text mining to uncover clinical associations: a case study relating social factors and code status in intensive care medicine...
Saved to dataset/Natural Language Processing/text mining\2301.06570v1.pdf
Downloading Pbm: A new dataset for blog mining...
Saved to dataset/Natural Language Processing/text mining\1201.2073v1.pdf
Downloading Scalable Text Mining with Sparse Generative Models...
Saved to dataset/Natural Language Processing/text mining\1602.02332v1.pdf
Downloading Sifaka: Text Mining Above a Search API...
Saved to dataset/Natural Language Processing/text mining\1810.02907v1.pdf
Downloading Population-

 67%|██████▋   | 4/6 [19:57<08:48, 264.46s/it]

Downloading Color and Shape Content Based Image Classification using RBF Network and PSO Technique: A Survey...
Saved to dataset/Computer Vision/image classification\1311.6881v1.pdf
Downloading Image Classification for Arabic: Assessing the Accuracy of Direct English to Arabic Translations...
Saved to dataset/Computer Vision/image classification\1807.05206v2.pdf
Downloading A Technical Report for VIPriors Image Classification Challenge...
Saved to dataset/Computer Vision/image classification\2007.08722v1.pdf
Downloading Genetic Programming-Based Evolutionary Deep Learning for Data-Efficient Image Classification...
Saved to dataset/Computer Vision/image classification\2209.13233v1.pdf
Downloading FewSAR: A Few-shot SAR Image Classification Benchmark...
Saved to dataset/Computer Vision/image classification\2306.09592v1.pdf
Downloading Deep Metric Learning for Few-Shot Image Classification: A Review of Recent Developments...
Saved to dataset/Computer Vision/image classification\2105.08149

 83%|████████▎ | 5/6 [27:28<05:31, 331.81s/it]

Saved to dataset/Computer Vision/generative adversarial networks\1807.06489v1.pdf
Downloading Quantum Mechanics as an Exotic Probability Theory...
Saved to dataset/Theoretical Foundations/probability theory\9509004v1.pdf
Downloading Extending and Automating Basic Probability Theory with Propositional Computability Logic...
Saved to dataset/Theoretical Foundations/probability theory\1909.07375v3.pdf
Downloading An analogue of Szego's limit theorem in free probability theory...
Saved to dataset/Theoretical Foundations/probability theory\0706.0750v2.pdf
Downloading A non-crossing word cooperad for free homotopy probability theory...
Saved to dataset/Theoretical Foundations/probability theory\1602.08867v1.pdf
Downloading Probability theory and public-key cryptography...
Saved to dataset/Theoretical Foundations/probability theory\2006.01607v1.pdf
Downloading A quantum invitation to probability theory...
Saved to dataset/Theoretical Foundations/probability theory\2012.06355v1.pdf
Downloading

100%|██████████| 6/6 [31:17<00:00, 312.84s/it]

Saved to dataset/Theoretical Foundations/Bayesian inference\2410.07605v2.pdf





In [3]:
def count_files_in_directory(directory):
    count = 0
    for _, _, files in os.walk(directory):
        count += len(files)
    return count

directory = r"E:\ImportantFiles\Documents\University\Magic App\dataset"
print(f"Число файлов: {count_files_in_directory(directory)}")


Число файлов: 551


# удаление

In [8]:
import os

def remove_extra_files(root_dir, max_files=25):
    for dirpath, _, filenames in os.walk(root_dir):
        if len(filenames) > max_files:
            filenames.sort()  # Можно поменять на sort(reverse=True), если нужно удалять сначала новые файлы
            files_to_remove = filenames[max_files:]
            
            for file in files_to_remove:
                file_path = os.path.join(dirpath, file)
                try:
                    os.remove(file_path)
                    print(f"Удалён файл: {file_path}")
                except Exception as e:
                    print(f"Ошибка при удалении {file_path}: {e}")

if __name__ == "__main__":
    root_directory = r"E:\ImportantFiles\Documents\University\Magic App\dataset"  # Укажите путь к корневой папке
    remove_extra_files(root_directory)

Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2202.08792v2.pdf
Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2206.07635v2.pdf
Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2207.01493v2.pdf
Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2208.04697v1.pdf
Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2210.08984v1.pdf
Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2304.07249v1.pdf
Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2305.09573v1.pdf
Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2307.10057v1.pdf
Удалён файл: E:\ImportantFiles\Documents\University\Magic App\dataset\Applied AI\AI ethics\2309.13057v3.pdf
Удалён файл: E:\ImportantFil