In [1]:
!pip install arxiv -q
!pip install sentence-transformers faiss-cpu transformers torch -q
!pip install arxiv pdfplumber tqdm python-dotenv fitz tools -q
!pip install PyPDF2 -q
!pip install googletrans -q
!pip install ragas -q
!pip install torch -q
!pip install langchain_community -q
!pip install rouge bert_score -q

In [4]:
# if u need to clear papers, then uncomment code below
# ///////////////////////////////////////////////////

# import os
# def delete_files_in_folder(folder_path):
#     for filename in os.listdir(folder_path):
#         file_path = os.path.join(folder_path, filename)
#         try:
#             if os.path.isfile(file_path):
#                 os.remove(file_path)
#         except Exception as e:
#             print(f'Ошибка при удалении файла {file_path}: {e}')

# # Пример вызова
# delete_files_in_folder("papers")


In [5]:
import arxiv
import os
from tqdm import tqdm
client = arxiv.Client()
def download_articles(keywords, client, max_results=100, output_dir="papers"):
    # Создаем директорию для сохранения PDF
    os.makedirs(output_dir, exist_ok=True)

    # Формируем запрос
    query = " AND ".join([f'abs:"{kw}"' for kw in keywords])
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    results = client.results(search)
    # Скачивание статей
    for result in tqdm(results, desc="Downloading"):
        try:
            result.download_pdf(dirpath=output_dir, filename=f"{result.get_short_id()}.pdf")
        except Exception as e:
            print(f"Ошибка при загрузке {result.entry_id}: {e}")

# Пример вызова
keywords = ["spiking neural network"]
download_articles(keywords, client, max_results=10)

Downloading: 10it [00:44,  4.43s/it]


In [6]:
# import pdfplumber  # склеивает слова
import json
import logging
import PyPDF2
import re
from googletrans import Translator


def clean_text(text):
    # Удаление формул в $...$ и $$...$$
    text = re.sub(r"\$.*?\$", "", text, flags=re.DOTALL) # DOTALL
    text = re.sub(r"\$\$.*?\$\$", "", text, flags=re.DOTALL) # DOTALL
    text = re.sub(r"\b(fig\w*|pic\w*)\b", '', text, flags=re.IGNORECASE).strip()

    # Удаление URL и спецсимволов
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^\w\s.,;:!?()-]", "", text)
    text = re.sub(r"\b(fig\w*|pic\w*)\b", '', text, flags=re.IGNORECASE).strip()

    text = re.sub(r"[a-zA-Z]\([^\)]*\)", "", text)
    # Вставить пробелы между слитными словами (буква + заглавная буква)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    # Удалить спецсимволы, кроме базовых знаков препинания
    text = re.sub(r"[^a-zA-Z0-9\s.,;:!?()-]", "", text)
    # Удалить лишние пробелы
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
    text = re.sub(r'(Section|Appendix|Fig\.?)\s+[IVXLCDM0-9]+', '', text)
    # Удаление библиографических ссылок
    text = re.sub(r'\[\d+\]', '', text)

    #касательно научных текстов очистка

    text = re.sub(r'\$.*?\$', '', text, flags=re.DOTALL)
    text = re.sub(r'\\begin{equation}.*?\\end{equation}', '', text, flags=re.DOTALL)

    # Удаление ссылок на структурные элементы
    text = re.sub(
        r'\b(?:Figure|Fig|Table|Equation|Eq|Section|Appendix|Chapter|Algorithm|Code)\s*[A-Za-z0-9]+\b',
        '',
        text,
        flags=re.IGNORECASE
    )

    # Удаление библиографических ссылок (все форматы)
    text = re.sub(r'\[[\d,-]+\]', '', text)  # [1], [2-5]
    text = re.sub(r'\([A-Za-z]+\s+et\s+al\.?,?\s?\d{4}\)', '', text)  # (Smith et al., 2020)

    # Удаление технических артефактов
    text = re.sub(
        r'\b(?:arxiv|doi|issn|isbn|vol|pp|pages?|http|https|www\.|preprint|submitted|version)\b[^\s]*',
        '',
        text,
        flags=re.IGNORECASE
    )
    # Удаление подписей к рисункам/таблицам
    text = re.sub(r'^\s*(Caption|Source|Note):.*$', '', text, flags=re.IGNORECASE|re.MULTILINE)

    # Обработка специальных символов
    text = re.sub(r'[^\w\s.,;:!?%()\-–/]', '', text)  # Сохраняем основные знаки препинания

    # Удаление LaTeX-команд
    text = re.sub(r'\\[a-z]+(\{[^}]+\})?', '', text)

    # Удаление маркеров перечисления
    text = re.sub(r'^\s*[\d•■♦➢]+[\s.)]*', '', text, flags=re.MULTILINE)

    # Нормализация пробелов и переносов
    text = re.sub(r'(?<=\w)-\s+(?=\w)', '', text)  # Соединение перенесенных слов
    text = re.sub(r'\s+', ' ', text)

    text = " ".join(text.split())
    return text.strip()


def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text.strip()

def process_pdfs(input_dir="papers", output_file="articles.json"):
    articles = []
    if not os.path.exists(input_dir):
        print(f"Ошибка: директория {input_dir} не найдена!")
        return
    for filename in tqdm(os.listdir(input_dir), desc="Processing PDFs"):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(input_dir, filename)
            try:
                text = extract_text_from_pdf(pdf_path)
                text = clean_text(text)
            except Exception as e:
                print(f"Ошибка обработки {filename}: {e}")
            articles.append({
                    "id": filename.replace(".pdf", ""),
                    "text": text,
                    "source": "arXiv"
                })
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)
        print(f"Файл {output_file} успешно создан!")
    except Exception as e:
        print(f"Ошибка при сохранении JSON: {e}")
    # Сохранение в JSON
    return text

last_text = process_pdfs()
print(last_text)

Processing PDFs: 100%|██████████| 10/10 [00:23<00:00,  2.34s/it]

Файл articles.json успешно создан!
ar Xiv:2205.04263v2 eess.SP 1 Jun 2022Spiking Neural Network for IMDD Optical Communication Elias Arnold1,, Georg B ocherer2,, Eric M uller1, Philipp Spilger1, Johannes Schemmel1, Stefano Calabr o2, Maxim Kuschnerov2 1Electronic Visio, Kirchho-Institute for Physics, He idelberg University, Germany 2Huawei Technologies Duesseldorf Gmb H, Munich Research Cen ter, Germany Abstract A spiking neural network (SNN) model suitable for electronic neuromorphic hardware is designed for an IMDD link. The SNN achieves the s ame bit-error-rate as an articial neural network, outperforming linear ion. 1 Introduction Low cost and low power optical transceivers are indispensable for s upporting the exponentially growing data center trac caused by cloud-based services. The h igh power consumption of digital signal processing (DSP) has motivated research on moving parts of the receiver DSP to an analog lower power frontend. For instance, photonic neuromorphic comp uting 




# PyPdf2 работает лучше для научных статей в пдф формате чем классический pdfminer

In [7]:
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Загрузка данных
with open("articles.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Создаем объекты Document с метаданными
documents = []
for item in data:
    doc = Document(
        page_content=item["text"],
        metadata={
            "id": item["id"],
            "source": item["source"]
        }
    )
    documents.append(doc)

# Настройка сплиттера
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,       # Размер чанка в символах
    chunk_overlap=150,    # Перекрытие между чанками
    separators=["\n\n", "\n", ". ", " ", ""]
)

# Разбиваем на чанки
chunks = text_splitter.split_documents(documents)

# Сохраняем результат
output_data = []
MIN_CHUNK_LENGTH = 50
for i, chunk in enumerate(chunks):
    if len(chunk.page_content) >= MIN_CHUNK_LENGTH:
      output_data.append({
          "id": chunk.metadata["id"],
          "text": chunk.page_content,
          "source": chunk.metadata["source"],
          "chunk_id": f"{chunk.metadata['id']}_{len(output_data)}",
          "chunk_position": i  # Уникальный ID чанка
      })

with open("chunks.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

In [8]:
print(f"Всего статей: {len(data)}")
print(f"Всего чанков: {len(output_data)}")
print("\nПример чанка:")
print(f"ID: {output_data[0]['chunk_id']}")
print(f"Текст: {output_data[0]['text'][:100]}...")

Всего статей: 10
Всего чанков: 1033

Пример чанка:
ID: 2303.10780v2_0
Текст: A Comprehensive Review of Spiking Neural Networks: Interpretation, Optimization, Efciency, and Best ...


In [9]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer

# Загрузка чанков из JSON
with open("chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

texts = [chunk["text"] for chunk in chunks]
metadatas = [{"id": chunk["id"], "source": chunk["source"], "chunk_id": chunk["chunk_id"], "chunk_position": chunk["chunk_position"]} for chunk in chunks]

# Загрузка модели для эмбеддингов (1B параметров)
model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v2", device="cuda")  # Для GPU: device="cuda"

# Преобразование текстов в векторыремонт ноутбуков москва
embeddings = model.encode(
    texts,
    batch_size=32,  # Оптимизация для больших данных
    show_progress_bar=True,
    convert_to_numpy=True
)

print(f"Размерность эмбеддингов: {embeddings.shape}")  # (num_chunks, 768)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Размерность эмбеддингов: (1033, 512)


In [10]:
import faiss

# Создание индекса
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)  # Индекс с L2-метрикой

# Добавление векторов в индекс
index.add(embeddings)

# Сохранение индекса
faiss.write_index(index, "bio_articles.index")

# Сохранение маппинга id -> метаданные
id_to_metadata = {i: md for i, md in enumerate(metadatas)}
with open("id_to_metadata.json", "w") as f:
    json.dump(id_to_metadata, f)

In [11]:
# Загрузка индекса и маппинга
index = faiss.read_index("bio_articles.index")
with open("id_to_metadata.json", "r") as f:
    id_to_metadata = json.load(f)

# Поиск по запросу
query = "Tell me about spiking neural networks"
query_embedding = model.encode([query])

k = 5  # Количество результатов
distances, indices = index.search(query_embedding, k)

# Вывод результатов
print("Топ-5 результатов:")
for idx in indices[0]:
    print(f"ID: {id_to_metadata[str(idx)]['id']}")
    print(f"Текст: {texts[idx][:500]}...\n")

Топ-5 результатов:
ID: 2308.08218v2
Текст: . How to describe neuronal activity: Spikes, rates, or assemblies? In J. Cowan, G. Tesauro, and J. Alspector, editors, NIPS 1993 , volume 6. MorganKaufmann, 1993. Wulfram Gerstner, Werner M. Kistler, Richard Naud, and Liam Paninski. Neuronal Dynamics: From Single Neurons to Networks and Models of Cognition . Cambridge University Press, 2014. Paul W. Goldberg and Mark R. Jerrum. Bounding the Vapnik-Chervonenkis dimension of concept classes parameterized by real numbers. Machine Learning , 18(2):1...

ID: 2308.08218v2
Текст: . TDSNN: From deep neural networks to deep spike neural networks with temporal-coding. In AAAI 2019 , volume 33, 13191326, 2019. 10.1609aaai.v33i01.33011319. Shao-Qun Zhang and Zhi-Hua Zhou. Theoretically provable spiking neural networks. In S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh, editors, Neur IPS 2022 , volume 35, 1934519356. Curran Associates, Inc., 2022. 16 . Proofs Outline We start by introduc

In [13]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json

# 1. Загрузка данных
with open("chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

texts = [chunk["text"] for chunk in chunks]
metadatas = [{"id": chunk["id"], "source": chunk["source"], "chunk_id": chunk["chunk_id"], "chunk_position": chunk["chunk_position"]} for chunk in chunks]

# 2. Инициализация эмбеддингов
embeddings = HuggingFaceEmbeddings(
    model_name= "sentence-transformers/all-mpnet-base-v2" # "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# 3. Обновление векторного хранилища
vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas
)

model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,  # Укоротите для фокусировки
    do_sample=True,
    temperature=0.7,  # Уменьшите случайность
    top_p=0.9,
    repetition_penalty=1.5,
    no_repeat_ngram_size=3,  # Блокировка повторяющихся фраз
    forced_eos_token_id=tokenizer.eos_token_id,  # Четкое завершение
    num_beams=4
)

llm = HuggingFacePipeline(pipeline=generator)

def is_context_relevant(query, context_chunks, threshold=0.3):
    query_embedding = embeddings.embed_query(query)
    chunk_embeddings = embeddings.embed_documents(context_chunks)

    max_similarity = max(
        np.dot(query_embedding, chunk_emb)
        for chunk_emb in chunk_embeddings
    )
    return max_similarity > threshold




def rag_answer(query):
    # Поиск релевантных чанков
    docs = vector_store.similarity_search(query, k=2)
    context_chunks = [d.page_content for d in docs]

    # Проверка релевантности
    if not context_chunks or not is_context_relevant(query, context_chunks):
        return "I don't know"

    # Сбор контекста
    context = "\n".join(context_chunks)


    # Генерация ответа
    prompt = f"""
    Write a comprehensive answer in academic English, synthesizing information from the context.
    Ignore section numbers (e.g. Sec.), citations (e.g. [12]), roman numerals, artifacts of scientific texts and equations. Structure your answer as:
    If there's no relevant info in context - answer 'I don't know'
    Context: {context}
    Question: {query}
    Answer (concise, academic style):
    """
    # print(context)
    response = generator(prompt,max_length=400,
    min_length=150,
    length_penalty=2.0)[0]["generated_text"]
    final_answer = response.split("Ответ:")[-1].strip()  # Вырезаем только ответ
    if not final_answer or final_answer.lower().startswith("i don't"):
        return "I don't know"
    return final_answer

# Пример использования
query = "Tell me about spiking neural networks"
answer = rag_answer(query)
print(answer)


  embeddings = HuggingFaceEmbeddings(
Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=generator)


In a spiking neuron, information is represented with a series of short electrical impulses known as spikes, as opposed to the numerical representation of articial neurons. In addition, there are various neuron models to represent a Spiking Neuron. One of the most popular and simplest neuron model is the Leaky Integrate-and-Fire (LIF) neuron. Action-potentials or spikes are short electrical pulses that are the result of electrical and biochemical properties of a biological neuron . Sec. III briey introduces DECOLLE approach.III. B ACKGROUND A. SpiKING Neural Networks


In [20]:
from datasets import Dataset

questions = ["What is the main advantage of Spiking Neural Networks (SNNs) compared to traditional Artificial Neural Networks (ANNs) in the context of computer vision??",
             "How does the DECOLLE learning rule enable supervised training in deep SNNs, and what are its key benefits??",
             "Describe the architecture of the proposed DCSNN model for single object localization. How does it process grayscale images and produce bounding box predictions?",
            ]
ground_truths = [["The main advantage of SNNs over traditional ANNs is their potential for much lower energy consumption. SNNs are inspired by biological neurons and communicate through discrete spikes, making them highly efficient for implementation on neuromorphic hardware. This efficiency makes SNNs attractive for energy-constrained computer vision applications, even though their performance has historically lagged behind ANNs."],
                ["DECOLLE (Deep Continuous Local Learning) enables supervised training in deep SNNs by using local surrogate gradients. In DECOLLE, each layer has a readout layer with fixed random weights that computes a local error at each timestep. This allows each layer to optimize its own local error function, facilitating online learning and reducing memory requirements. The locality of the learning rule also makes DECOLLE suitable for neuromorphic hardware and easy to implement with popular machine learning frameworks."],
                ["The proposed DCSNN architecture follows an encoder-decoder paradigm using convolutional Leaky Integrate-and-Fire (LIF) layers. The encoder consists of three convolutional LIF layers to extract semantic features, while the decoder reconstructs spatial details using three more convolutional LIF layers, connected via residual links. Grayscale images are encoded into spike trains using rate coding (via a Poisson process), and the network predicts bounding box coordinates through linear readout layers. The final bounding box prediction is taken from the last timestep and the deepest readout layer, corresponding to the most refined hierarchical representation."]]
answers = []
contexts = []

# Inference
for query in questions:
  answers.append(rag_answer(query))
  contexts.append([docs.page_content for docs in vector_store.similarity_search(query, k=5)])

# To dict
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

# Quality of system

In [24]:
import numpy as np
from sklearn.metrics import precision_score, recall_score
from rouge import Rouge
from bert_score import score
from sentence_transformers import SentenceTransformer, util

# Модель для оценки семантического сходства
similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Метрики для ретривера
def evaluate_retriever(contexts, ground_truths):
    """
    Оценка качества поиска контекста
    """
    precisions = []
    recalls = []

    for ctx_list, gt_list in zip(contexts, ground_truths):
        # Эмбеддинги для сравнения
        ctx_emb = similarity_model.encode(ctx_list)
        gt_emb = similarity_model.encode(gt_list)

        # Попарное сходство
        similarity_matrix = util.cos_sim(ctx_emb, gt_emb)

        # Precision@k
        prec = np.max(similarity_matrix.numpy(), axis=1).mean()
        precisions.append(prec)

        # Recall@k
        rec = np.max(similarity_matrix.numpy(), axis=0).mean()
        recalls.append(rec)

    return {
        'retriever_precision@5': np.mean(precisions),
        'retriever_recall@5': np.mean(recalls),
        'retriever_f1@5': 2 * (np.mean(precisions)*np.mean(recalls)) / (np.mean(precisions)+np.mean(recalls))
    }

# Метрики для генератора
def evaluate_generator(answers, ground_truths):
    """
    Оценка качества генерации ответов
    """
    # Текстовые метрики
    rouge = Rouge()
    rouge_scores = rouge.get_scores(answers, [gt[0] for gt in ground_truths], avg=True)

    # Семантическое сходство
    answer_emb = similarity_model.encode(answers)
    gt_emb = similarity_model.encode([gt[0] for gt in ground_truths])
    semantic_sim = np.diag(util.cos_sim(answer_emb, gt_emb)).mean()

    # BERTScore
    P, R, F1 = score(answers, [gt[0] for gt in ground_truths], lang='en')

    return {
        'rouge-1': rouge_scores['rouge-1']['f'],
        'rouge-l': rouge_scores['rouge-l']['f'],
        'semantic_similarity': semantic_sim.item(),
        'bert_score': F1.mean().item()
    }

# Проверка на галлюцинации
def check_hallucinations(answers, contexts):
    """
    Оценка соответствия ответов контексту
    """
    hall_scores = []

    for ans, ctx in zip(answers, contexts):
        ans_emb = similarity_model.encode(ans)
        ctx_emb = similarity_model.encode(' '.join(ctx))
        hall_scores.append(util.cos_sim(ans_emb, ctx_emb).item())

    return {
        'faithfulness_score': np.mean(hall_scores)
    }

def full_evaluation(data):
    retriever_metrics = evaluate_retriever(data['contexts'], data['ground_truths'])
    generator_metrics = evaluate_generator(data['answer'], data['ground_truths'])
    hallucination_metrics = check_hallucinations(data['answer'], data['contexts'])

    return {**retriever_metrics, **generator_metrics, **hallucination_metrics}

results = full_evaluation(dataset)

print("Retriever Metrics:")
print(f"Precision@5: {results['retriever_precision@5']:.2f}")
print(f"Recall@5: {results['retriever_recall@5']:.2f}")

print("\nGenerator Metrics:")
print(f"Semantic Similarity: {results['semantic_similarity']:.2f}")
print(f"BERTScore F1: {results['bert_score']:.2f}")

print("\nHallucination Metrics:")
print(f"Faithfulness: {results['faithfulness_score']:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retriever Metrics:
Precision@5: 0.63
Recall@5: 0.72

Generator Metrics:
Semantic Similarity: 0.65
BERTScore F1: 0.82

Hallucination Metrics:
Faithfulness: 0.78


## The final RAG system turned out to be quite good, but further improvements are possible. For example, using a hybrid search with BM25 give an improvement.