📦 Library Descriptions:

- **langchain-gigachat** — integration of GigaChat with LangChain, used for building request chains to LLMs.

- **langgraph** — library for building agent workflows as graphs on top of LangChain components.

- **langchain-community** — community-driven extensions and connectors for LangChain.

- **langchain** — the core framework for working with LLMs, agents, memory, and pipelines.

- **faiss-cpu** — CPU build of FAISS (Facebook AI Similarity Search), a library for fast vector similarity search over large collections of embeddings.

- **sentence-transformers** — library for generating text embeddings using pretrained models from Hugging Face.

- **playwright** — headless browser automation tool, often used for web scraping and parsing.


In [1]:
!pip install langchain
!pip install langchain-gigachat
!pip install langgraph
!pip install langchain-community
!pip install faiss-cpu
!pip install sentence-transformers
!pip install playwright

In [2]:
# Fetch the browsers Playwright needs; parser.py relies on them through AsyncChromiumLoader
!playwright install

In [2]:
%%writefile parser.py

from langchain_community.document_loaders import AsyncChromiumLoader
from bs4 import BeautifulSoup

# URL of the page with search results for the "Physics" tag
url = "https://nplus1.ru/search?tags=869"

# Initialize the asynchronous loader and fetch the page
loader = AsyncChromiumLoader([url])
html = loader.load()

# Save the fetched page to a file (useful for debugging parsing)
with open('page.html', 'w') as f:
    f.write(html[0].page_content)

# Parse HTML with BeautifulSoup
soup = BeautifulSoup(html[0].page_content, 'html.parser')

# CSS class of article links on N+1 website (manually identified after inspecting page structure)
articles = 'n1_climb_4 transition-colors duration-75 hover:text-main inline-block mb-10 sm:mb-5 font-spectral leading-24'

# Collect and save links
links = []
with open('links.txt', 'w') as f:
    for link in soup.find_all('a', class_=articles):
        print(link)  # Debug output
        links.append(link['href'])
        f.write(link['href'] + '\n')

In [3]:
# Run the scraper script; it writes page.html (raw page) and links.txt (one article link per line)
!python parser.py

In [4]:
# Read the scraped links back in, dropping blank lines and surrounding whitespace
sources = []
with open('links.txt') as f:
    for raw_line in f:
        cleaned = raw_line.strip()
        if cleaned:
            sources.append(cleaned)

print(sources)

In [5]:
# Import Hugging Face embeddings (we’ll use pretrained models)
!pip install langchain-huggingface
from langchain_huggingface import HuggingFaceEmbeddings

# Import different search strategies (BM25 — classic keyword-based, FAISS — vector-based)
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# FAISS — a vector database for storing embeddings of text chunks
from langchain.vectorstores.faiss import FAISS

# Document class is used to wrap texts into a format that LangChain understands
from langchain_core.documents import Document

# WebBaseLoader — a simple webpage loader (we’ll use it to fetch article content from URLs)
from langchain_community.document_loaders import WebBaseLoader

# A text splitter that breaks documents into smaller chunks, respecting structure
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# Load articles by links using WebBaseLoader.
# `sources` is already a list of URL strings read from links.txt, so it is
# passed as-is (a list has no .split() method).
loader = WebBaseLoader(sources)
docs = loader.load()  # Load the contents of all URLs as a list of documents

# Check how many articles were successfully loaded
len(docs)

In [7]:
# Chunking settings: maximum chunk size and the overlap shared by neighbouring chunks
splitter_config = dict(chunk_size=500, chunk_overlap=100)
text_splitter = RecursiveCharacterTextSplitter(**splitter_config)

# Break every loaded article into fragments
split_docs = text_splitter.split_documents(docs)

# Total number of fragments produced
print(len(split_docs))

# Peek at the second fragment as a sanity check
split_docs[1]


In [10]:
%%time
# Measure execution time of the vectorization step

# Choose the model for generating embeddings
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Model loading parameters
model_kwargs = {'device': 'cpu'}  # You can change to 'cuda' if you have a GPU
encode_kwargs = {'normalize_embeddings': False}  # Do not normalize embeddings

# Create the embedding object
embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Create a FAISS vector store and load the text chunks into it
vector_store = FAISS.from_documents(split_docs, embedding=embedding)

In [11]:
# Expose the vector store as a retriever that returns the 5 most relevant chunks per query
embedding_retriever = vector_store.as_retriever(search_kwargs=dict(k=5))

In [11]:
# Import function to create a chain that injects documents into the prompt
from langchain.chains.combine_documents import create_stuff_documents_chain

# Import prompt template for the language model
from langchain_core.prompts import ChatPromptTemplate

# Import GigaChat — Sber’s language model.
# Taken from the dedicated langchain-gigachat package installed at the top of
# the notebook rather than from the legacy `langchain.chat_models` path.
from langchain_gigachat.chat_models import GigaChat

# Import function to build a Retrieval-Augmented Generation (RAG) chain
from langchain.chains import create_retrieval_chain

In [12]:
# Create a GigaChat object — specify model parameters and authentication
llm = GigaChat(
    credentials='LLM_API_KEY',  # placeholder — substitute your own authorization key
#   model="GigaChat-Max",
    scope="GIGACHAT_API_PERS",
    verify_ssl_certs=False,  # NOTE(review): turns off TLS certificate verification — confirm this is intended
    profanity_check=False
)

# Test the model. The result is printed explicitly: a notebook only echoes the
# LAST expression of a cell, and this call is not the last statement here.
print(llm.invoke("Hello! What’s your name?").content)

# Create a prompt template.
# The assistant role matches the indexed corpus (physics news articles).
prompt = ChatPromptTemplate.from_template('''
Answer the user’s question as a friendly assistant for physics news. 
Use only the information provided in the context. 
If the context lacks an answer, politely inform the user.

Context: {context}
Question: {input}
Answer:
''')

In [14]:
# Chain that inserts the retrieved documents into the prompt's {context} slot
document_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt
)

# Full RAG chain: retrieve chunks with the embedding retriever, then answer with
# document_chain. It is built here because the following cell invokes
# `retrieval_chain`, which would otherwise be undefined at that point.
retrieval_chain = create_retrieval_chain(embedding_retriever, document_chain)

In [15]:
# Sample question for the RAG system
q1 = "Is there any information about photons?"

# Run it through the chain; the payload key must be 'input'
payload = dict(input=q1)
resp1 = retrieval_chain.invoke(payload)

# Show the full response
resp1

-------

In [13]:
# Install the Telegram bot library (imported below as `telebot`); -q keeps pip output quiet
!pip install -q pyTelegramBotAPI

In [14]:
from langchain_core.prompts import ChatPromptTemplate

# Template text for the bot: answers must come from the supplied context only.
# {context} and {input} are the placeholders filled in when the prompt is used.
physics_template = '''
You are a physics expert. Use only the context provided below.
For greetings, reply: "Ready to answer questions about physics news!"
If the context does not contain an answer, say that you cannot answer.

Context:
{context}

Question:
{input}

Answer:
'''

prompt = ChatPromptTemplate.from_template(physics_template)

In [19]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Step 1: chain that places the retrieved documents into the prompt
document_chain = create_stuff_documents_chain(llm, prompt)

# Step 2: full RAG chain — retrieve with the embedding retriever, then answer
retrieval_chain = create_retrieval_chain(
    retriever=embedding_retriever,
    combine_docs_chain=document_chain,
)

In [None]:
import telebot
from time import sleep

# Bot client; 'Bot_API_key' is a placeholder to be replaced with a real bot token
bot = telebot.TeleBot(token='Bot_API_key')

In [15]:
# Content types the bot refuses to process
_NON_TEXT_TYPES = ['audio', 'video', 'document', 'photo',
                   'sticker', 'voice', 'location', 'contact']


@bot.message_handler(content_types=_NON_TEXT_TYPES)
def not_text(message):
    """Tell the sender that only text messages are supported."""
    chat_id = message.chat.id
    bot.send_message(chat_id, "I only work with text messages.")

# Handle text messages
@bot.message_handler(content_types=['text'])
def handle_text_message(message):
    """Answer an incoming text message using the RAG chain.

    The message text is sent to ``retrieval_chain`` and the chain's
    ``'answer'`` field is sent back to the chat the message came from.
    If the chain raises, or returns an empty answer, a short apology is
    sent instead so the user is never left without a reply.

    Args:
        message: Incoming message object; ``message.chat.id`` and
            ``message.text`` are read.
    """
    import logging

    # Telegram's sendMessage accepts at most 4096 characters per message
    max_len = 4096

    chat_id = message.chat.id
    query = message.text

    try:
        # Send the query to the retrieval_chain (RAG)
        response = retrieval_chain.invoke({'input': query})
        answer = response['answer']
    except Exception:
        # Handler boundary: record the traceback and still reply to the user
        logging.exception("RAG chain failed for chat %s", chat_id)
        bot.send_message(chat_id, "Sorry, something went wrong while preparing the answer.")
        return

    # An empty text would be rejected by Telegram, so report it explicitly
    if not answer or not answer.strip():
        bot.send_message(chat_id, "Sorry, I could not produce an answer.")
        return

    # Send the answer back to the user, split into pieces that fit the limit
    for start in range(0, len(answer), max_len):
        bot.send_message(chat_id, answer[start:start + max_len])

    # Kept from the original handler; presumably a crude throttle — TODO confirm it is still needed
    sleep(1)

In [22]:
# Start long polling and keep going after errors.
# `non_stop` is the current parameter name; `none_stop` is its deprecated alias.
bot.polling(non_stop=True)