In [None]:
!pip install nltk
!pip install -U sentence-transformers
!pip install fasttext
!pip install langchain
!pip install spacy
!python -m spacy download ru_core_news_sm

In [None]:
import pandas as pd
import numpy as np
import torch
import nltk
import spacy
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data packages (including the stop-word lists)
for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

import fasttext
import fasttext.util

from transformers import AutoTokenizer, AutoModel


from google.colab import userdata

# Store the secret in a variable instead of leaving the call as the last bare
# expression of the cell: a trailing expression is echoed into the cell output,
# which would put the secret value into the saved notebook.
secret = userdata.get('secret')

In [None]:
# Load the dataset from CSV (later cells read its 'filename' and 'paragraphs'
# columns) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='all_df.csv')
df.head(n=5)

TF-IDF с делением на чанки, очисткой и лемматизацией

In [None]:
def clean_text(text):
    """Normalize a piece of text before it is vectorized.

    Every run of non-word characters is replaced with a single space, the
    result is lower-cased and passed to the module-level ``nlp`` callable
    (which must be defined before this function is called). The lemmas of all
    tokens that are not flagged as stop words are joined with single spaces.

    Args:
        text: The string to normalize.

    Returns:
        str: Space-separated lemmas of the non-stop-word tokens.
    """
    normalized = re.sub(r'\W+', ' ', text).lower()
    lemmas = [tok.lemma_ for tok in nlp(normalized) if not tok.is_stop]
    return ' '.join(lemmas)

In [None]:
# Load the Russian spaCy pipeline; clean_text() reads it through the global `nlp`.
nlp = spacy.load("ru_core_news_sm")

# Step 1: Prepare the dataset as a list of {'filename', 'paragraphs'} records.
# Rows with a missing 'paragraphs' value are dropped first: read_csv represents
# empty cells as float NaN, which the text splitter cannot process as a string.
documents = (
    df[['filename', 'paragraphs']]
    .dropna(subset=['paragraphs'])
    .to_dict(orient='records')
)

# Split every document into chunks of up to 1000 characters with a
# 100-character overlap, then clean and lemmatize each chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = []  # list of (title, cleaned chunk text) tuples
for entry in documents:
    title = entry['filename']
    for chunk in text_splitter.split_text(entry['paragraphs']):
        chunks.append((title, clean_text(chunk)))

# Fit a TF-IDF model on the cleaned chunk texts (second element of each tuple)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([chunk_text for _, chunk_text in chunks])

In [None]:
# Query to look up among the indexed chunks
query = "Выставление документа Счет на оплату клиенту, при изменении данных о плательщики, городе оплаты и сумме."

# Apply the same cleaning as for the chunks, then project the query into the
# fitted TF-IDF vocabulary
cleaned_query = clean_text(query)
query_vec = vectorizer.transform([cleaned_query])

# Cosine similarity of every chunk row against the query, flattened to 1-D
results = cosine_similarity(X, query_vec).flatten()

# Chunk indices ordered from most to least similar, limited to the best 10
top_indices = results.argsort()[::-1][:10]

# Report the score and the source title of each hit, followed by a blank line
for hit in top_indices:
    print(
        f"Similarity: {results[hit]:.4f}",
        f"Title: {chunks[hit][0]}",
        "",
        sep="\n",
    )