In [None]:
DEFAULT_COLUMN = 5
STEM_COLUMN = 7

DOCS = DEFAULT_COLUMN

In [None]:
import os
import re
import sys
import glob
import gensim
import numpy as np
import pandas as pd
from tqdm import tqdm
from uuid import uuid4
from functools import reduce
from multiprocessing import Pool
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import faiss
import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords
nltk.download('stopwords')

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from sklearn.feature_extraction.text import TfidfVectorizer

import wikiextractor

import nltk
from nltk.stem.snowball import SnowballStemmer

# Загрузка необходимых ресурсов
nltk.download("punkt")


from abc import ABC, abstractmethod
from rank_bm25 import BM25Okapi, BM25L, BM25Plus

import pymorphy2


def _remove_non_printed_chars(string):
    reg = re.compile('[^a-zA-Zа-яА-ЯёЁ]')
    return reg.sub(' ', string)

def _remove_stop_words(string,sw=[]):
    return ' '.join([word if word not in sw else '' \
                     for word in string.strip().split(' ')])

def _trim_string(string):
    # remove extra spaces, remove trailing spaces, lower the case 
    return re.sub('\s+',' ',string).strip().lower()
    
def clean_string(string,
                 stop_words_list,
                 min_len=2,
                 max_len=30):

    string = _remove_non_printed_chars(string)
    string = _remove_stop_words(string,stop_words_list)
    string = _trim_string(string)
    # also remove short words, most likely containing addresses / crap / left-overs / etc remaining after removal
    # gensim mostly does the same as above, it is used here for simplicity
    string = ' '.join(gensim.utils.simple_preprocess(string,
                                                     min_len=min_len,
                                                     max_len=max_len))
    return string
    
def remove_html_tags(text):
    """Remove html tags from a string"""
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
    
def remove_special_chars(text,char_list):
    for char in char_list:
        text=text.replace(char,'')
    return text.replace(u'\xa0', u' ')

def splitkeepsep(s, sep):
    cleaned = []
    s = re.split("(%s)" % re.escape(sep), s)
    for _ in s:
        if _!='' and _!=sep:
            cleaned.append(sep+_)
    return cleaned

def extract_url(text):
    pattern = 'http([^"]+)'
    match = re.search(pattern, text)
    if match:
        url = match.group(0)
        return url
    else:
        return ""

def create_vector(text):
    return model.encode(text, normalize_embeddings=True)


import requests
from bs4 import BeautifulSoup

# URL статьи Википедии
url = 'https://ru.wikipedia.org/wiki?curid=9'

def getHeadings(url):
    # Получаем содержимое страницы
    response = requests.get(url)
    
    # Парсим HTML-код с помощью BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Находим элемент с оглавлением (обычно он находится внутри элемента с классом mw-parser-output)
    toc = soup.find('div', id='toc')
    if toc is None:
        return None
    
    # Извлекаем все элементы списка (<li>) из оглавления
    items = toc.find_all('li')
    
    # Формируем список заголовков
    headings = []
    for item in items:
        link = item.find('a')  # находим ссылку внутри каждого пункта списка
        if link is not None:
            heading_text = link.text.strip()  # получаем текст ссылки
            cleaned_heading = heading_text.split(maxsplit=1)[-1].strip()  # убираем номер и точку
            if cleaned_heading + '.' not in headings:
                headings.append(cleaned_heading + '.')  # добавляем очищенное название в список
    
    # Выводим результат
    return headings


def lemmatize(doc):
    morph = pymorphy2.MorphAnalyzer()
    return [morph.parse(word)[0].normal_form for word in doc.split()]

def stem(doc):
   stemmer = SnowballStemmer("russian")
   return [stemmer.stem(word) for word in doc.split()]

def process_wiki_files(wiki_file):
    chars = ['\n\n']
    global sw

    with open(wiki_file, encoding='utf-8') as f:
        content = f.read()

    articles = splitkeepsep(content,'<doc id=')
    df_texts = pd.DataFrame(columns=['article_uuid','url', 'title', 'article','proc_article','proc_len'])
    emds = []

    for article in articles:
        if len(article) < 500:
            continue

        uuid_text = uuid4()
        
        articleParts = article.split('\n')
        url = extract_url(article)
        headings = getHeadings(url)
        if headings is None:
            continue
        title = articleParts[1]

        article = remove_html_tags(article)
        article = remove_special_chars(article, chars)
        clearArticleParts = article.split('\n')
        
        startIndex = 1
        currHeading = ''
        
        for endIndex in range(startIndex, len(clearArticleParts)):
            if len(clearArticleParts[endIndex]) < 100 and clearArticleParts[endIndex] in headings: 
                if endIndex - startIndex == 1:
                    startIndex = endIndex
                    currHeading = clearArticleParts[endIndex]
                    continue
            
                onePart = title + '. ' + currHeading + ' ' + ' '.join(clearArticleParts[startIndex+1:endIndex])
            
                proc_onePart = clean_string(onePart, sw_ru)
                stemmed_onePart = ' '.join(stem(proc_onePart))
                proc_len = len(proc_onePart.split(' '))
            
                temp_df_texts = pd.DataFrame(
                    {'article_uuid': [uuid_text],
                     'url': url + "#" + currHeading[:-1].replace(' ', '_') if len(currHeading) > 0 else url,
                     'title': title + '. ' + currHeading if len(currHeading) > 0 else title,
                     'article': onePart,
                     'proc_article':proc_onePart,
                     'proc_len':proc_len,
                     'stem_article':stemmed_onePart
                    })
                df_texts = pd.concat([df_texts, temp_df_texts], ignore_index=True)
            
                emb = create_vector(proc_onePart)
                emds.append(emb)
            
                startIndex = endIndex
                currHeading = clearArticleParts[endIndex]
    
    return df_texts, np.array(emds)

sw_en = set(stopwords.words('english'))
sw_ru = set(stopwords.words('russian'))
sw = list(sw_ru.union(sw_en))


model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")


import os
import faiss
from os.path import exists

def saveEmbdsToVectorDB(embds, path):
    if not exists(path):
        index = faiss.IndexFlatL2(embds.shape[1]) 
        index = faiss.IndexIDMap(index)
        index.add_with_ids(embds, np.arange(0, embds.shape[0]))
        faiss.write_index(index, path)
    else:
        index = faiss.read_index(path)
        index.add_with_ids(embds, np.arange(index.ntotal, index.ntotal + embds.shape[0]))
        faiss.write_index(index, path)


def getVectorDB(path):
    return faiss.read_index(path)

def addMetadataToDB(pathDB, cursor, conn, metadataDf):
    metadataDf.to_sql(name='documents', con=conn, if_exists='append', index=False)
    conn.commit()


def getRevertedIndexTextDB(pathDB):
    return open_dir(pathDB)


def get_rows_from_csv(filename, indices):
    df = pd.read_csv(
        filename,
        header=None,
        skiprows=lambda x: x not in indices
    )
    
    return df


def textSearch_with_bm25_ranking(query, pathDB):
    index = getRevertedIndexTextDB(pathDB)
    with index.searcher() as searcher:
        query_parser = QueryParser("content", index.schema)
        parsed_query = query_parser.parse(query)
        print("Получился запрос вида: ", parsed_query)
        results = searcher.search(parsed_query)
        return np.array([(result['id'], result.score) for result in results])


wikiFilesRootPath = "data/wiki"
vectorDBPath = 'data/data_bases/vectorDB.index'
metadataDBPath = "data/data_bases/documentsMetadataDB.db"
textsCsvPath = "data/data_bases/texts.csv"


import sqlite3

def process_file(file_path):
    print("Обрабатываю: ", file_path)
    
    # сохраняем в векторную БД
    df_texts, embds = process_wiki_files(file_path)
    saveEmbdsToVectorDB(embds, vectorDBPath)
    currentDbSize = getVectorDB(vectorDBPath).ntotal
    
    # сохраняем тексты документов в текстовую БД
    df_texts.to_csv(textsCsvPath, mode='a', header=False)

    # сохраняем метаданные документов в SQLlite БД
    conn = sqlite3.connect(metadataDBPath)
    cursor = conn.cursor()
    new_index = range(currentDbSize, currentDbSize + len(df_texts))
    df_texts.index = new_index
    addMetadataToDB(metadataDBPath, cursor, conn, df_texts[['url', 'title', 'proc_article']])
    conn.close()


# from concurrent.futures import ThreadPoolExecutor

# files_to_process = []
# for dirpath, dirnames, filenames in os.walk(wikiFilesRootPath):
#     for filename in filenames:
#         file_path = os.path.join(dirpath, filename)
#         files_to_process.append(file_path)

#  # Используем ThreadPoolExecutor для параллельной обработки файлов
# with ThreadPoolExecutor(max_workers=8) as executor:  # Количество рабочих потоков можно настроить
#     futures = {executor.submit(process_file, file_path): file_path for file_path in files_to_process}
    
#     # Ждем завершения всех задач
#     for future in concurrent.futures.as_completed(futures):
#         file_path = futures[future]
#         try:
#             data = future.result()
#         except Exception as exc:
#             print(f'Ошибка при обработке файла {file_path}: {exc}')




class DocsRanker(ABC):
    @abstractmethod
    def rankDocuments(self, query, docs):
        pass


class Bm25Ranker(DocsRanker):
    # preprocess_func: переобразует запрос и документ в список слов
    def __init__(self, bm25_alg = BM25Okapi,  preprocess_func = None) -> None:
        self.preprocess_func = preprocess_func
        self.bm25_alg = bm25_alg

    def rankDocuments(self, query, docs):
        if self.preprocess_func is None:
            self.preprocess_func = lambda doc: doc.split()
        tokenized_corpus = [doc.split() for doc in docs]
        bm25 = self.bm25_alg(tokenized_corpus)
        tokenized_query = self.preprocess_func(query)
        return bm25.get_scores(tokenized_query)
    
class BiEncoderRanker(DocsRanker):
    def __init__(self) -> None:
        self.reranker_model = SentenceTransformer('DiTy/bi-encoder-russian-msmarco', device='cuda')


    def rankDocuments(self, query, docs):
        sentences = [query] + docs
        embeddings = model.encode(sentences)
        results = util.semantic_search(embeddings[0], embeddings[1:])[0]
        return np.array([res['score'] for res in results])


class CrossEncoderRanker(DocsRanker):
    def __init__(self) -> None:
        self.reranker_model = CrossEncoder('DiTy/cross-encoder-russian-msmarco', max_length=512, device='cuda')
        # self.reranker_model = CrossEncoder('DiTy/cross-encoder-russian-msmarco', max_length=512, device='cpu')

    def rankDocuments(self, query, docs):
        return np.array([self.reranker_model.predict([[query, doc]])[0] for doc in docs])


def findVectorsIndexes(query, encoder, kDocuments):
  queryEmbd = encoder.encode(query, normalize_embeddings=True)
  D, I = index.search(np.array([queryEmbd]), kDocuments)
  return D[0], I[0]

def retrieveDocsAndUrls(indexes):
  urlsAndDocs = get_rows_from_csv(textsCsvPath, indexes)[[2, DOCS]]
  urlsAndDocs = urlsAndDocs.fillna('stub')
  return urlsAndDocs[2], urlsAndDocs[DOCS]

def rankDocuments(query, indexes, ranker):
    urls, docs = retrieveDocsAndUrls(indexes)
    doc_scores = ranker.rankDocuments(query, docs)
    sorted_idx = np.argsort(doc_scores)
    return list(docs.iloc[sorted_idx[::-1]]), list(urls.iloc[sorted_idx[::-1]]), doc_scores[sorted_idx[::-1]]

def getSortedDocumentsWithUrls(query, encoder, kDocuments, ranker):
  indexes = findVectorsIndexes(query, encoder, kDocuments)
  return rankDocuments(query, indexes, ranker)

def getUnsortedDocumentsWithUrls(query, encoder, kDocuments):
  indexes = findVectorsIndexes(query, encoder, kDocuments)
  return retrieveDocsAndUrls(indexes)

[nltk_data] Downloading package punkt_tab to /home/marat/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/marat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import math
from collections import defaultdict
from typing import List


class BM25WithProximity:
    def __init__(self, documents: List[List[str]], k1=1.5, b=0.75, proximity_weight=0.6):
        self.k1 = k1
        self.b = b
        self.proximity_weight = proximity_weight
        self.documents = documents
        self.doc_lengths = [len(doc) for doc in self.documents]
        self.avg_doc_length = sum(self.doc_lengths) / len(self.doc_lengths)
        self.total_docs = len(self.documents)

        self.doc_freqs = defaultdict(int)
        for doc in self.documents:
            for term in set(doc):
                self.doc_freqs[term] += 1

    def _bm25_term_score(self, term, doc, doc_length):
        term_freq = doc.count(term)
        if term_freq == 0:
            return 0
        idf = math.log((self.total_docs) /
                       (self.doc_freqs[term] + 1.0) + 1)
        normalization = 1 - self.b + self.b * (doc_length / self.avg_doc_length)
        score = idf * ((term_freq * (self.k1 + 1)) /
                       (term_freq + self.k1 * normalization))
        return score

    def _proximity_score(self, query_terms, doc):
        positions = {term: [] for term in query_terms}
        for index, word in enumerate(doc):
            if word in query_terms:
                positions[word].append(index)


        min_distance = float('inf')
        for i, term1 in enumerate(query_terms):
            for term2 in query_terms[i + 1:]:
                for pos1 in positions[term1]:
                    for pos2 in positions[term2]:
                        distance = abs(pos1 - pos2)
                        if distance < min_distance:
                            min_distance = distance


        if min_distance == float('inf'):
            return 0
        return 1 / (1 + min_distance)

    def score(self, query: List[str], doc_index: int):
        doc = self.documents[doc_index]
        doc_length = self.doc_lengths[doc_index]

        bm25_score = sum(self._bm25_term_score(term, doc, doc_length) for term in query)

        proximity_score = self._proximity_score(query, doc)

        total_score = bm25_score + self.proximity_weight * proximity_score
        return total_score

    def get_scores(self, query: List[str]):
        scores = [self.score(query, doc_index) for doc_index in range(self.total_docs)]
        return scores

In [None]:
from typing import List

def calculate_relevance(query_words: List[str], document_words: List[str]) -> float:
    """
    Оценивает релевантность документа запросу на основе кворума (числа совпавших слов).
    
    :param query: текст запроса
    :param document: текст документа
    :return: значение релевантности от 0 до 1
    """
    if not query_words:
        return 0.0

    query_words = set(query_words)
    document_words = set(document_words)
    
    intersection = query_words.intersection(document_words)
    relevance = len(intersection) / len(query_words)
    return relevance

def documents_filter_quorum(query: List[str], documents: List[List[str]], threshold: float = 0.5) -> List[str]:
    return [doc for doc in documents if calculate_relevance(query, doc) >= threshold]


query = "модель машинного обучения".lower().split()
documents = [
    "Машинное обучение и искусственный интеллект".lower().split(),
    "Основы программирования".lower().split(),
    "Модель обучения нейронных сетей".lower().split(),
    "Машинное обучение: теория и практика".lower().split(),
]

relevant_docs = documents_filter_quorum(query, documents, threshold=0.25)
print(f'Релевантные документы: {relevant_docs}')
# Релевантные документы: [['модель', 'обучения', 'нейронных', 'сетей']]

In [21]:
import time

In [None]:
use_gpu_index = True
  
res = faiss.StandardGpuResources()
index = getVectorDB(vectorDBPath)
if use_gpu_index:
    index = faiss.index_cpu_to_gpu(res, 0, index)

In [11]:
def test_model(filename, ranker = None, document_num = 50):
    real_urls = []
    queries = []
    with open(filename, encoding='utf-8') as f:
        prev_line = ''
        for line in f:
            if (prev_line == '##\n'):
                real_urls.append(line[:-1])
            if (prev_line == '#\n'):
                queries.append(line[:-1])
            prev_line = line
    pos_arr = -np.ones(len(queries))
    time_arr = np.zeros(len(queries))
    if ranker is None:
        N = len(queries)
        for i in range(N):
            start_time = time.time()
            dists, indexes = findVectorsIndexes(queries[i], model, document_num)
            end_time = time.time()
            print(dists)
            print(indexes)
            time_arr[i] = end_time - start_time
            alter_res = []
            for idx in indexes:
                alter_url, _ = retrieveDocsAndUrls([idx])
                alter_res.append(alter_url)
            print(alter_res)
            urls, docs = retrieveDocsAndUrls(indexes)
            start_time = time.time()
            sorted_idx = np.argsort(indexes)
            end_time = time.time()
            print(urls)
            anses = urls.iloc[sorted_idx].to_numpy()
            #print(anses)
            for j in range(anses.shape[0]):
                if anses[j] == real_urls[i]:
                    pos_arr[i] = j
                    break
    else:
        N = len(queries)
        for i in range(N):
            start_time = time.time()
            _, indexes = findVectorsIndexes(queries[i], model, document_num)
            end_time = time.time()
            time_arr[i] = end_time - start_time
            urls, docs = retrieveDocsAndUrls(indexes)
            start_time = time.time()
            doc_scores = ranker.rankDocuments(queries[i], docs)
            sorted_idx = np.argsort(doc_scores)
            end_time = time.time()
            anses = urls.iloc[sorted_idx[::-1]].to_numpy()
            time_arr[i] += end_time - start_time
            for j in range(anses.shape[0]):
                if anses[j] == real_urls[i]:
                    pos_arr[i] = j
                    break
    return pos_arr, time_arr

In [12]:
def metric_inv(n, coef = 5):
    if n == -1:
        return 0
    else:
        return coef / (n + coef)

In [13]:
def eval_model(filename, ranker = None, echo = False, document_num = 50, metric = metric_inv):
    p, t = test_model(filename, ranker = ranker, document_num = document_num)
    if echo:
        print(p)
        print(t)
    for i in range(p.shape[0]):
        p[i] = metric(p[i])
    return {'score' : p.mean(), 'avg_t' : t.mean(), 'std_t' : t.std()}

In [14]:
def multi_eval(filenames, rankers, document_nums, metrics):
    for filename in filenames:
        for ranker in rankers:
            for document_num in document_nums:
                for metric in metrics:
                    print(f'filename: {filename[0]}, ranker: {ranker[0]}, doc num: {document_num}, metric: {metric[0]}, ')
                    print(eval_model(filename[1], ranker = ranker[1], document_num = document_num, metric = metric[1]))

In [None]:
multi_eval(
    [
        ['Литва', 'data/queries_split/Литва.txt'],
        ['Лесков', 'data/queries_split/Лесков.txt'],
        ['Метро', 'data/queries_split/Метро.txt'],
        ['Перестройка', 'data/queries_split/Перестройка.txt']
    ],
    [['None', None], ['Bm25', Bm25Ranker(bm25_alg = BM25WithProximity, preprocess_func = stem)], ['CrossEncoder', CrossEncoderRanker()]],
    [5, 10, 50],
    [['inv 5', metric_inv]]
)



filename: Литва, ranker: None, doc num: 5, metric: inv 5, 
{'score': 0.4976757369614513, 'avg_t': 0.1854288305555071, 'std_t': 0.030290228096442686}
filename: Литва, ranker: None, doc num: 10, metric: inv 5, 
{'score': 0.42307731950589095, 'avg_t': 0.1822347777230399, 'std_t': 0.04680426847220043}
filename: Литва, ranker: None, doc num: 50, metric: inv 5, 
{'score': 0.26655455635299413, 'avg_t': 0.1763831002371652, 'std_t': 0.007981322130825738}
filename: Литва, ranker: Bm25, doc num: 5, metric: inv 5, 
{'score': 0.48486394557823126, 'avg_t': 0.19175197056361606, 'std_t': 0.032426700603974054}
filename: Литва, ranker: Bm25, doc num: 10, metric: inv 5, 
{'score': 0.4964317428603142, 'avg_t': 0.1794096061161586, 'std_t': 0.00843924462752806}
filename: Литва, ranker: Bm25, doc num: 50, metric: inv 5, 
{'score': 0.4481245128715881, 'avg_t': 0.1855290004185268, 'std_t': 0.02140704646733572}
filename: Литва, ranker: CrossEncoder, doc num: 5, metric: inv 5, 
{'score': 0.5988095238095238, 'avg

In [37]:
test_model('data/queries_split/short.txt', ranker = None, document_num = 5)

[0.46514064 0.48637408 0.5020088  0.5178219  0.5191462 ]
[  569 27960 25554 36330 51794]
[0    https://ru.wikipedia.org/wiki?curid=726#Геогра...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=79
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=38#Географ...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=22070#Геог...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=21837#Геог...
Name: 2, dtype: object]
0    https://ru.wikipedia.org/wiki?curid=726#Геогра...
1    https://ru.wikipedia.org/wiki?curid=38#Географ...
2               https://ru.wikipedia.org/wiki?curid=79
3    https://ru.wikipedia.org/wiki?curid=22070#Геог...
4    https://ru.wikipedia.org/wiki?curid=21837#Геог...
Name: 2, dtype: object


(array([-1.]), array([0.5779624]))

In [38]:
test_model('data/queries_split/short.txt', ranker = None, document_num = 10)

[0.46514064 0.48637408 0.5020088  0.5178219  0.5191462  0.52117324
 0.52175593 0.5274729  0.5305184  0.5454009 ]
[  569 27960 25554 36330 51794 21051 44657 23783 18404 21242]
[0    https://ru.wikipedia.org/wiki?curid=726#Геогра...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=79
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=38#Географ...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=22070#Геог...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=21837#Геог...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=548#География
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=18820#Геог...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=13269#Круп...
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=7
Name: 2, dtype: object, 0    https://ru.wikipedia.org/wiki?curid=7317#Геогр...
Name: 2, dtype: object]
0    https://ru.wikipedia.org/wiki?curid=726#Геогра...
1 

(array([6.]), array([0.20453882]))