In [1]:
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import os
from tqdm import tqdm_notebook as tqdm
import time
from lxml import etree
from sklearn.metrics import r2_score
from datetime import timedelta
import numpy as np

In [2]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

def pretty_print_result(search_result, fields=None):
    if fields is None:
        fields = []
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')


def get_score(search_result):
    res = []
    for hit in search_result['hits']['hits']:
        res.append((hit["_id"], hit["_score"]))
    res.sort(key = lambda x: x[1], reverse = True)
    return res


class Index:
    def __init__(self, index, settings):
        self.index_name = index
        self.settings = settings
        self.es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360}])
        if self.es.indices.exists(index=index):
            self.es.indices.delete(index=index)
        self.es.indices.create(index=index, body=settings)

    def es_actions_generator(self, path_to_docs):
        for doc_name in tqdm(os.listdir(path_to_docs)):
            with open(f"{path_to_docs}/{doc_name}", "r", encoding="utf-8") as inf:
                doc_id = int(''.join(list(filter(str.isdigit, doc_name))))
                doc = json.load(inf)           
            yield create_es_action(self.index_name, doc_id, doc)


    def add_documents(self, path_to_docs):
        try:
            for ok, result in parallel_bulk(self.es, self.es_actions_generator(path_to_docs), queue_size=4, thread_count=4,
                                        chunk_size=1000):
                  if not ok:
                     print(result)
        except Exception as e: 
            print(e)

    def get_doc_by_id(self, doc_id):
        return self.es.get(index=self.index_name, id=doc_id)['_source']

    def search(self, query, *args):
        return self.es.search(index=self.index_name, body=query, size=20)
        # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20



In [14]:
settings_1 = {
    "mappings": {
        "properties": {
            "text": {
                "type": "text"
            }
        }
    }
}

In [15]:
settings_title = {
    "mappings": {
        "properties": {
            "title": {
                "type": "text"
            },
            
            "content": {
                "type": "text"
            }
        }
    }
}

In [16]:
index = Index("docs", settings_1)

RequestError: RequestError(400, 'mapper_parsing_exception', 'Root mapping definition has unsupported parameters:  [text : {type=text}]')

In [17]:
index_title = Index("docs", settings_title)

RequestError: RequestError(400, 'mapper_parsing_exception', 'Root mapping definition has unsupported parameters:  [title : {type=text}] [content : {type=text}]')

In [10]:
start = time.time()
index.add_documents("data/json_text")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


RequestError(400, 'action_request_validation_exception', 'Validation Failed: 1: type is missing;2: type is missing;3: type is missing;4: type is missing;5: type is missing;6: type is missing;7: type is missing;8: type is missing;9: type is missing;10: type is missing;11: type is missing;12: type is missing;13: type is missing;14: type is missing;15: type is missing;16: type is missing;17: type is missing;18: type is missing;19: type is missing;20: type is missing;21: type is missing;22: type is missing;23: type is missing;24: type is missing;25: type is missing;26: type is missing;27: type is missing;28: type is missing;29: type is missing;30: type is missing;31: type is missing;32: type is missing;33: type is missing;34: type is missing;35: type is missing;36: type is missing;37: type is missing;38: type is missing;39: type is missing;40: type is missing;41: type is missing;42: type is missing;43: type is missing;44: type is missing;45: type is missing;46: type is missing;47: type is

In [61]:
start = time.time()
index_title.add_documents("res/text_titles_json")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


0:02:16.731730


In [11]:
def print_index_size(index): 
    print(f"{(index.es.indices.stats(index.index_name)['_all']['primaries']['store']['size_in_bytes'] / 2 ** 30):.2f} GB")

In [12]:
print_index_size(index)

0.00 GB


In [13]:
print_index_size(index_title)

0.00 GB


In [76]:
class Query:
    def __init__(self, task_id, query, relevant_docs):
        self.task_id = task_id
        self.query = query
        self.relevant_docs = relevant_docs

    def json_query(self):
        return {
            'query': {
                'bool': {
                    'should': [
                        {
                            'match': {
                                'text': self.query
                            }
                        }
                    ]
                }
            }
        }
    
    def title_query(self):
        return {
            'query': {
                'bool': {
                    'should': [
                        {
                        'match': {
                            'title': {
                                'query': self.query,
                                'boost': 0.15
                            }
                        }
                    },
                    {
                        'match': {
                            'content': self.query
                        }
                    }
                    ]
                }
            }
        }
    
    def fulltext_query():
         return   {
            'query': {
                'bool': {
                    'should': [
                        {
                            'match': {
                                'text': self.query
                            }
                        },
                        
                        {
                            'rank_feature': {
                                'field': 'pagerank',
                                'saturation': {
                                    'pivot': 10
                                }
                            }
                        }
                    ]
                }
            }
        }
            
    

In [72]:
class SearchQualityChecker:
    def __init__(self, queries, index):
        self.queries = queries
        self.index = index
        self.results = {}
        
    def get_results(self):
        r_precision_total = 0
        map_score_total = 0
        r_total = 0
        p_total = 0
        for q in tqdm(self.queries):
            res = self.index.search(q.title_query())
            scores = get_score(res)
            p_total += self.p(20, q, scores)
            r_total += self.r(20, q, scores)
            r_precision_total += self.r_precision(q, scores)
            map_score_total += self.map_score(q, scores, 20)
        Q = len(self.queries)
        return r_precision_total / Q, map_score_total / Q, p_total / Q, r_total / Q
    
    def r_precision(self, query, search_res_score):
        return self.r(len(query.relevant_docs), query, search_res_score)
    
    def map_score(self, query, search_res_score, n):
        m = 0
        for k in range(1, n):       
            m += self.p(k, query, search_res_score)
        R = len(query.relevant_docs)
        return m / n
    
    def p(self, k, query, search_res_score):
        r = 0
        for doc, _ in search_res_score[:k]:
            if doc in query.relevant_docs:
                r += 1
        return r / k
    
    def r(self, k, query, search_res_score):
        R = len(query.relevant_docs)
        r = 0
        for doc, _ in search_res_score[:k]:
            if doc in query.relevant_docs:
                r += 1
        return r / R if R != 0 else 0 if len(search_res_score) > 0 else 1


In [69]:
def get_relevance():
    res = {}
    xml_tree = etree.parse("data/relevant_table_2009.xml")
    root = xml_tree.getroot()
    for task in root.getchildren():
        relevant_docs = set()
        for document in task.getchildren():
            if document.get("relevance") == "vital":
                relevant_docs.add(document.get("id"))
        res[task.get("id")] = relevant_docs
    print(len(res))
    return res


def generate_queries_plain_texts():
    relevances = get_relevance()
    xml_tree = etree.parse("data/web2008_adhoc.xml")
    root = xml_tree.getroot()
    res = []
    for task in root.getchildren():
        if task.get("id") is not None:
            for query_text in task.getchildren():
                try:
                    res.append(Query(task.get("id"), query_text.text, relevances[task.get("id")]))
                except:
                    pass
    print(len(res))
    return res

In [37]:
queries = generate_queries_plain_texts()
quality_checker = SearchQualityChecker(queries, index)
plain_text_res = quality_checker.get_results()

547
547


HBox(children=(IntProgress(value=0, max=547), HTML(value='')))




In [77]:
queries2 = generate_queries_plain_texts()
quality_checker2 = SearchQualityChecker(queries2, index_title)
plain_text_res2 = quality_checker2.get_results()

547
547


HBox(children=(IntProgress(value=0, max=547), HTML(value='')))




In [51]:
rprec, map_, p, r, = plain_text_res
print("Rprec = " + str(rprec))
print("Map = " + str(map_))
print("P = " + str(p))
print("R_total = " + str(r))

Rprec = 0.18028930351002476
Map = 0.3446174663606433
P = 0.3023765996343693
R_total = 0.21012091279641404


In [78]:
rprec_t, map_t, p_t, r_t, = plain_text_res2
print("Rprec = " + str(rprec_t))
print("Map = " + str(map_t))
print("P = " + str(p_t))
print("R_total = " + str(r_t))

Rprec = 0.18945526497011625
Map = 0.3593103822503914
P = 0.3171846435100548
R_total = 0.21807568364669536


In [110]:
from pymystem3 import Mystem
from nltk.corpus import stopwords
import nltk
import re
#nltk.download('punkt')
#nltk.download('stopwords')
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import word_tokenize

russian_stopwords = stopwords.words("russian")
english_stopwords = stopwords.words("english")
black_list = ["°", "№", "©", "...", "//", "://", "</", "\">", "=\"", "=\'", "\r", "\n", "\t"]
stem = Mystem()

def lemmatize(text):
    tokens = stem.lemmatize(text.lower())
    tokens = [token for token in tokens if token != " " and token.strip() not in punctuation \
              and token not in russian_stopwords and token not in english_stopwords]
    return ' '.join(tokens)

def stemmize(text):
    english_check = re.compile(r'[a-z]')
    stemmerr = RussianStemmer()
    stemmere = PorterStemmer()
    
    lowercase_text = text.lower()

    tokens = []
    for token in word_tokenize(lowercase_text):
        if english_check.match(token):
            tokens.append(stemmere.stem(token))
        else:
            tokens.append(stemmerr.stem(token))

    tokens = [token for token in tokens if token != " " and token.strip() not in punctuation \
              and token not in russian_stopwords and token not in english_stopwords]
    return ' '.join(tokens)

def generate_queries_lemmas():
    relevances = get_relevance()
    xml_tree = etree.parse("data/web2008_adhoc.xml")
    root = xml_tree.getroot()
    res = []
    for task in tqdm(root.getchildren()):
        if task.get("id") is not None:
            for query_text in task.getchildren():
                try:
                    res.append(Query(task.get("id"), lemmatize(query_text.text), relevances[task.get("id")]))
                except:
                    pass
    print(len(res))
    return res

def generate_queries_stem():
    relevances = get_relevance()
    xml_tree = etree.parse("data/web2008_adhoc.xml")
    root = xml_tree.getroot()
    res = []
    for task in tqdm(root.getchildren()):
        if task.get("id") is not None:
            for query_text in task.getchildren():
                try:
                    res.append(Query(task.get("id"), stemmize(query_text.text), relevances[task.get("id")]))
                except:
                    pass
    print(len(res))
    return res


In [96]:
queries_lemmas = generate_queries_lemmas()

547


HBox(children=(IntProgress(value=0, max=29232), HTML(value='')))


547


In [97]:
queries_lemmas[0].query

'корозия металл'

In [53]:
lemma_index = Index("lemma_docs", settings_1)

start = time.time()
lemma_index.add_documents("data/json_text_lemmatized_full")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


0:03:50.537718


In [54]:
print_index_size(lemma_index)

1.53 GB


In [57]:
lemma_quality_checker = SearchQualityChecker(queries_lemmas, lemma_index)
lemma_res = lemma_quality_checker.get_results()

HBox(children=(IntProgress(value=0, max=547), HTML(value='')))




In [58]:
rprec, map_, p, r, = lemma_res
print("Rprec = " + str(rprec))
print("Map = " + str(map_))
print("P = " + str(p))
print("R_total = " + str(r))

Rprec = 0.21331724414666695
Map = 0.38518268726110433
P = 0.3556672760511884
R_total = 0.2519418342391568


In [48]:
id_to_pagerank = {}
with open('data/pagerank.txt','r') as f:
    for line in f:
        docId, docURL, rank = line.split()
        id_to_pagerank[int(docId)] = float(rank)

In [49]:
len(id_to_pagerank)


199202

In [50]:
 for doc_name in tqdm(os.listdir("data/json_filtered_tokens_texts")):
        with open(f"data/json_filtered_tokens_texts/{doc_name}", "r+", encoding="utf-8") as inf:
            doc_id = int(''.join(list(filter(str.isdigit, doc_name))))
            doc = json.load(inf)
            try:
                doc["pagerank"] = id_to_pagerank.get(doc_id)
            except:
                pass
            inf.seek(0)        # <--- should reset file position to the beginning.
            json.dump(doc, inf, indent=4, ensure_ascii=False)
            inf.truncate()

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




In [5]:
settings_with_pagerank = {
    "mappings": {
        "properties": {
            "text": {
                "type": "text"
            },
            "pagerank": {
                "type": "rank_feature"
            }
        }
    }
}

In [6]:
pr_index = Index("pagerank_index", settings_with_pagerank)

In [7]:
pr_index.add_documents("data/json_filtered_tokens_texts")

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


('3 document(s) failed to index.', [{'index': {'_index': 'pagerank_index', '_type': '_doc', '_id': '1204092', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': "failed to parse field [pagerank] of type [rank_feature] in document with id '1204092'. Preview of field's value: '0'", 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'featureValue must be a positive normal float, got: 0.0for feature pagerank on field _feature which is less than the minimum positive normal float: 1.17549435E-38'}}, 'data': {'text': 'english version услуга продукт решение партнер новость компания группа iba контакт поиск главный новость добро пожаловать www kancler смотреть также подписываться наш новость добро пожаловать www kancler пакет прикладной программа ппп канцлер это новое поколение программный продукт наш компания платформа lotus domino notes предназначать создание система электронный документооборот сэд орган государственный управление крупный территориально распред

In [100]:
index_lemmatized_title = Index("docs", settings_title)

In [101]:
start = time.time()
index_lemmatized_title.add_documents("res/lemmatized_titles")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


0:03:07.211645


In [102]:
print_index_size(index_lemmatized_title)


1.19 GB


In [103]:
#queries_lemmas = generate_queries_lemmas()
quality_checker = SearchQualityChecker(queries_lemmas, index_lemmatized_title)
plain_text_res = quality_checker.get_results()



HBox(children=(IntProgress(value=0, max=547), HTML(value='')))




In [104]:
rprec, map_, p, r, = plain_text_res
print("Rprec = " + str(rprec))
print("Map = " + str(map_))
print("P = " + str(p))
print("R_total = " + str(r))

Rprec = 0.2200322847545744
Map = 0.40476707256709393
P = 0.37029250457038393
R_total = 0.25696784007892093


In [106]:
index_stemmed_title = Index("docs", settings_title)


In [107]:
start = time.time()
index_stemmed_title.add_documents("res/stemmed_titles")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


0:01:59.685953


In [108]:
print_index_size(index_stemmed_title)

1.12 GB


In [111]:
queries_stem = generate_queries_stem()
quality_checker = SearchQualityChecker(queries_stem, index_stemmed_title)
stem_text_res = quality_checker.get_results()

547


HBox(children=(IntProgress(value=0, max=29232), HTML(value='')))


547


HBox(children=(IntProgress(value=0, max=547), HTML(value='')))




In [112]:
rprec, map_, p, r, = stem_text_res
print("Rprec = " + str(rprec))
print("Map = " + str(map_))
print("P = " + str(p))
print("R_total = " + str(r))

Rprec = 0.21094760698252735
Map = 0.39077617948888793
P = 0.35648994515539306
R_total = 0.2407488051953686
