In [105]:
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import os
from tqdm import tqdm_notebook as tqdm
import time
from lxml import etree
from sklearn.metrics import r2_score
from datetime import timedelta
import numpy as np

In [106]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

def pretty_print_result(search_result, fields=None):
    if fields is None:
        fields = []
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')


def get_score(search_result):
    res = []
    for hit in search_result['hits']['hits']:
        res.append((hit["_id"], hit["_score"]))
    res.sort(key = lambda x: x[1], reverse = True)
    return res


class Index:
    def __init__(self, index, settings):
        self.index_name = index
        self.settings = settings
        self.es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360}])
        if self.es.indices.exists(index=index):
            self.es.indices.delete(index=index)
        self.es.indices.create(index=index, body=settings)

    def es_actions_generator(self, path_to_docs):
        for doc_name in tqdm(os.listdir(path_to_docs)):
            with open(f"{path_to_docs}/{doc_name}", "r", encoding="utf-8") as inf:
                doc_id = int(''.join(list(filter(str.isdigit, doc_name))))
                doc = json.load(inf)           
            yield create_es_action(self.index_name, doc_id, doc)


    def add_documents(self, path_to_docs):
        try:
            for ok, result in parallel_bulk(self.es, self.es_actions_generator(path_to_docs), queue_size=4, thread_count=4,
                                        chunk_size=1000):
                  if not ok:
                     print(result)
        except Exception as e: 
            print(e)

    def get_doc_by_id(self, doc_id):
        return self.es.get(index=self.index_name, id=doc_id)['_source']

    def search(self, query, *args):
        return self.es.search(index=self.index_name, body=query, size=20)
        # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20



In [107]:
settings_1 = {
    "mappings": {
        "properties": {
            "text": {
                "type": "text"
            }
        }
    }
}

In [135]:
settings = {
        'mappings': {
            'properties': {
                'text': {
                    'type': 'text',
                    'analyzer': 'russian_complex',
                    'search_analyzer': 'russian_complex'
                }
            }
            
        },
        "settings": {
        "analysis" : {
            "analyzer" : {
                "my_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : ["lowercase", "russian_snow", "english_snow"]
                },
                'russian_complex': {
                    'char_filter': [
                        'yont'
                    ],
                    'tokenizer': 'word_longer_2',
                    'filter': [
                        'lowercase',
                        'russian_snow'
                    ]
                }
            },
            'char_filter': {
                'yont': {
                    'type': 'mapping',
                    'mappings': [
                        'ё => е'
                    ]
                }
            },
            'tokenizer': {
                'word_longer_2': {
                    'type': 'pattern',
                    'pattern': '[a-zA-Z_0-9\u0400-\u04FF]{2,}',
                    'group': 0
                },
                'white_20': {
                    'type': 'whitespace',
                    'max_token_length': 5
                }
            },
            "filter" : {
                "russian_snow" : {
                    "type" : "snowball",
                    "language" : "Russian"
                },
                "english_snow" : {
                    "type" : "snowball",
                    "language" : "English"
                }
            }
        }
    }
}

In [136]:
index = Index("docs", settings_1)

In [110]:
start = time.time()
index.add_documents("res/json")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200001), HTML(value='')))


0:03:17.092622


In [137]:
stem_index = Index('stem_docs', settings)




In [138]:
start = time.time()
stem_index.add_documents("res/json")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200001), HTML(value='')))


0:04:46.952495


In [139]:
stem_index.get_doc_by_id('1000039')

{'text': 'сервисы | о компании | партнерам Описание Рубрикатор Мобильный переводчик SMS-ИНФО - услуга, позволяющая с помощью SMS-запроса оперативно получать информацию различного \nхарактера в текстовом виде. Информация SMS-ИНФО мобильна, доступна 24 часа в сутки на всей территории \nзоны действия сети VELCOM. Ответ на запрос приходит оперативно - в течение нескольких секунд. Для получения информации необходимо отправить SMS-запрос (латиницей или кириллицей) на номер 511. Кроме того, можно сформировать \nтакой SMS-запрос с помощью SIM-карты нового образца с VELCOM-меню. Таблица транслитерации (соответствия \nрусских букв латинским) представлена в разделе Инструкция по SMS-ИНФО. Полный список рубрик SMS-ИНФО можно получить, отправив SMS-запрос HELP на номер 511. Стоимость одного запроса на номер 511 составляет 110 рублей без учета НДС и налога с продаж; для абонентов. Подробнее об услуге SMS-ИНФО читайте в разделе Инструкция по SMS-ИНФО. Внимание! Информацию можно получать и в виде MMS-

In [111]:
class Query:
    def __init__(self, task_id, query, relevant_docs):
        self.task_id = task_id
        self.query = query
        self.relevant_docs = relevant_docs

def json_query(query):
    return {
    'query': {
        'bool': {
            'should': [
                {
                    'match': {
                        'text': query.query
                    }
                }
            ]
        }
    }
}
        


In [112]:
def pagerank_query(query):
    return  {
    'query': {
        'bool': {
            'should': [
                {
                    'match': {
                        'text': query.query
                    }
                },
                {
                    'rank_feature': {
                        'field': 'pagerank',
                        'saturation': {
                            'pivot': 10
                        },
                        'boost': '5.0'
                    }
                }
            ]
        }
    }
}

In [119]:
class SearchQualityChecker:
    def __init__(self, queries, index):
        self.queries = queries
        self.index = index
        self.results = {}
        
    def get_results(self, get_query=json_query):
        r_precision_total = 0
        map_score_total = 0
        r_total = 0
        p_total = 0
        for q in tqdm(self.queries):
            res = self.index.search(get_query(q))
            print(q.task_id)
            pretty_print_result(res)
            scores = get_score(res)
            p_total += self.p(20, q, scores)
            r_total += self.r(20, q, scores)
            r_precision_total += self.r_precision(q, scores)
            map_score_total += self.map_score(q, scores, 20)
        Q = len(self.queries)
        return r_precision_total / Q, map_score_total / Q, p_total / Q, r_total / Q
    
    def r_precision(self, query, search_res_score):
        return self.r(len(query.relevant_docs), query, search_res_score)
    
    def map_score(self, query, search_res_score, n):
        m = 0
        for k in range(1, n):       
            m += self.p(k, query, search_res_score)
        R = len(query.relevant_docs)
        return m / n
    
    def p(self, k, query, search_res_score):
        r = 0
        for doc, _ in search_res_score[:k]:
            if doc in query.relevant_docs:
                r += 1
        return r / k
    
    def r(self, k, query, search_res_score):
        R = len(query.relevant_docs)
        r = 0
        for doc, _ in search_res_score[:k]:
            if doc in query.relevant_docs:
                r += 1
        return r / R if R != 0 else 0 if len(search_res_score) > 0 else 1


In [120]:
def get_relevance():
    res = {}
    xml_tree = etree.parse("data/or_relevant-minus_table.xml")
    root = xml_tree.getroot()
    for task in root.getchildren():
        relevant_docs = set()
        for document in task.getchildren():
            if document.get("relevance") == "vital":
                relevant_docs.add(document.get("id"))
        res[task.get("id")] = relevant_docs
    print(len(res))
    return res


def generate_queries_plain_texts():
    relevances = get_relevance()
    xml_tree = etree.parse("data/web2008_adhoc.xml")
    root = xml_tree.getroot()
    res = []
    for task in root.getchildren():
        if task.get("id") is not None:
            for query_text in task.getchildren():
                try:
                    res.append(Query(task.get("id"), query_text.text, relevances[task.get("id")]))
                except:
                    pass
    print(len(res))
    return res

In [121]:
queries = generate_queries_plain_texts()
quality_checker = SearchQualityChecker(queries, index)
plain_text_res = quality_checker.get_results()

547
547


HBox(children=(IntProgress(value=0, max=547), HTML(value='')))

arw49633
Total documents: 1911
Doc 990390, score is 13.157524
Doc 993272, score is 13.157524
Doc 359461, score is 13.157524
Doc 360857, score is 12.175998
Doc 114021, score is 9.994495
Doc 863522, score is 9.937239
Doc 863494, score is 9.902861
Doc 599962, score is 9.885215
Doc 969889, score is 9.791648
Doc 802136, score is 9.790344
Doc 77084, score is 9.769339
Doc 89548, score is 9.744069
Doc 1319849, score is 9.6186695
Doc 623630, score is 9.608261
Doc 971553, score is 9.583519
Doc 1283004, score is 9.556349
Doc 86909, score is 9.48445
Doc 1427859, score is 9.48445
Doc 77545, score is 9.461813
Doc 658433, score is 9.452979
arw49662
Total documents: 0
arw49674
Total documents: 10000
Doc 74265, score is 20.83848
Doc 876069, score is 19.587797
Doc 1115273, score is 17.429165
Doc 19214, score is 17.099628
Doc 19995, score is 17.099628
Doc 1509861, score is 16.007002
Doc 74270, score is 15.681253
Doc 1360296, score is 15.483906
Doc 681222, score is 15.483906
Doc 878912, score is 14.794117

arw49936
Total documents: 10000
Doc 951580, score is 17.564886
Doc 886237, score is 17.314306
Doc 1499101, score is 17.257875
Doc 1263409, score is 17.193613
Doc 18647, score is 17.174175
Doc 1342194, score is 17.15792
Doc 1380429, score is 17.114546
Doc 696668, score is 17.114546
Doc 435356, score is 16.669022
Doc 1345166, score is 16.165646
Doc 886078, score is 16.121595
Doc 969907, score is 16.121471
Doc 1401098, score is 16.043303
Doc 1488655, score is 15.7847185
Doc 1435053, score is 15.529593
Doc 1045967, score is 15.359879
Doc 1181639, score is 15.280785
Doc 1218350, score is 15.240352
Doc 1429081, score is 15.116403
Doc 1267206, score is 14.867378
arw49938
Total documents: 10000
Doc 108087, score is 15.39679
Doc 490775, score is 13.221358
Doc 593314, score is 13.119261
Doc 1522926, score is 13.119261
Doc 432781, score is 12.770277
Doc 636717, score is 12.770277
Doc 853616, score is 12.292774
Doc 527481, score is 12.292774
Doc 618444, score is 12.039942
Doc 417346, score is 11.7

arw50191
Total documents: 7461
Doc 402431, score is 28.38454
Doc 614686, score is 26.936474
Doc 1353937, score is 25.463774
Doc 694703, score is 25.209534
Doc 1252848, score is 24.926878
Doc 300630, score is 23.232151
Doc 1039968, score is 22.828705
Doc 1455592, score is 22.308514
Doc 1059765, score is 22.231182
Doc 624771, score is 22.02257
Doc 1138214, score is 21.67081
Doc 1455416, score is 21.42957
Doc 480187, score is 21.351116
Doc 254889, score is 21.259575
Doc 802169, score is 21.11596
Doc 1371639, score is 21.096764
Doc 1214163, score is 20.904005
Doc 13995, score is 20.760763
Doc 1345946, score is 20.195251
Doc 1387487, score is 20.13168
arw50197
Total documents: 10000
Doc 427952, score is 44.394287
Doc 77192, score is 38.25826
Doc 1286031, score is 36.802708
Doc 801663, score is 35.733795
Doc 1512211, score is 35.45205
Doc 1326805, score is 35.088627
Doc 48294, score is 34.84206
Doc 1259302, score is 34.795094
Doc 44902, score is 34.34107
Doc 401661, score is 33.87911
Doc 721

arw50498
Total documents: 99
Doc 1012024, score is 14.120238
Doc 845794, score is 14.095792
Doc 314169, score is 13.819838
Doc 703623, score is 13.486127
Doc 1139478, score is 13.393441
Doc 101476, score is 13.270543
Doc 376855, score is 13.166261
Doc 514284, score is 12.947877
Doc 427993, score is 12.722467
Doc 1378017, score is 12.722467
Doc 443498, score is 12.722467
Doc 482973, score is 12.695429
Doc 459625, score is 12.6533375
Doc 833497, score is 12.294401
Doc 680169, score is 12.068155
Doc 576081, score is 11.776447
Doc 5172, score is 11.774935
Doc 882083, score is 11.566755
Doc 949647, score is 11.566755
Doc 887228, score is 11.042431
arw50507
Total documents: 985
Doc 977476, score is 22.466837
Doc 944379, score is 20.615173
Doc 8972, score is 20.274193
Doc 1439417, score is 20.171535
Doc 443608, score is 19.73111
Doc 36886, score is 19.250051
Doc 822957, score is 19.250051
Doc 31717, score is 19.250051
Doc 350559, score is 18.779512
Doc 1266495, score is 18.693344
Doc 994663, 

arw50728
Total documents: 10000
Doc 313327, score is 23.244072
Doc 1060108, score is 20.04041
Doc 874668, score is 18.867714
Doc 1010162, score is 18.808899
Doc 768628, score is 17.922592
Doc 450889, score is 17.886162
Doc 278209, score is 17.55618
Doc 577472, score is 17.432629
Doc 1383988, score is 17.249052
Doc 1138488, score is 16.700634
Doc 1114155, score is 16.228905
Doc 1513088, score is 16.072678
Doc 1263379, score is 16.053314
Doc 1060544, score is 15.947378
Doc 1083822, score is 15.904657
Doc 1046368, score is 15.904657
Doc 774425, score is 15.872717
Doc 1508087, score is 15.847483
Doc 313325, score is 15.769871
Doc 996493, score is 15.628749
arw50761
Total documents: 10000
Doc 958941, score is 18.364445
Doc 1032474, score is 17.15096
Doc 1317725, score is 16.529587
Doc 1429959, score is 16.476248
Doc 1186222, score is 16.223711
Doc 1317710, score is 16.072165
Doc 45484, score is 15.745802
Doc 1364163, score is 15.473917
Doc 1138700, score is 15.370774
Doc 313251, score is 15

arw51002
Total documents: 1027
Doc 524586, score is 15.47694
Doc 814021, score is 11.81953
Doc 1101954, score is 11.6875
Doc 686382, score is 11.591975
Doc 956521, score is 11.481452
Doc 723829, score is 11.2277775
Doc 1273459, score is 11.111229
Doc 21863, score is 11.07958
Doc 914871, score is 10.96565
Doc 822495, score is 10.9004345
Doc 1004608, score is 10.688985
Doc 914843, score is 10.688985
Doc 597833, score is 10.459771
Doc 676332, score is 10.36115
Doc 1417983, score is 10.28896
Doc 356073, score is 10.16119
Doc 907642, score is 10.099969
Doc 887786, score is 10.099314
Doc 861415, score is 10.042082
Doc 437516, score is 10.036186
arw51035
Total documents: 10000
Doc 1043531, score is 21.205286
Doc 808104, score is 13.029863
Doc 899530, score is 12.697915
Doc 653718, score is 12.678439
Doc 2065, score is 12.671608
Doc 832570, score is 12.671043
Doc 626214, score is 12.650589
Doc 716906, score is 12.638705
Doc 524929, score is 12.621563
Doc 609615, score is 12.618474
Doc 1310244,

arw51343
Total documents: 2661
Doc 436246, score is 20.617016
Doc 91544, score is 20.605413
Doc 891518, score is 19.996414
Doc 826004, score is 19.908665
Doc 91719, score is 19.838318
Doc 286059, score is 19.677746
Doc 450036, score is 19.390705
Doc 826910, score is 19.390705
Doc 827074, score is 19.35343
Doc 826731, score is 19.252472
Doc 436395, score is 19.239092
Doc 824679, score is 19.175358
Doc 1124648, score is 18.94321
Doc 1332252, score is 18.94321
Doc 1350002, score is 18.94321
Doc 450001, score is 18.599983
Doc 799478, score is 18.57944
Doc 889942, score is 18.55902
Doc 1485359, score is 18.448568
Doc 1524540, score is 18.339462
arw51349
Total documents: 10000
Doc 1249277, score is 29.98082
Doc 703086, score is 28.389856
Doc 357907, score is 28.389856
Doc 1248159, score is 23.508427
Doc 997626, score is 23.485643
Doc 539311, score is 23.062439
Doc 1248340, score is 23.018188
Doc 1051576, score is 22.010176
Doc 1050860, score is 21.708986
Doc 540023, score is 21.378971
Doc 95

arw51835
Total documents: 10000
Doc 1211339, score is 14.206616
Doc 1209978, score is 14.136552
Doc 1484824, score is 14.136552
Doc 1331299, score is 13.998478
Doc 435396, score is 13.796349
Doc 939962, score is 13.664808
Doc 939966, score is 13.664808
Doc 287030, score is 13.552433
Doc 825659, score is 13.535753
Doc 286656, score is 13.535753
Doc 1228504, score is 13.535753
Doc 92709, score is 13.535753
Doc 826386, score is 13.535753
Doc 890698, score is 13.162809
Doc 1228092, score is 13.162809
Doc 278169, score is 12.779197
Doc 91980, score is 12.696386
Doc 574918, score is 12.619637
Doc 1072882, score is 12.5425
Doc 435769, score is 12.260839
arw51844
Total documents: 10000
Doc 307027, score is 20.458931
Doc 284803, score is 20.365223
Doc 9119, score is 20.34599
Doc 283844, score is 20.273438
Doc 1036895, score is 20.273438
Doc 1136520, score is 18.816038
Doc 12468, score is 17.379269
Doc 707211, score is 17.166214
Doc 1142033, score is 16.723942
Doc 284793, score is 16.545296
Doc 

arw52094
Total documents: 10000
Doc 1278253, score is 14.013835
Doc 466718, score is 13.961291
Doc 601701, score is 13.893696
Doc 963905, score is 13.874554
Doc 1098905, score is 13.795109
Doc 963890, score is 13.772605
Doc 1480865, score is 13.770399
Doc 880116, score is 13.746735
Doc 1315281, score is 13.744375
Doc 1367566, score is 13.744375
Doc 770799, score is 13.696906
Doc 1315358, score is 13.681378
Doc 678235, score is 13.657838
Doc 967055, score is 13.636177
Doc 51307, score is 13.613104
Doc 1065502, score is 13.607752
Doc 678264, score is 13.607334
Doc 1082056, score is 13.601533
Doc 1389319, score is 13.591533
Doc 771931, score is 13.590435
arw52095
Total documents: 10000
Doc 1505722, score is 13.312135
Doc 650174, score is 12.861571
Doc 33701, score is 12.836102
Doc 998105, score is 12.813145
Doc 665055, score is 12.504442
Doc 346445, score is 12.492001
Doc 1014329, score is 12.492001
Doc 1449354, score is 12.490936
Doc 372268, score is 12.466341
Doc 1374787, score is 12.40

arw52427
Total documents: 10000
Doc 1339737, score is 27.459026
Doc 1309514, score is 26.659983
Doc 506622, score is 21.519743
Doc 315355, score is 19.938263
Doc 1361342, score is 19.31596
Doc 1246831, score is 18.4904
Doc 851322, score is 18.313374
Doc 314152, score is 17.934439
Doc 561315, score is 17.653189
Doc 1248848, score is 17.22477
Doc 702650, score is 16.993715
Doc 404898, score is 16.65218
Doc 1030754, score is 16.65218
Doc 1504381, score is 16.207216
Doc 1051375, score is 16.020727
Doc 357493, score is 15.972159
Doc 1227027, score is 15.948584
Doc 788630, score is 15.78803
Doc 285894, score is 14.970431
Doc 709053, score is 14.741159
arw52446
Total documents: 10000
Doc 297527, score is 21.39814
Doc 940852, score is 19.78466
Doc 1214651, score is 18.764818
Doc 1360770, score is 18.637686
Doc 92754, score is 18.490255
Doc 335624, score is 18.490255
Doc 1045478, score is 18.473928
Doc 398411, score is 18.41442
Doc 296308, score is 18.211306
Doc 1281973, score is 18.181564
Doc 

arw52745
Total documents: 10000
Doc 690005, score is 27.052185
Doc 689892, score is 23.41669
Doc 1219882, score is 19.06687
Doc 295962, score is 18.166758
Doc 560583, score is 18.113495
Doc 28340, score is 18.063839
Doc 1136508, score is 18.00595
Doc 866249, score is 18.00595
Doc 866246, score is 18.00595
Doc 866800, score is 17.993471
Doc 1187172, score is 17.979156
Doc 560792, score is 17.94847
Doc 26045, score is 17.94847
Doc 1298820, score is 17.928268
Doc 1186914, score is 17.928268
Doc 1187292, score is 17.928268
Doc 870006, score is 17.928268
Doc 871507, score is 17.900597
Doc 870823, score is 17.893421
Doc 1186846, score is 17.891384
arw52760
Total documents: 10000
Doc 499534, score is 15.126755
Doc 1388043, score is 14.408034
Doc 883594, score is 14.408034
Doc 1333174, score is 13.506895
Doc 720955, score is 13.281884
Doc 875490, score is 13.126205
Doc 273021, score is 12.421305
Doc 984438, score is 11.889012
Doc 581038, score is 11.873187
Doc 399377, score is 11.873187
Doc 69

arw53079
Total documents: 10000
Doc 1387395, score is 15.467274
Doc 1145652, score is 14.0327425
Doc 878651, score is 13.669283
Doc 1284274, score is 13.352282
Doc 1122706, score is 12.960568
Doc 1398355, score is 12.374481
Doc 799553, score is 12.33318
Doc 458841, score is 12.318758
Doc 1405638, score is 12.318758
Doc 378732, score is 12.318758
Doc 817212, score is 12.255839
Doc 1519364, score is 12.2416315
Doc 1291257, score is 12.2416315
Doc 569737, score is 12.207338
Doc 1505576, score is 12.154677
Doc 558739, score is 12.142597
Doc 1396930, score is 12.048262
Doc 1249219, score is 12.026903
Doc 1398775, score is 12.013037
Doc 1086314, score is 11.96639
arw53136
Total documents: 9052
Doc 433584, score is 16.544277
Doc 1368507, score is 15.355385
Doc 618701, score is 15.190493
Doc 1407614, score is 15.190493
Doc 978411, score is 14.401328
Doc 1319669, score is 13.999574
Doc 628758, score is 13.336935
Doc 605168, score is 12.937
Doc 639955, score is 12.752202
Doc 881831, score is 12.

arw53594
Total documents: 10000
Doc 529965, score is 19.580694
Doc 887193, score is 17.333164
Doc 1336463, score is 16.30972
Doc 348577, score is 16.186203
Doc 494975, score is 16.12937
Doc 386900, score is 15.728544
Doc 471318, score is 15.477055
Doc 660565, score is 14.511826
Doc 29028, score is 14.503913
Doc 628270, score is 14.441107
Doc 421092, score is 14.324958
Doc 1401538, score is 14.080242
Doc 450057, score is 13.941151
Doc 1082001, score is 13.930275
Doc 494971, score is 13.566611
Doc 560753, score is 13.5312
Doc 1091517, score is 13.36917
Doc 812710, score is 13.33011
Doc 427123, score is 13.266834
Doc 650378, score is 13.128854
arw53610
Total documents: 10000
Doc 617818, score is 20.337803
Doc 807554, score is 19.148611
Doc 1041278, score is 17.005566
Doc 930947, score is 16.746483
Doc 930922, score is 16.746483
Doc 343713, score is 15.618769
Doc 352222, score is 14.8203
Doc 816108, score is 14.790633
Doc 998404, score is 14.519998
Doc 1164327, score is 14.467072
Doc 12584

arw53808
Total documents: 10000
Doc 252351, score is 19.662136
Doc 1378867, score is 18.95515
Doc 1203507, score is 18.83319
Doc 641080, score is 18.298159
Doc 74294, score is 18.223976
Doc 1199389, score is 18.174856
Doc 1045468, score is 16.463072
Doc 928585, score is 13.520149
Doc 998887, score is 13.263159
Doc 379814, score is 13.0871935
Doc 1167297, score is 12.991611
Doc 1275015, score is 12.920777
Doc 304557, score is 12.532711
Doc 1274561, score is 12.422636
Doc 937675, score is 12.042207
Doc 1258444, score is 11.798398
Doc 460393, score is 11.451637
Doc 704752, score is 11.2297945
Doc 643721, score is 11.016385
Doc 84754, score is 10.759702
arw53809
Total documents: 10000
Doc 1217813, score is 21.097252
Doc 1298824, score is 20.991528
Doc 930794, score is 20.898401
Doc 870119, score is 20.83421
Doc 1412691, score is 20.746597
Doc 841489, score is 20.319267
Doc 943160, score is 20.22963
Doc 33927, score is 20.17063
Doc 911877, score is 20.16763
Doc 286294, score is 20.114134
Do

arw54081
Total documents: 10000
Doc 685297, score is 10.422533
Doc 714249, score is 10.1083765
Doc 451814, score is 10.092985
Doc 715515, score is 10.018712
Doc 954507, score is 10.006412
Doc 1207340, score is 9.891705
Doc 453589, score is 9.881299
Doc 629463, score is 9.81776
Doc 454072, score is 9.758918
Doc 954594, score is 9.694588
Doc 473875, score is 9.65736
Doc 1293024, score is 9.645069
Doc 113254, score is 9.630265
Doc 1277550, score is 9.620456
Doc 1152415, score is 9.572268
Doc 723337, score is 9.471826
Doc 387391, score is 9.471561
Doc 1146925, score is 9.380521
Doc 475087, score is 9.373779
Doc 1237559, score is 9.234659
arw54087
Total documents: 10000
Doc 1093585, score is 18.831377
Doc 961150, score is 18.387264
Doc 877415, score is 18.387264
Doc 771099, score is 18.387264
Doc 771093, score is 18.361032
Doc 1065780, score is 18.31899
Doc 1076198, score is 18.31899
Doc 975145, score is 16.830765
Doc 1137757, score is 16.709953
Doc 560780, score is 16.630363
Doc 488455, sc

arw54507
Total documents: 10000
Doc 1482803, score is 19.913132
Doc 477089, score is 18.967505
Doc 108484, score is 14.885553
Doc 880765, score is 14.50629
Doc 1325046, score is 14.212941
Doc 1322196, score is 13.863758
Doc 804441, score is 13.863758
Doc 1350061, score is 13.761722
Doc 1090388, score is 13.424079
Doc 539721, score is 13.279341
Doc 1092677, score is 12.97655
Doc 495530, score is 12.835539
Doc 593390, score is 12.824046
Doc 263336, score is 12.512411
Doc 979492, score is 12.469386
Doc 979857, score is 12.415898
Doc 266194, score is 12.404301
Doc 554643, score is 12.393654
Doc 585484, score is 12.358443
Doc 1249829, score is 12.294459
arw54521
Total documents: 10000
Doc 330464, score is 27.272547
Doc 840596, score is 25.93009
Doc 659331, score is 24.81066
Doc 615312, score is 24.290325
Doc 567289, score is 20.719181
Doc 578736, score is 20.63322
Doc 398928, score is 20.614246
Doc 1353449, score is 20.590178
Doc 1130678, score is 20.438358
Doc 35669, score is 20.438358
Doc

arw54646
Total documents: 10000
Doc 502827, score is 17.097889
Doc 718079, score is 16.775677
Doc 399061, score is 15.752887
Doc 530989, score is 15.192515
Doc 1010637, score is 15.019305
Doc 1496180, score is 14.771245
Doc 366591, score is 14.669147
Doc 371972, score is 14.669147
Doc 1181725, score is 14.669147
Doc 1119468, score is 14.669147
Doc 527208, score is 14.435561
Doc 301003, score is 14.286673
Doc 437646, score is 14.2096615
Doc 372036, score is 14.2096615
Doc 708505, score is 13.991056
Doc 79160, score is 13.729832
Doc 708469, score is 13.574312
Doc 368323, score is 13.574312
Doc 1522172, score is 13.470052
Doc 842208, score is 13.379599
arw54670
Total documents: 10000
Doc 447908, score is 19.642878
Doc 449389, score is 18.981155
Doc 91088, score is 17.809252
Doc 1331956, score is 17.79448
Doc 287321, score is 17.175909
Doc 1210599, score is 16.257778
Doc 1331946, score is 15.874706
Doc 1338231, score is 15.332604
Doc 91084, score is 15.217207
Doc 939030, score is 15.133571

arw55045
Total documents: 10000
Doc 560338, score is 20.61656
Doc 1368869, score is 20.055172
Doc 560713, score is 19.805765
Doc 1359928, score is 18.078438
Doc 870823, score is 18.063389
Doc 1150903, score is 17.195816
Doc 716636, score is 16.46941
Doc 487763, score is 16.248089
Doc 1136535, score is 16.18457
Doc 1187590, score is 16.046165
Doc 28001, score is 16.046165
Doc 691220, score is 16.046165
Doc 870606, score is 15.771145
Doc 1361496, score is 15.541589
Doc 786506, score is 15.468013
Doc 935351, score is 15.379192
Doc 487577, score is 15.219715
Doc 509853, score is 15.02711
Doc 1361386, score is 14.799387
Doc 1101361, score is 14.776627
arw55066
Total documents: 33
Doc 1076586, score is 17.026875
Doc 992280, score is 16.843369
Doc 1272307, score is 15.856283
Doc 39719, score is 14.544349
Doc 1279376, score is 14.544349
Doc 675098, score is 14.013286
Doc 1034209, score is 12.482272
Doc 1432853, score is 12.411255
Doc 851074, score is 12.140716
Doc 665567, score is 11.881719
Do

arw55541
Total documents: 271
Doc 111496, score is 15.171892
Doc 887828, score is 13.976596
Doc 1249141, score is 13.559688
Doc 1170608, score is 13.453257
Doc 1292390, score is 13.323
Doc 411087, score is 13.211791
Doc 1054713, score is 13.123695
Doc 1260006, score is 12.978841
Doc 1136316, score is 12.893789
Doc 1137159, score is 12.893789
Doc 416647, score is 12.800365
Doc 1317118, score is 12.794581
Doc 1444574, score is 12.686441
Doc 701035, score is 12.481571
Doc 306636, score is 12.281586
Doc 26936, score is 12.281586
Doc 44306, score is 12.16879
Doc 588009, score is 12.141504
Doc 1469991, score is 12.141504
Doc 355845, score is 12.062973
arw55549
Total documents: 10000
Doc 1339076, score is 36.08385
Doc 833112, score is 35.974857
Doc 542026, score is 34.245052
Doc 845511, score is 33.85972
Doc 1091202, score is 32.880795
Doc 987180, score is 29.98332
Doc 719510, score is 28.716274
Doc 1192683, score is 27.926973
Doc 1425683, score is 27.891237
Doc 695290, score is 27.28514
Doc 

arw55894
Total documents: 10000
Doc 1218891, score is 20.921757
Doc 1460362, score is 19.238644
Doc 1460353, score is 18.766373
Doc 1084416, score is 18.388643
Doc 1037718, score is 18.32175
Doc 1080679, score is 17.658121
Doc 1426697, score is 16.530891
Doc 1162294, score is 16.205435
Doc 68019, score is 16.14149
Doc 1379381, score is 15.979725
Doc 962463, score is 15.770282
Doc 1049624, score is 15.360412
Doc 1368485, score is 15.066055
Doc 50633, score is 14.950456
Doc 51986, score is 14.950456
Doc 20003, score is 14.950456
Doc 52335, score is 14.950456
Doc 51541, score is 14.950456
Doc 313432, score is 14.759531
Doc 1279893, score is 14.694366
arw55895
Total documents: 10000
Doc 872151, score is 20.891739
Doc 1066363, score is 14.976164
Doc 1292144, score is 13.046305
Doc 1043316, score is 12.949042
Doc 1043510, score is 12.949042
Doc 74912, score is 12.914215
Doc 602149, score is 12.914215
Doc 865773, score is 12.914215
Doc 109302, score is 12.914215
Doc 79768, score is 12.914215


arw56440
Total documents: 10000
Doc 883233, score is 15.889874
Doc 777540, score is 15.709234
Doc 605379, score is 15.10577
Doc 785408, score is 14.840254
Doc 3573, score is 14.837943
Doc 994283, score is 14.774665
Doc 1347050, score is 14.322827
Doc 355721, score is 14.31106
Doc 1303850, score is 13.935759
Doc 650013, score is 13.750483
Doc 843065, score is 13.156113
Doc 464760, score is 12.834257
Doc 649189, score is 12.704887
Doc 1479374, score is 12.629555
Doc 1297246, score is 12.378482
Doc 896933, score is 12.235588
Doc 1461674, score is 12.179253
Doc 1102894, score is 12.166158
Doc 401504, score is 12.082325
Doc 1229179, score is 11.995369
arw56467
Total documents: 2030
Doc 437809, score is 20.085026
Doc 708706, score is 19.373314
Doc 484520, score is 18.710316
Doc 708705, score is 18.710316
Doc 1384084, score is 18.057026
Doc 365862, score is 17.945894
Doc 366769, score is 17.40793
Doc 832367, score is 17.40793
Doc 564768, score is 17.306805
Doc 537894, score is 17.103771
Doc 3

arw56839
Total documents: 10000
Doc 1418888, score is 17.88842
Doc 105576, score is 17.271183
Doc 1506193, score is 16.9691
Doc 854904, score is 16.651415
Doc 1276329, score is 15.897912
Doc 1402197, score is 15.731503
Doc 258755, score is 15.139602
Doc 1323154, score is 14.868531
Doc 257484, score is 14.817522
Doc 942393, score is 14.158992
Doc 566159, score is 14.067208
Doc 565179, score is 13.920359
Doc 104151, score is 13.8675585
Doc 559614, score is 13.696829
Doc 272157, score is 13.451131
Doc 999063, score is 13.451131
Doc 1372513, score is 13.435248
Doc 117308, score is 13.237711
Doc 1038112, score is 13.077997
Doc 942708, score is 12.995688
arw56840
Total documents: 10000
Doc 439031, score is 25.729567
Doc 93895, score is 20.811398
Doc 440880, score is 20.22519
Doc 1183156, score is 20.113506
Doc 1008521, score is 20.113506
Doc 1088691, score is 19.848148
Doc 1232461, score is 18.628374
Doc 376505, score is 18.259129
Doc 1339051, score is 17.400167
Doc 1502430, score is 16.1386

arw57399
Total documents: 10000
Doc 447204, score is 18.037216
Doc 486806, score is 15.136028
Doc 774588, score is 15.010398
Doc 1427009, score is 14.451853
Doc 1479526, score is 14.165406
Doc 1524205, score is 14.065344
Doc 581047, score is 14.065344
Doc 523230, score is 13.95774
Doc 1030731, score is 13.886916
Doc 539616, score is 13.886916
Doc 943077, score is 13.886916
Doc 1338393, score is 13.817274
Doc 1052417, score is 13.71247
Doc 647699, score is 13.699377
Doc 1052141, score is 13.6813
Doc 401581, score is 13.6813
Doc 917862, score is 13.6813
Doc 26831, score is 13.371774
Doc 807965, score is 13.292384
Doc 951519, score is 12.968853
arw57417
Total documents: 10000
Doc 1250172, score is 19.190989
Doc 270748, score is 19.190989
Doc 636238, score is 13.431664
Doc 345359, score is 12.921949
Doc 1463635, score is 12.756722
Doc 1335982, score is 12.375967
Doc 1277391, score is 11.544475
Doc 529180, score is 11.41073
Doc 886318, score is 11.41073
Doc 1258654, score is 10.983706
Doc 1

arw57647
Total documents: 1652
Doc 1521661, score is 18.827793
Doc 532031, score is 18.049656
Doc 721856, score is 17.249317
Doc 1277069, score is 15.252264
Doc 1032461, score is 14.927492
Doc 1371901, score is 14.717339
Doc 118087, score is 14.594564
Doc 1309098, score is 14.505798
Doc 1313005, score is 14.408232
Doc 1100474, score is 13.934862
Doc 1197646, score is 13.741484
Doc 1113172, score is 13.741484
Doc 518060, score is 13.345091
Doc 412874, score is 13.070509
Doc 1373554, score is 12.831212
Doc 116997, score is 12.496597
Doc 1100205, score is 12.496597
Doc 373941, score is 11.955669
Doc 1024181, score is 11.955669
Doc 941790, score is 11.475455
arw57701
Total documents: 3653
Doc 472928, score is 32.596184
Doc 1404303, score is 29.969292
Doc 472913, score is 28.624008
Doc 1149318, score is 28.12512
Doc 1187965, score is 27.430988
Doc 933692, score is 26.337883
Doc 1000301, score is 26.039583
Doc 1361878, score is 24.916798
Doc 1292335, score is 24.879787
Doc 1518333, score is 

arw57881
Total documents: 10000
Doc 575214, score is 20.189432
Doc 965044, score is 19.294333
Doc 585578, score is 15.903329
Doc 105132, score is 12.282671
Doc 1430591, score is 11.899367
Doc 347330, score is 11.838234
Doc 962880, score is 11.655708
Doc 799700, score is 9.450956
Doc 1158877, score is 9.32595
Doc 1243202, score is 9.041545
Doc 1407793, score is 8.761847
Doc 1144476, score is 8.07221
Doc 1311572, score is 7.240443
Doc 1094376, score is 7.235602
Doc 311924, score is 7.1314797
Doc 1162458, score is 7.101149
Doc 76199, score is 7.074702
Doc 865655, score is 7.0254793
Doc 660609, score is 7.004841
Doc 1190789, score is 7.004841
arw57932
Total documents: 10000
Doc 880639, score is 19.948145
Doc 986290, score is 19.63608
Doc 906465, score is 19.34823
Doc 395954, score is 19.078842
Doc 446652, score is 17.954287
Doc 486491, score is 17.615168
Doc 1086269, score is 17.168764
Doc 49459, score is 17.122644
Doc 1378167, score is 16.943407
Doc 827938, score is 16.86401
Doc 1479444, 

arw58309
Total documents: 10000
Doc 893805, score is 29.099545
Doc 893800, score is 28.63488
Doc 573076, score is 27.136295
Doc 1272980, score is 26.227573
Doc 14202, score is 25.629623
Doc 1347725, score is 25.349981
Doc 963247, score is 25.271112
Doc 1085068, score is 24.866455
Doc 1287074, score is 24.617167
Doc 1221750, score is 24.447718
Doc 65978, score is 24.185253
Doc 539628, score is 24.12385
Doc 899211, score is 24.114517
Doc 1109251, score is 23.7959
Doc 903306, score is 23.693695
Doc 1043979, score is 23.564892
Doc 1282616, score is 23.563654
Doc 773606, score is 23.215055
Doc 1157976, score is 23.201338
Doc 87671, score is 22.980484
arw58328
Total documents: 10000
Doc 1181049, score is 14.501312
Doc 1069025, score is 14.300163
Doc 774457, score is 13.869848
Doc 678850, score is 13.771218
Doc 876651, score is 13.689619
Doc 964505, score is 13.689619
Doc 1079497, score is 13.689619
Doc 44756, score is 13.499004
Doc 420852, score is 13.374075
Doc 1079506, score is 13.374075
D

arw58656
Total documents: 10000
Doc 1011797, score is 18.854252
Doc 888082, score is 18.648142
Doc 885267, score is 15.682592
Doc 720243, score is 15.160116
Doc 622524, score is 14.612402
Doc 1160367, score is 14.23526
Doc 1215054, score is 13.591521
Doc 1009296, score is 8.266839
Doc 1409610, score is 0.30556768
Doc 1077248, score is 0.30437756
Doc 1459068, score is 0.30426902
Doc 1234647, score is 0.30392897
Doc 320801, score is 0.30387834
Doc 1153649, score is 0.30386227
Doc 1050943, score is 0.3037649
Doc 452748, score is 0.30376276
Doc 1268249, score is 0.3036822
Doc 357297, score is 0.30361316
Doc 1101758, score is 0.303599
Doc 374920, score is 0.30359265
arw58729
Total documents: 6684
Doc 1227722, score is 18.671474
Doc 1380591, score is 17.906292
Doc 1382034, score is 17.851414
Doc 1153147, score is 17.747498
Doc 1396046, score is 15.178157
Doc 1203374, score is 14.774771
Doc 1289039, score is 14.190643
Doc 378067, score is 13.354945
Doc 574281, score is 13.175722
Doc 513355, s

arw58970
Total documents: 10000
Doc 1262228, score is 13.346936
Doc 1473606, score is 12.404811
Doc 1421173, score is 12.29347
Doc 672364, score is 11.570872
Doc 905038, score is 11.05001
Doc 1077331, score is 10.870287
Doc 661559, score is 10.839827
Doc 39399, score is 10.818637
Doc 1516902, score is 10.383684
Doc 626613, score is 10.38235
Doc 72651, score is 10.38235
Doc 887844, score is 10.319556
Doc 1262447, score is 10.283515
Doc 527356, score is 10.258385
Doc 1166149, score is 10.1987915
Doc 62632, score is 10.079145
Doc 296316, score is 10.019127
Doc 1227069, score is 9.959688
Doc 431289, score is 9.848082
Doc 462175, score is 9.848082
arw58974
Total documents: 9360
Doc 1519881, score is 16.95644
Doc 1393897, score is 16.459774
Doc 917546, score is 16.390667
Doc 665645, score is 16.266926
Doc 1308001, score is 15.754616
Doc 819031, score is 15.653839
Doc 1501015, score is 15.599531
Doc 1358149, score is 15.543877
Doc 1201133, score is 15.543877
Doc 1380743, score is 15.543877
Do

In [42]:
plain_text_res

(0.18016356694188573,
 0.3444619661098512,
 0.3020109689213894,
 0.20969783400006586)

In [43]:
from pymystem3 import Mystem
from nltk.corpus import stopwords
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
from string import punctuation
from nltk.corpus import stopwords

russian_stopwords = stopwords.words("russian")
english_stopwords = stopwords.words("english")
black_list = ["°", "№", "©", "...", "//", "://", "</", "\">", "=\"", "=\'", "\r", "\n", "\t"]
stem = Mystem()

def lemmatize(text):
    words = nltk.word_tokenize(text.lower())
    tokens = []
    for word in words:
        tokens.extend(stem.lemmatize(word))
    tokens = [token for token in tokens if token != " " and token.strip() not in punctuation \
              and token not in russian_stopwords and token not in english_stopwords \
              and token not in black_list \
              and token.find("\r") == -1 \
              and token.find("\n") == -1 \
              and token.find("\t") == -1 \
              and not (token.isdigit() and len(token) == 1)]
    return ' '.join(tokens)

def generate_queries_lemmas():
    relevances = get_relevance()
    xml_tree = etree.parse("data/web2008_adhoc.xml")
    root = xml_tree.getroot()
    res = []
    for task in tqdm(root.getchildren()):
        if task.get("id") is not None:
            for query_text in task.getchildren():
                try:
                    res.append(Query(task.get("id"), lemmatize(query_text.text), relevances[task.get("id")]))
                except:
                    pass
    print(len(res))
    return res



In [44]:
queries_lemmas = generate_queries_lemmas()

547


HBox(children=(IntProgress(value=0, max=29232), HTML(value='')))


547


In [74]:
queries_lemmas[8].query

'инвесткапиталбанк'

In [46]:
lemma_index = Index("lemma_docs", settings_1)

start = time.time()
lemma_index.add_documents("data/json_filtered_tokens_texts")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


0:02:57.629302


In [47]:
lemma_quality_checker = SearchQualityChecker(queries_lemmas, lemma_index)
lemma_res = lemma_quality_checker.get_results()

HBox(children=(IntProgress(value=0, max=547), HTML(value='')))




In [48]:
lemma_res

(0.21180581149352445,
 0.3860612166406112,
 0.3581352833638026,
 0.2536162197731625)

In [49]:
id_to_pagerank = {}
with open('res/pagerank.txt','r') as f:
    for line in f:
        docId, docURL, rank = line.split()
        id_to_pagerank[int(docId)] = float(rank)

In [50]:
len(id_to_pagerank)


199202

In [51]:
 for doc_name in tqdm(os.listdir("data/json_filtered_tokens_texts")):
        with open(f"data/json_filtered_tokens_texts/{doc_name}", "r+", encoding="utf-8") as inf:
            doc_id = int(''.join(list(filter(str.isdigit, doc_name))))
            doc = json.load(inf)
            try:
                doc["pagerank"] = id_to_pagerank.get(doc_id)
            except:
                pass
            inf.seek(0)        # <--- should reset file position to the beginning.
            json.dump(doc, inf, indent=4, ensure_ascii=False)
            inf.truncate()

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




In [5]:
settings_with_pagerank = {
    "mappings": {
        "properties": {
            "text": {
                "type": "text"
            },
            "pagerank": {
                "type": "rank_feature"
            }
        }
    }
}

In [6]:
pr_index = Index("pagerank_index", settings_with_pagerank)

In [7]:
pr_index.add_documents("data/json_filtered_tokens_texts")

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


('3 document(s) failed to index.', [{'index': {'_index': 'pagerank_index', '_type': '_doc', '_id': '1204092', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': "failed to parse field [pagerank] of type [rank_feature] in document with id '1204092'. Preview of field's value: '0'", 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'featureValue must be a positive normal float, got: 0.0for feature pagerank on field _feature which is less than the minimum positive normal float: 1.17549435E-38'}}, 'data': {'text': 'english version услуга продукт решение партнер новость компания группа iba контакт поиск главный новость добро пожаловать www kancler смотреть также подписываться наш новость добро пожаловать www kancler пакет прикладной программа ппп канцлер это новое поколение программный продукт наш компания платформа lotus domino notes предназначать создание система электронный документооборот сэд орган государственный управление крупный территориально распред

In [63]:
pr_quality_checker = SearchQualityChecker(queries_lemmas, pr_index)
pr_res = pr_quality_checker.get_results(pagerank_query)

HBox(children=(IntProgress(value=0, max=547), HTML(value='')))




In [64]:
pr_res

(0.20259482033279524,
 0.38557425220134395,
 0.3574040219378429,
 0.24368115581340624)

In [59]:
pr_index.get_doc_by_id(1000039)

{'text': 'сервис компания партнер описание рубрикатор мобильный переводчик sms инфо услуга позволять помощь sms запрос оперативно получать информация различный характер текстовый вид информация sms инфо мобильный доступный 24 час сутки весь территория зона действие сеть velcom ответ запрос приходить оперативно течение несколько секунда получение информация необходимо отправлять sms запрос латиница кириллица номер 511 кроме сформировывать sms запрос помощь sim карта новое образец velcom меню таблица транслитерация соответствие русский буква латинский представлять раздел инструкция sms инфо полный список рубрика sms инфо получать отправлять sms запрос help номер 511 стоимость запрос номер 511 составлять 110 рубль учет ндс налог продажа абонент подробно услуга sms инфо читать раздел инструкция sms инфо внимание информация получать вид mms сообщение помощь услуга mms инфо март поздравлять 23 февраль лотерея ваш лото день св валентина новость спорт деньги день рейтинг погода gismeteo киноне

In [None]:
settings_titles = {
        'mappings': {
            'properties': {
                'content': {
                    'type': 'text',
                },
                'title': {
                    'type': 'text'
                }
            }
            
        }
    }