In [3]:
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import os
from tqdm import tqdm_notebook as tqdm
import time
from lxml import etree
from sklearn.metrics import r2_score
from datetime import timedelta
import numpy as np

In [27]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

def pretty_print_result(search_result, fields=None):
    if fields is None:
        fields = []
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')


def get_score(search_result):
    res = []
    for hit in search_result['hits']['hits']:
        res.append((hit["_id"], hit["_score"]))
    res.sort(key = lambda x: x[1], reverse = True)
    return res


class Index:
    def __init__(self, index, settings):
        self.index_name = index
        self.settings = settings
        self.es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360}])
        if self.es.indices.exists(index=index):
            self.es.indices.delete(index=index)
        self.es.indices.create(index=index, body=settings)

    def es_actions_generator(self, path_to_docs):
        for doc_name in tqdm(os.listdir(path_to_docs)):
            with open(f"{path_to_docs}/{doc_name}", "r", encoding="utf-8") as inf:
                doc_id = int(''.join(list(filter(str.isdigit, doc_name))))
                doc = json.load(inf)           
            yield create_es_action(self.index_name, doc_id, doc)


    def add_documents(self, path_to_docs):
        try:
            for ok, result in parallel_bulk(self.es, self.es_actions_generator(path_to_docs), queue_size=4, thread_count=4,
                                        chunk_size=1000):
                  if not ok:
                     print(result)
        except Exception as e: 
            print(e)

    def get_doc_by_id(self, doc_id):
        return self.es.get(index=self.index_name, id=doc_id)['_source']

    def search(self, query, *args):
        return self.es.search(index=self.index_name, body=query, size=20)
        # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20



In [28]:
settings_1 = {
    "mappings": {
        "properties": {
            "text": {
                "type": "text"
            }
        }
    }
}

In [29]:
settings = {
        'mappings': {
            'properties': {
                'text': {
                    'type': 'text',
                    'analyzer': 'russian_complex',
                    'search_analyzer': 'russian_complex'
                }
            }
            
        },
        "settings": {
        "analysis" : {
            "analyzer" : {
                "my_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : ["lowercase", "russian_snow", "english_snow"]
                },
                'russian_complex': {
                    'char_filter': [
                        'yont'
                    ],
                    'tokenizer': 'word_longer_2',
                    'filter': [
                        'lowercase',
                        'russian_snow'
                    ]
                }
            },
            'char_filter': {
                'yont': {
                    'type': 'mapping',
                    'mappings': [
                        'ё => е'
                    ]
                }
            },
            'tokenizer': {
                'word_longer_2': {
                    'type': 'pattern',
                    'pattern': '[a-zA-Z_0-9\u0400-\u04FF]{2,}',
                    'group': 0
                },
                'white_20': {
                    'type': 'whitespace',
                    'max_token_length': 5
                }
            },
            "filter" : {
                "russian_snow" : {
                    "type" : "snowball",
                    "language" : "Russian"
                },
                "english_snow" : {
                    "type" : "snowball",
                    "language" : "English"
                }
            }
        }
    }
}

In [30]:
index = Index("docs", settings_1)

In [31]:
start = time.time()
index.add_documents("res/json")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200001), HTML(value='')))


0:03:38.548274


In [32]:
stem_index = Index('stem_docs', settings)

In [33]:
start = time.time()
stem_index.add_documents("res/json")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200001), HTML(value='')))

KeyboardInterrupt: 

In [None]:
stem_index.get_doc_by_id('1000039')

In [34]:
class Query:
    def __init__(self, task_id, query, relevant_docs):
        self.task_id = task_id
        self.query = query
        self.relevant_docs = relevant_docs


In [35]:

def json_query(query):
    return {
    'query': {
        'bool': {
            'should': [
                {
                    'match': {
                        'text': query.query
                    }
                }
            ]
        }
    }
}
        


In [36]:
def pagerank_query(query):
    return  {
    'query': {
        'bool': {
            'should': [
                {
                    'match': {
                        'text': query.query
                    }
                },
                {
                    'rank_feature': {
                        'field': 'pagerank',
                        'saturation': {
                            'pivot': 10
                        },
                        'boost': '5.0'
                    }
                }
            ]
        }
    }
}

In [37]:
class Metrics:
    def __init__(self, p, r, r_precision, map_score):
        self.r = r
        self.p = p
        self.r_precision = r_precision
        self.map_score = map_score
    
    def __str__(self):
        return f"r = {self.r}\np = {self.p}\nr_precision = {self.r_precision}\nMAP = {self.map_score}"
    
    __repr__ = __str__


class SearchQualityChecker:
    def __init__(self, queries, index):
        self.queries = queries
        self.index = index
        self.results = {}
        self.metrics = {}
        
    def get_results(self, get_query=json_query):
        r_precision_total = 0
        map_score_total = 0
        r_total = 0
        p_total = 0
        for q in tqdm(self.queries):
            res = self.index.search(get_query(q))
            print(q.task_id)
            pretty_print_result(res)
            scores = get_score(res)
            metric = Metrics(p=self.p(20, q, scores), r=self.r(20, q, scores), r_precision=self.r_precision(q, scores),
                            map_score=self.map_score(q, scores, 20))
            p_total += metric.p
            r_total += metric.r
            r_precision_total += metric.r_precision
            map_score_total += metric.map_score
            self.metrics[q.task_id] = metric
        Q = len(self.queries)
        print(Q)
        return Metrics(p=p_total / Q, r=r_total / Q, r_precision=r_precision_total / Q, map_score=map_score_total / Q)
    
    def r_precision(self, query, search_res_score):
        return self.r(len(query.relevant_docs), query, search_res_score)
    
    def map_score(self, query, search_res_score, n):
        m = 0
        for k in range(1, n):       
            m += self.p(k, query, search_res_score)
        R = len(query.relevant_docs)
        return m / n
    
    def p(self, k, query, search_res_score):
        r = 0
        for doc, _ in search_res_score[:k]:
            if doc in query.relevant_docs:
                r += 1
        return r / k
    
    def r(self, k, query, search_res_score):
        R = len(query.relevant_docs)
        r = 0
        for doc, _ in search_res_score[:k]:
            if doc in query.relevant_docs:
                r += 1
        return r / R if R != 0 else 0 if len(search_res_score) > 0 else 1



In [57]:
def find_diff_metrics(quality_checker, other_checker, k=20, comp=lambda x : x.map_score):
     res = []
     for task_id in quality_checker.metrics:
        metric = quality_checker.metrics[task_id]
        other_metric = other_checker.metrics[task_id]
        res.append((task_id, abs(comp(metric) - comp(other_metric)), metric, other_metric))
     res.sort(reverse=True, key=lambda x: x[1])
     return res

In [48]:
def get_relevance():
    res = {}
    xml_tree = etree.parse("data/or_relevant-minus_table.xml")
    root = xml_tree.getroot()
    for task in root.getchildren():
        relevant_docs = set()
        for document in task.getchildren():
            if document.get("relevance") == "vital":
                relevant_docs.add(document.get("id"))
        res[task.get("id")] = relevant_docs
    print(len(res))
    return res


def generate_queries_plain_texts():
    relevances = get_relevance()
    xml_tree = etree.parse("data/web2008_adhoc.xml")
    root = xml_tree.getroot()
    res = []
    for task in root.getchildren():
        if task.get("id") is not None:
            for query_text in task.getchildren():
                try:
                    res.append(Query(task.get("id"), query_text.text, relevances[task.get("id")]))
                except Exception as e:
                    print(e)
    print(len(res))
    return res

In [49]:
#queries = generate_queries_plain_texts()
#print(queries)
quality_checker = SearchQualityChecker(queries, index)
plain_text_res = quality_checker.get_results()

HBox(children=(IntProgress(value=0, max=547), HTML(value='')))

arw49633
Total documents: 1911
Doc 993272, score is 13.157524
Doc 990390, score is 13.157524
Doc 359461, score is 13.157524
Doc 360857, score is 12.175998
Doc 114021, score is 9.994495
Doc 863522, score is 9.937239
Doc 863494, score is 9.902861
Doc 599962, score is 9.885215
Doc 969889, score is 9.791648
Doc 802136, score is 9.790344
Doc 77084, score is 9.769339
Doc 89548, score is 9.744069
Doc 1319849, score is 9.6186695
Doc 623630, score is 9.608261
Doc 971553, score is 9.583519
Doc 1283004, score is 9.556349
Doc 1427859, score is 9.48445
Doc 86909, score is 9.48445
Doc 77545, score is 9.461813
Doc 658433, score is 9.452979
arw49662
Total documents: 0
arw49674
Total documents: 10000
Doc 74265, score is 20.83848
Doc 876069, score is 19.587797
Doc 1115273, score is 17.429165
Doc 19995, score is 17.099628
Doc 19214, score is 17.099628
Doc 1509861, score is 16.007002
Doc 74270, score is 15.681253
Doc 681222, score is 15.483906
Doc 1360296, score is 15.483906
Doc 878912, score is 14.794117

arw49944
Total documents: 9626
Doc 7317, score is 23.408134
Doc 942354, score is 22.699839
Doc 362460, score is 21.800516
Doc 1369472, score is 21.049526
Doc 860912, score is 19.23954
Doc 378916, score is 18.96234
Doc 1173677, score is 18.649426
Doc 1339324, score is 18.490355
Doc 1177576, score is 18.223392
Doc 1296174, score is 18.18085
Doc 1208015, score is 17.757048
Doc 1271351, score is 17.16899
Doc 1480002, score is 17.083025
Doc 857813, score is 16.92191
Doc 1467276, score is 16.744686
Doc 466123, score is 16.445835
Doc 378992, score is 16.202778
Doc 1309622, score is 15.204734
Doc 646295, score is 15.13085
Doc 1339827, score is 14.97822
arw49950
Total documents: 10000
Doc 489340, score is 24.557968
Doc 866689, score is 24.529598
Doc 1299659, score is 24.529598
Doc 269146, score is 23.233757
Doc 494355, score is 23.154278
Doc 1012870, score is 22.91935
Doc 265146, score is 22.587479
Doc 554088, score is 22.587479
Doc 586432, score is 22.379158
Doc 583987, score is 22.379158
Doc 

Doc 869740, score is 11.258926
Doc 772518, score is 11.098099
Doc 1153112, score is 10.972708
Doc 1151542, score is 10.855631
Doc 681812, score is 10.855631
Doc 875807, score is 10.602444
Doc 406748, score is 10.602444
arw50334
Total documents: 100
Doc 1417446, score is 13.323986
Doc 528825, score is 13.281955
Doc 1361098, score is 13.175722
Doc 962463, score is 12.746555
Doc 1460362, score is 12.555131
Doc 1460353, score is 12.139685
Doc 1004001, score is 11.968523
Doc 870369, score is 11.086902
Doc 87177, score is 10.9626665
Doc 1510788, score is 10.7207
Doc 1028943, score is 10.662691
Doc 540587, score is 10.662691
Doc 699734, score is 10.662691
Doc 51541, score is 10.492373
Doc 50633, score is 10.492373
Doc 51986, score is 10.492373
Doc 52335, score is 10.492373
Doc 1379381, score is 10.492373
Doc 20003, score is 10.492373
Doc 30273, score is 10.4352255
arw50340
Total documents: 90
Doc 1033624, score is 15.955619
Doc 382878, score is 15.824734
Doc 1467906, score is 15.546444
Doc 70

arw50640
Total documents: 10000
Doc 659122, score is 27.914932
Doc 1467824, score is 27.227854
Doc 525099, score is 26.654903
Doc 1083510, score is 25.425962
Doc 562614, score is 25.298504
Doc 437130, score is 24.784813
Doc 347413, score is 24.429518
Doc 63744, score is 24.409668
Doc 1509245, score is 24.305544
Doc 1241125, score is 24.095848
Doc 327487, score is 24.046532
Doc 1172102, score is 24.009014
Doc 414661, score is 23.79013
Doc 794798, score is 23.757137
Doc 797577, score is 23.631844
Doc 1213094, score is 23.582073
Doc 350749, score is 23.388721
Doc 987417, score is 23.358229
Doc 5256, score is 23.320189
Doc 1177738, score is 23.28132
arw50642
Total documents: 10000
Doc 587618, score is 30.704493
Doc 499796, score is 28.821005
Doc 1309114, score is 28.792028
Doc 833891, score is 28.571018
Doc 87687, score is 27.684397
Doc 667405, score is 27.168718
Doc 37719, score is 26.969955
Doc 1077038, score is 26.903975
Doc 1271751, score is 26.788918
Doc 1319406, score is 26.676342
Do

arw50992
Total documents: 10000
Doc 1486127, score is 20.211136
Doc 1089808, score is 19.603209
Doc 334467, score is 16.129107
Doc 542998, score is 16.103674
Doc 801873, score is 16.103674
Doc 536864, score is 15.039522
Doc 1189188, score is 14.620038
Doc 1137755, score is 14.620038
Doc 801484, score is 13.89085
Doc 1290210, score is 13.591004
Doc 98479, score is 13.591004
Doc 272140, score is 9.298777
Doc 1445097, score is 6.468836
Doc 334662, score is 6.093822
Doc 1228033, score is 6.086164
Doc 106557, score is 6.0330353
Doc 639087, score is 6.0330353
Doc 1040615, score is 6.0123744
Doc 543380, score is 6.0123744
Doc 1212644, score is 6.010377
arw51002
Total documents: 1027
Doc 524586, score is 15.47694
Doc 814021, score is 11.81953
Doc 1101954, score is 11.6875
Doc 686382, score is 11.591975
Doc 956521, score is 11.481452
Doc 723829, score is 11.2277775
Doc 1273459, score is 11.111229
Doc 21863, score is 11.07958
Doc 914871, score is 10.96565
Doc 822495, score is 10.9004345
Doc 9148

arw51261
Total documents: 10000
Doc 922588, score is 17.162914
Doc 49556, score is 15.936261
Doc 1314149, score is 15.840897
Doc 552845, score is 15.381245
Doc 1367552, score is 14.829798
Doc 691982, score is 14.630392
Doc 1281682, score is 14.559975
Doc 1194053, score is 14.534484
Doc 1453711, score is 14.374748
Doc 971134, score is 14.25202
Doc 948462, score is 13.64081
Doc 1172030, score is 13.4893875
Doc 1383249, score is 13.4893875
Doc 13422, score is 13.399129
Doc 1050925, score is 13.311316
Doc 1077082, score is 13.311316
Doc 1372649, score is 13.118333
Doc 588475, score is 13.013096
Doc 1290544, score is 12.867237
Doc 625753, score is 12.867237
arw51324
Total documents: 1102
Doc 1100421, score is 10.752583
Doc 1277822, score is 10.6715145
Doc 1090572, score is 10.6239605
Doc 536094, score is 10.389012
Doc 294952, score is 10.347915
Doc 431494, score is 10.285963
Doc 584536, score is 10.255918
Doc 51247, score is 10.246686
Doc 1498635, score is 10.181152
Doc 1038919, score is 10

arw51707
Total documents: 10000
Doc 973441, score is 29.061052
Doc 981946, score is 29.061052
Doc 973225, score is 28.106766
Doc 1290346, score is 26.442741
Doc 1278744, score is 24.119064
Doc 1361465, score is 23.969234
Doc 1511915, score is 23.768211
Doc 1348572, score is 23.408598
Doc 899699, score is 22.996086
Doc 1376405, score is 22.93291
Doc 900215, score is 22.886318
Doc 1511920, score is 22.640072
Doc 971176, score is 22.429548
Doc 658861, score is 22.327877
Doc 981951, score is 22.30986
Doc 626861, score is 21.881338
Doc 574168, score is 21.620586
Doc 1385691, score is 21.179123
Doc 1019039, score is 20.740513
Doc 1467340, score is 20.172121
arw51738
Total documents: 2109
Doc 1149549, score is 18.941196
Doc 847094, score is 15.833202
Doc 1203611, score is 15.281902
Doc 1276552, score is 14.94002
Doc 606458, score is 13.42048
Doc 349169, score is 13.06801
Doc 836216, score is 12.916482
Doc 1184950, score is 12.903099
Doc 607591, score is 12.011375
Doc 937551, score is 11.52724

arw52021
Total documents: 10000
Doc 91769, score is 26.309206
Doc 826968, score is 25.40319
Doc 1463340, score is 24.554974
Doc 262720, score is 23.374235
Doc 499282, score is 23.374235
Doc 265077, score is 22.791801
Doc 60678, score is 22.692688
Doc 978386, score is 22.660421
Doc 265076, score is 22.648115
Doc 1015587, score is 22.609379
Doc 1211046, score is 22.421856
Doc 553252, score is 22.38124
Doc 345911, score is 22.190826
Doc 1015586, score is 22.070807
Doc 347670, score is 22.058498
Doc 497150, score is 22.04501
Doc 978385, score is 21.901958
Doc 347672, score is 21.901958
Doc 553255, score is 21.88794
Doc 265078, score is 21.528446
arw52024
Total documents: 10000
Doc 1190396, score is 21.867609
Doc 944076, score is 21.86595
Doc 1045088, score is 21.40886
Doc 297533, score is 20.695402
Doc 345203, score is 20.522846
Doc 591826, score is 20.341042
Doc 1301374, score is 20.329113
Doc 513866, score is 20.288437
Doc 2038, score is 20.033426
Doc 944083, score is 19.687931
Doc 13013

arw52242
Total documents: 10000
Doc 473449, score is 33.764732
Doc 698416, score is 33.40769
Doc 1154407, score is 33.17814
Doc 1160229, score is 33.17814
Doc 519771, score is 32.899353
Doc 1249616, score is 31.416248
Doc 1151786, score is 30.104698
Doc 255157, score is 29.765295
Doc 88161, score is 29.420006
Doc 698930, score is 29.057316
Doc 1019561, score is 29.04709
Doc 1449807, score is 28.838648
Doc 1356527, score is 28.467165
Doc 846198, score is 28.373228
Doc 355689, score is 28.199564
Doc 987483, score is 28.158024
Doc 666604, score is 27.510475
Doc 1466682, score is 27.202248
Doc 579898, score is 26.943216
Doc 1095981, score is 26.912403
arw52248
Total documents: 7046
Doc 1266671, score is 30.682915
Doc 1498520, score is 25.1149
Doc 104318, score is 23.920715
Doc 1038706, score is 23.875668
Doc 1224444, score is 23.875668
Doc 105248, score is 23.374718
Doc 537841, score is 23.349453
Doc 1012588, score is 23.124964
Doc 1486654, score is 22.607388
Doc 104692, score is 22.462488

arw52658
Total documents: 1077
Doc 268139, score is 11.174965
Doc 444199, score is 11.053
Doc 342742, score is 11.034332
Doc 558824, score is 11.034332
Doc 361054, score is 10.954773
Doc 1468188, score is 10.936199
Doc 1232991, score is 10.92126
Doc 379989, score is 10.852321
Doc 1425658, score is 10.727658
Doc 924193, score is 10.724531
Doc 1244342, score is 10.540855
Doc 528610, score is 10.493166
Doc 660524, score is 10.477083
Doc 1449359, score is 10.174033
Doc 339884, score is 10.149076
Doc 455120, score is 10.149076
Doc 345368, score is 10.13599
Doc 1237208, score is 10.123482
Doc 685945, score is 10.123482
Doc 477104, score is 10.113241
arw52662
Total documents: 10000
Doc 1051244, score is 19.861017
Doc 1348263, score is 18.003328
Doc 1273303, score is 18.003328
Doc 381320, score is 18.003328
Doc 1346589, score is 18.003328
Doc 381321, score is 17.994059
Doc 88763, score is 17.82624
Doc 88962, score is 17.82624
Doc 1427074, score is 17.82624
Doc 1274818, score is 17.82624
Doc 89

arw52912
Total documents: 1086
Doc 1012041, score is 10.99682
Doc 886462, score is 10.914975
Doc 1303724, score is 10.87716
Doc 1111496, score is 10.80677
Doc 1103298, score is 10.707136
Doc 886360, score is 10.499869
Doc 577540, score is 10.406397
Doc 1161827, score is 10.305868
Doc 1131564, score is 10.27551
Doc 570157, score is 10.236144
Doc 1266973, score is 10.209791
Doc 1012038, score is 10.198736
Doc 1500905, score is 10.173619
Doc 1360579, score is 10.166444
Doc 634709, score is 10.158311
Doc 442089, score is 10.119838
Doc 1521902, score is 10.119838
Doc 118869, score is 10.107348
Doc 1503507, score is 10.098138
Doc 346069, score is 10.051152
arw52920
Total documents: 10000
Doc 264130, score is 17.815317
Doc 1483577, score is 15.918356
Doc 1499088, score is 15.572641
Doc 18575, score is 15.567059
Doc 280304, score is 15.298507
Doc 1271866, score is 15.298507
Doc 949644, score is 15.165854
Doc 1017426, score is 15.150962
Doc 674335, score is 14.978092
Doc 552288, score is 14.953

arw53305
Total documents: 10000
Doc 1361552, score is 19.693933
Doc 890717, score is 18.753351
Doc 1084506, score is 18.282314
Doc 925719, score is 18.282314
Doc 817301, score is 18.23365
Doc 570384, score is 18.073954
Doc 1136044, score is 17.963703
Doc 690787, score is 17.963703
Doc 919242, score is 17.691374
Doc 818230, score is 17.56844
Doc 65922, score is 17.517279
Doc 885456, score is 17.457247
Doc 1055390, score is 17.423624
Doc 1102167, score is 17.188303
Doc 910330, score is 17.100098
Doc 815965, score is 16.832191
Doc 37574, score is 16.79794
Doc 1469951, score is 16.775585
Doc 870816, score is 16.573742
Doc 688495, score is 16.530725
arw53317
Total documents: 723
Doc 264868, score is 11.754594
Doc 584257, score is 11.676565
Doc 583948, score is 11.57556
Doc 25265, score is 11.542558
Doc 496841, score is 11.537966
Doc 346704, score is 11.526178
Doc 512414, score is 11.497738
Doc 583112, score is 11.466701
Doc 1015283, score is 11.458538
Doc 1382185, score is 11.449308
Doc 346

arw53593
Total documents: 10000
Doc 1342791, score is 22.428156
Doc 1318209, score is 19.865807
Doc 322109, score is 19.229635
Doc 486179, score is 18.462109
Doc 377162, score is 18.449724
Doc 1341005, score is 18.093874
Doc 1343037, score is 17.739027
Doc 458993, score is 17.08294
Doc 943720, score is 15.94191
Doc 614914, score is 15.651144
Doc 1204054, score is 15.530867
Doc 1517160, score is 15.201702
Doc 767099, score is 15.190189
Doc 1196309, score is 14.847935
Doc 1516733, score is 14.649106
Doc 1252679, score is 14.649106
Doc 632616, score is 14.517059
Doc 40833, score is 14.452792
Doc 1231702, score is 14.412608
Doc 696938, score is 14.258832
arw53594
Total documents: 10000
Doc 529965, score is 19.580694
Doc 887193, score is 17.333164
Doc 1336463, score is 16.30972
Doc 348577, score is 16.186203
Doc 494975, score is 16.12937
Doc 386900, score is 15.728544
Doc 471318, score is 15.477055
Doc 660565, score is 14.511826
Doc 29028, score is 14.503913
Doc 628270, score is 14.441107
D

arw53765
Total documents: 466
Doc 25804, score is 28.748547
Doc 900028, score is 28.606152
Doc 508789, score is 28.606152
Doc 25879, score is 28.593828
Doc 252417, score is 28.302227
Doc 906314, score is 28.302227
Doc 114511, score is 28.111603
Doc 506074, score is 26.744505
Doc 294293, score is 25.702335
Doc 252410, score is 24.702942
Doc 814973, score is 24.603777
Doc 1442174, score is 24.480232
Doc 1151321, score is 22.301619
Doc 931575, score is 21.89486
Doc 785583, score is 21.812183
Doc 1466935, score is 18.144272
Doc 1032523, score is 18.144272
Doc 513678, score is 18.144272
Doc 1337482, score is 18.051394
Doc 1353754, score is 17.785835
arw53801
Total documents: 7090
Doc 1416489, score is 16.718613
Doc 913237, score is 16.718613
Doc 1447662, score is 16.718613
Doc 1118520, score is 16.69329
Doc 1414297, score is 16.687946
Doc 911078, score is 16.687946
Doc 911075, score is 16.687946
Doc 409084, score is 16.687946
Doc 1000976, score is 16.687946
Doc 1000980, score is 16.687946
D

arw53926
Total documents: 10000
Doc 1177584, score is 24.128721
Doc 857008, score is 23.632528
Doc 593289, score is 22.490389
Doc 855692, score is 22.09192
Doc 593367, score is 21.70517
Doc 478680, score is 21.543695
Doc 6489, score is 21.242926
Doc 1002382, score is 20.932987
Doc 1177642, score is 20.751787
Doc 407637, score is 20.062546
Doc 594256, score is 19.991873
Doc 31641, score is 19.956673
Doc 853162, score is 19.701347
Doc 1417354, score is 18.743649
Doc 912518, score is 18.114407
Doc 682110, score is 18.078485
Doc 822834, score is 17.808968
Doc 820219, score is 17.712477
Doc 785126, score is 17.704903
Doc 1412839, score is 17.400335
arw53977
Total documents: 2617
Doc 1005274, score is 18.219769
Doc 511525, score is 17.011576
Doc 253057, score is 16.780418
Doc 1274698, score is 16.430906
Doc 1353754, score is 16.180033
Doc 1374470, score is 16.091166
Doc 547570, score is 15.841507
Doc 548255, score is 15.49873
Doc 960976, score is 15.3659115
Doc 937338, score is 15.242329
Doc

arw54215
Total documents: 10000
Doc 880425, score is 24.230701
Doc 1164712, score is 23.94061
Doc 269989, score is 22.361134
Doc 256385, score is 22.298912
Doc 1167288, score is 22.231422
Doc 1163687, score is 21.502087
Doc 1302333, score is 20.838165
Doc 880393, score is 20.738384
Doc 529395, score is 20.664461
Doc 1431308, score is 20.640543
Doc 249778, score is 20.612118
Doc 1072424, score is 20.37719
Doc 118454, score is 20.211042
Doc 791654, score is 20.17599
Doc 714505, score is 20.165714
Doc 1199549, score is 20.093285
Doc 567955, score is 20.036255
Doc 416767, score is 20.036255
Doc 409459, score is 20.011595
Doc 690052, score is 19.842396
arw54217
Total documents: 10000
Doc 89138, score is 30.083782
Doc 968388, score is 26.099094
Doc 87856, score is 25.266155
Doc 5689, score is 24.465061
Doc 1426483, score is 23.375008
Doc 1274558, score is 23.296925
Doc 378858, score is 23.177826
Doc 380712, score is 22.80917
Doc 455888, score is 22.710972
Doc 1173814, score is 22.710972
Doc 

arw54521
Total documents: 10000
Doc 330464, score is 27.272547
Doc 840596, score is 25.93009
Doc 659331, score is 24.81066
Doc 615312, score is 24.290325
Doc 567289, score is 20.719181
Doc 578736, score is 20.63322
Doc 398928, score is 20.614246
Doc 1353449, score is 20.590178
Doc 1130678, score is 20.438358
Doc 35669, score is 20.438358
Doc 1047444, score is 19.909227
Doc 272602, score is 19.634527
Doc 669027, score is 19.610968
Doc 688033, score is 19.575314
Doc 924726, score is 19.481474
Doc 1019197, score is 19.30534
Doc 1143725, score is 19.275753
Doc 1054340, score is 19.014574
Doc 1367283, score is 18.875679
Doc 1278450, score is 18.328535
arw54539
Total documents: 10000
Doc 1021086, score is 14.681692
Doc 842844, score is 14.612167
Doc 1524488, score is 14.612167
Doc 601863, score is 14.487108
Doc 287787, score is 14.360635
Doc 599281, score is 14.346095
Doc 1330961, score is 14.1785965
Doc 1330130, score is 13.278646
Doc 362193, score is 13.055929
Doc 285594, score is 12.60108

arw54689
Total documents: 10000
Doc 494026, score is 53.966084
Doc 1108884, score is 53.921078
Doc 493534, score is 53.74503
Doc 720166, score is 52.102768
Doc 935103, score is 51.9352
Doc 493651, score is 51.9352
Doc 510107, score is 51.85098
Doc 315769, score is 51.418472
Doc 935844, score is 51.372864
Doc 935604, score is 50.406345
Doc 430703, score is 49.87857
Doc 510205, score is 48.935257
Doc 655238, score is 48.673306
Doc 1106889, score is 47.10712
Doc 1108558, score is 46.126293
Doc 611670, score is 45.86368
Doc 286429, score is 34.51128
Doc 890072, score is 34.51128
Doc 825472, score is 34.51128
Doc 256395, score is 29.69086
arw54690
Total documents: 10000
Doc 1085052, score is 33.251137
Doc 951227, score is 32.4597
Doc 490032, score is 31.42192
Doc 577457, score is 29.994658
Doc 1460234, score is 29.299063
Doc 1365564, score is 27.797058
Doc 1420762, score is 27.645182
Doc 1085050, score is 27.635612
Doc 951226, score is 27.599525
Doc 1238206, score is 27.536411
Doc 65937, sc

arw55023
Total documents: 10000
Doc 32213, score is 16.412521
Doc 1412691, score is 16.24975
Doc 1001471, score is 16.027082
Doc 33927, score is 16.022879
Doc 409451, score is 15.989201
Doc 275284, score is 15.946986
Doc 819119, score is 15.946108
Doc 286294, score is 15.934877
Doc 868223, score is 15.90553
Doc 911888, score is 15.885907
Doc 37290, score is 15.866585
Doc 781520, score is 15.845396
Doc 276947, score is 15.832124
Doc 1134971, score is 15.762526
Doc 1277638, score is 15.743525
Doc 1449374, score is 15.7307005
Doc 276533, score is 15.7158165
Doc 1131523, score is 15.661043
Doc 1298824, score is 15.641056
Doc 1484438, score is 15.640758
arw55029
Total documents: 500
Doc 1137879, score is 21.939053
Doc 1274977, score is 14.490084
Doc 945666, score is 13.669575
Doc 661647, score is 13.274825
Doc 456527, score is 13.127411
Doc 1249348, score is 12.82178
Doc 1301051, score is 12.748534
Doc 1517394, score is 12.690199
Doc 625724, score is 12.589632
Doc 309081, score is 12.464432

arw55279
Total documents: 10000
Doc 434022, score is 20.422787
Doc 1116174, score is 19.950666
Doc 1503571, score is 18.31614
Doc 1229604, score is 18.126
Doc 450041, score is 17.545767
Doc 262844, score is 17.349432
Doc 891598, score is 16.415735
Doc 1364571, score is 15.958765
Doc 1359784, score is 15.416969
Doc 562581, score is 15.214747
Doc 786691, score is 14.869734
Doc 282757, score is 14.543275
Doc 1035746, score is 14.543275
Doc 1114885, score is 14.024897
Doc 259878, score is 14.024897
Doc 553219, score is 13.584633
Doc 1364226, score is 13.362502
Doc 1161713, score is 12.579705
Doc 1351493, score is 12.260697
Doc 703336, score is 12.154256
arw55289
Total documents: 6
Doc 606468, score is 16.000986
Doc 1499189, score is 15.809934
Doc 1039709, score is 12.288523
Doc 1228759, score is 11.293904
Doc 1175399, score is 5.1604414
Doc 395157, score is 2.37619
arw55347
Total documents: 1131
Doc 691960, score is 21.966454
Doc 1390117, score is 21.245077
Doc 863639, score is 21.195393
D

arw55577
Total documents: 10000
Doc 1450667, score is 5.5565906
Doc 337109, score is 5.5521684
Doc 964830, score is 5.543637
Doc 879636, score is 5.543637
Doc 883776, score is 5.543637
Doc 556009, score is 5.5397325
Doc 1262318, score is 5.5393105
Doc 1010646, score is 5.5168953
Doc 1495388, score is 5.513728
Doc 19739, score is 5.513512
Doc 1050924, score is 5.5133295
Doc 436955, score is 5.5111876
Doc 844942, score is 5.5099325
Doc 1509214, score is 5.5099325
Doc 20426, score is 5.5099325
Doc 1180992, score is 5.508147
Doc 1203550, score is 5.505351
Doc 460452, score is 5.4822826
Doc 1314778, score is 5.477102
Doc 927456, score is 5.471763
arw55590
Total documents: 9736
Doc 845511, score is 20.40667
Doc 1339076, score is 18.387228
Doc 542026, score is 18.120113
Doc 558739, score is 15.768151
Doc 513029, score is 15.353232
Doc 833112, score is 15.204534
Doc 1051631, score is 15.115694
Doc 1480118, score is 14.804653
Doc 368331, score is 14.700967
Doc 869478, score is 14.6347475
Doc 64

arw55815
Total documents: 7505
Doc 602094, score is 7.069868
Doc 1016100, score is 7.044832
Doc 776934, score is 7.0439997
Doc 26902, score is 7.039356
Doc 1263982, score is 7.0358706
Doc 499198, score is 7.030027
Doc 811918, score is 7.0000763
Doc 97915, score is 6.9774623
Doc 28419, score is 6.9636436
Doc 1314127, score is 6.9614778
Doc 553768, score is 6.959434
Doc 586097, score is 6.959434
Doc 1362107, score is 6.9552307
Doc 287545, score is 6.947352
Doc 266176, score is 6.947352
Doc 1315274, score is 6.9428434
Doc 641634, score is 6.942332
Doc 607827, score is 6.939466
Doc 961622, score is 6.9324512
Doc 1180688, score is 6.9286857
arw55856
Total documents: 10000
Doc 834035, score is 23.549618
Doc 1176535, score is 19.541033
Doc 1481006, score is 19.347952
Doc 1310530, score is 19.19634
Doc 287335, score is 17.81851
Doc 1228769, score is 17.81851
Doc 794341, score is 17.798008
Doc 103730, score is 16.947235
Doc 288280, score is 16.24774
Doc 940593, score is 16.034643
Doc 335329, sc

arw56173
Total documents: 10000
Doc 1489138, score is 22.296854
Doc 381986, score is 22.14376
Doc 1377453, score is 21.250582
Doc 1490424, score is 20.940008
Doc 1217591, score is 20.266653
Doc 623515, score is 19.44732
Doc 86304, score is 19.319592
Doc 1214944, score is 19.284071
Doc 869040, score is 19.250141
Doc 1112775, score is 19.250141
Doc 44152, score is 19.250141
Doc 1315604, score is 19.069729
Doc 1338395, score is 18.930765
Doc 1487170, score is 18.742647
Doc 1249712, score is 18.522486
Doc 991842, score is 18.30818
Doc 549928, score is 17.988754
Doc 1029311, score is 17.53175
Doc 1262714, score is 17.43184
Doc 1260779, score is 16.41144
arw56192
Total documents: 3956
Doc 1000332, score is 8.45192
Doc 1411661, score is 8.45192
Doc 378775, score is 8.409225
Doc 1055180, score is 8.384908
Doc 1374855, score is 8.36358
Doc 525314, score is 8.351188
Doc 503424, score is 8.31775
Doc 984070, score is 8.295311
Doc 1298617, score is 8.279227
Doc 1462057, score is 8.279227
Doc 694264

arw56597
Total documents: 10000
Doc 1409610, score is 0.30556768
Doc 1077248, score is 0.30437756
Doc 1459068, score is 0.30426902
Doc 1234647, score is 0.30392897
Doc 320801, score is 0.30387834
Doc 1153649, score is 0.30386227
Doc 1050943, score is 0.3037649
Doc 452748, score is 0.30376276
Doc 1268249, score is 0.3036822
Doc 357297, score is 0.30361316
Doc 1101758, score is 0.303599
Doc 374920, score is 0.30359265
Doc 1401396, score is 0.30352962
Doc 794043, score is 0.30351302
Doc 1415217, score is 0.30348232
Doc 376051, score is 0.30347097
Doc 1276713, score is 0.30346283
Doc 597248, score is 0.30345318
Doc 1244361, score is 0.30344826
Doc 258954, score is 0.30330715
arw56607
Total documents: 40
Doc 1209565, score is 14.226777
Doc 1483833, score is 13.98334
Doc 523092, score is 13.662513
Doc 42297, score is 11.685933
Doc 422434, score is 11.685933
Doc 441042, score is 11.685933
Doc 418013, score is 11.685933
Doc 676903, score is 11.622284
Doc 681562, score is 11.622284
Doc 842307, 

arw56834
Total documents: 10000
Doc 1417437, score is 19.871393
Doc 1300210, score is 19.846777
Doc 485173, score is 19.194553
Doc 1102020, score is 16.139612
Doc 1435370, score is 15.406837
Doc 269403, score is 15.256485
Doc 975280, score is 15.155065
Doc 1450667, score is 15.066439
Doc 557506, score is 15.031763
Doc 626613, score is 14.908583
Doc 1136419, score is 14.85356
Doc 822133, score is 14.766443
Doc 816783, score is 14.558392
Doc 850850, score is 14.520383
Doc 953598, score is 14.457537
Doc 1473606, score is 14.309714
Doc 867452, score is 14.269127
Doc 1227026, score is 14.22765
Doc 789899, score is 13.723457
Doc 1463738, score is 13.702687
arw56839
Total documents: 10000
Doc 1418888, score is 17.88842
Doc 105576, score is 17.271183
Doc 1506193, score is 16.9691
Doc 854904, score is 16.651415
Doc 1276329, score is 15.897912
Doc 1402197, score is 15.731503
Doc 258755, score is 15.139602
Doc 1323154, score is 14.868531
Doc 257484, score is 14.817522
Doc 942393, score is 14.1589

arw57208
Total documents: 21
Doc 560699, score is 16.317055
Doc 869933, score is 16.157982
Doc 1136944, score is 14.408125
Doc 1135481, score is 14.149885
Doc 560380, score is 13.81593
Doc 295504, score is 13.267935
Doc 1299025, score is 12.97388
Doc 1187196, score is 12.97388
Doc 1361207, score is 12.760273
Doc 1361711, score is 12.692576
Doc 865884, score is 12.55645
Doc 560970, score is 12.292774
Doc 561544, score is 8.786911
Doc 1137064, score is 8.786911
Doc 871561, score is 8.786911
Doc 296764, score is 8.786911
Doc 1187336, score is 8.530809
Doc 689905, score is 8.530809
Doc 1135960, score is 8.289214
Doc 27126, score is 4.6231756
arw57216
Total documents: 10000
Doc 1100286, score is 13.258922
Doc 1408411, score is 12.313218
Doc 892478, score is 12.220003
Doc 1494517, score is 11.975804
Doc 498329, score is 11.721535
Doc 1484753, score is 11.597618
Doc 1490164, score is 11.540541
Doc 1452378, score is 11.377218
Doc 663166, score is 11.165039
Doc 921130, score is 11.120546
Doc 57

arw57618
Total documents: 3956
Doc 791703, score is 27.35584
Doc 1000332, score is 8.45192
Doc 1411661, score is 8.45192
Doc 378775, score is 8.409225
Doc 1055180, score is 8.384908
Doc 1374855, score is 8.36358
Doc 525314, score is 8.351188
Doc 503424, score is 8.31775
Doc 984070, score is 8.295311
Doc 1298617, score is 8.279227
Doc 1462057, score is 8.279227
Doc 694264, score is 8.279227
Doc 984382, score is 8.279227
Doc 692716, score is 8.279227
Doc 1453247, score is 8.264429
Doc 1422083, score is 8.253533
Doc 437414, score is 8.2478485
Doc 723595, score is 8.231344
Doc 791701, score is 8.217637
Doc 1376865, score is 8.1996975
arw57620
Total documents: 10000
Doc 1462411, score is 18.638817
Doc 1297346, score is 18.202877
Doc 1467505, score is 17.966133
Doc 462612, score is 17.966133
Doc 1446671, score is 16.907497
Doc 357361, score is 16.715446
Doc 647918, score is 16.202995
Doc 570822, score is 16.048801
Doc 1123039, score is 15.081566
Doc 702965, score is 14.983066
Doc 335557, sco

arw57881
Total documents: 10000
Doc 575214, score is 20.189432
Doc 965044, score is 19.294333
Doc 585578, score is 15.903329
Doc 105132, score is 12.282671
Doc 1430591, score is 11.899367
Doc 347330, score is 11.838234
Doc 962880, score is 11.655708
Doc 799700, score is 9.450956
Doc 1158877, score is 9.32595
Doc 1243202, score is 9.041545
Doc 1407793, score is 8.761847
Doc 1144476, score is 8.07221
Doc 1311572, score is 7.240443
Doc 1094376, score is 7.235602
Doc 311924, score is 7.1314797
Doc 1162458, score is 7.101149
Doc 76199, score is 7.074702
Doc 865655, score is 7.0254793
Doc 660609, score is 7.004841
Doc 1190789, score is 7.004841
arw57932
Total documents: 10000
Doc 880639, score is 19.948145
Doc 986290, score is 19.63608
Doc 906465, score is 19.34823
Doc 395954, score is 19.078842
Doc 446652, score is 17.954287
Doc 486491, score is 17.615168
Doc 1086269, score is 17.168764
Doc 49459, score is 17.122644
Doc 1378167, score is 16.943407
Doc 827938, score is 16.86401
Doc 1479444, 

arw58309
Total documents: 10000
Doc 893805, score is 29.099545
Doc 893800, score is 28.63488
Doc 573076, score is 27.136295
Doc 1272980, score is 26.227573
Doc 14202, score is 25.629623
Doc 1347725, score is 25.349981
Doc 963247, score is 25.271112
Doc 1085068, score is 24.866455
Doc 1287074, score is 24.617167
Doc 1221750, score is 24.447718
Doc 65978, score is 24.185253
Doc 539628, score is 24.12385
Doc 899211, score is 24.114517
Doc 1109251, score is 23.7959
Doc 903306, score is 23.693695
Doc 1043979, score is 23.564892
Doc 1282616, score is 23.563654
Doc 773606, score is 23.215055
Doc 1157976, score is 23.201338
Doc 87671, score is 22.980484
arw58328
Total documents: 10000
Doc 1181049, score is 14.501312
Doc 1069025, score is 14.300163
Doc 774457, score is 13.869848
Doc 678850, score is 13.771218
Doc 964505, score is 13.689619
Doc 876651, score is 13.689619
Doc 1079497, score is 13.689619
Doc 44756, score is 13.499004
Doc 420852, score is 13.374075
Doc 1079506, score is 13.374075
D

arw58642
Total documents: 10000
Doc 1491107, score is 20.404606
Doc 1297031, score is 14.113967
Doc 261338, score is 13.839907
Doc 684577, score is 12.944818
Doc 1408446, score is 12.636912
Doc 801491, score is 12.566297
Doc 599684, score is 12.552715
Doc 543015, score is 12.500737
Doc 863196, score is 12.500737
Doc 282598, score is 12.494591
Doc 293718, score is 12.385689
Doc 63780, score is 12.352356
Doc 1285445, score is 12.337606
Doc 1095927, score is 12.331781
Doc 647203, score is 12.173464
Doc 1345705, score is 12.019936
Doc 80777, score is 11.947611
Doc 1431940, score is 11.947611
Doc 444782, score is 11.801967
Doc 1140736, score is 11.796758
arw58656
Total documents: 10000
Doc 1011797, score is 18.854252
Doc 888082, score is 18.648142
Doc 885267, score is 15.682592
Doc 720243, score is 15.160116
Doc 622524, score is 14.612402
Doc 1160367, score is 14.23526
Doc 1215054, score is 13.591521
Doc 1009296, score is 8.266839
Doc 1409610, score is 0.30556768
Doc 1077248, score is 0.304

arw59116
Total documents: 5176
Doc 528773, score is 33.121437
Doc 476884, score is 32.836548
Doc 853932, score is 32.345642
Doc 527818, score is 29.908985
Doc 8377, score is 29.198898
Doc 363156, score is 28.15946
Doc 7810, score is 28.13807
Doc 1421242, score is 27.748856
Doc 529918, score is 25.342451
Doc 1327859, score is 25.20743
Doc 120056, score is 24.127384
Doc 800512, score is 23.981722
Doc 1143981, score is 23.776539
Doc 646927, score is 23.53299
Doc 1461932, score is 23.53299
Doc 1179158, score is 22.676788
Doc 1327100, score is 22.53567
Doc 855154, score is 22.473766
Doc 1483779, score is 22.142532
Doc 1368721, score is 22.067005
arw59118
Total documents: 10000
Doc 1102707, score is 22.294468
Doc 1267950, score is 20.357536
Doc 256824, score is 18.739723
Doc 604411, score is 18.726316
Doc 1276050, score is 17.700895
Doc 1401043, score is 16.954275
Doc 257572, score is 16.81118
Doc 1004627, score is 16.290197
Doc 565785, score is 15.953986
Doc 565501, score is 15.930285
Doc 1

In [41]:
plain_text_res

r = 0.20970577689512557
p = 0.30219378427787935
r_precision = 0.1801915553804202
MAP = 0.344494963276825




In [50]:
from pymystem3 import Mystem
from nltk.corpus import stopwords
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
from string import punctuation
from nltk.corpus import stopwords

russian_stopwords = stopwords.words("russian")
english_stopwords = stopwords.words("english")
black_list = ["°", "№", "©", "...", "//", "://", "</", "\">", "=\"", "=\'", "\r", "\n", "\t"]
stem = Mystem()

def lemmatize(text):
    words = nltk.word_tokenize(text.lower())
    tokens = []
    for word in words:
        tokens.extend(stem.lemmatize(word))
    tokens = [token for token in tokens if token != " " and token.strip() not in punctuation \
              and token not in russian_stopwords and token not in english_stopwords \
              and token not in black_list \
              and token.find("\r") == -1 \
              and token.find("\n") == -1 \
              and token.find("\t") == -1 \
              and not (token.isdigit() and len(token) == 1)]
    return ' '.join(tokens)

def generate_queries_lemmas():
    relevances = get_relevance()
    xml_tree = etree.parse("data/web2008_adhoc.xml")
    root = xml_tree.getroot()
    res = []
    for task in tqdm(root.getchildren()):
        if task.get("id") is not None:
            for query_text in task.getchildren():
                try:
                    res.append(Query(task.get("id"), lemmatize(query_text.text), relevances[task.get("id")]))
                except:
                    pass
    print(len(res))
    return res



In [51]:
queries_lemmas = generate_queries_lemmas()

547


HBox(children=(IntProgress(value=0, max=29232), HTML(value='')))


547


In [None]:
queries_lemmas[8].query

In [52]:
lemma_index = Index("lemma_docs", settings_1)

start = time.time()
lemma_index.add_documents("data/json_filtered_tokens_texts")
elapsed = time.time() - start
print(str(timedelta(seconds=elapsed)))

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))


0:02:46.363651


In [53]:
lemma_quality_checker = SearchQualityChecker(queries_lemmas, lemma_index)
lemma_res = lemma_quality_checker.get_results()

HBox(children=(IntProgress(value=0, max=547), HTML(value='')))

arw49633
Total documents: 5080
Doc 914238, score is 13.799336
Doc 1393503, score is 13.701229
Doc 1407642, score is 13.604507
Doc 567839, score is 13.571222
Doc 1385740, score is 13.571222
Doc 1504166, score is 12.54204
Doc 1016244, score is 12.132923
Doc 993272, score is 11.921361
Doc 990390, score is 11.921361
Doc 359461, score is 11.921361
Doc 360857, score is 11.359218
Doc 495806, score is 10.947564
Doc 854005, score is 10.142758
Doc 691503, score is 9.414591
Doc 1340878, score is 9.019914
Doc 1319901, score is 7.9332757
Doc 35586, score is 7.8946466
Doc 114021, score is 7.8873105
Doc 599962, score is 7.8450575
Doc 863522, score is 7.8382044
arw49662
Total documents: 0
arw49674
Total documents: 10000
Doc 1503640, score is 22.287449
Doc 674640, score is 22.287449
Doc 876069, score is 21.912388
Doc 49202, score is 20.201603
Doc 575107, score is 19.590178
Doc 1109272, score is 19.590178
Doc 664883, score is 19.063917
Doc 74265, score is 18.712236
Doc 284872, score is 18.100986
Doc 131

arw49938
Total documents: 10000
Doc 1304391, score is 15.79537
Doc 1370851, score is 11.94859
Doc 593233, score is 11.929664
Doc 108087, score is 11.817169
Doc 417346, score is 11.62208
Doc 824462, score is 11.510288
Doc 1052266, score is 11.509953
Doc 657686, score is 11.491143
Doc 918846, score is 11.426479
Doc 541451, score is 11.406113
Doc 1304086, score is 11.33633
Doc 541404, score is 11.316824
Doc 1300065, score is 11.224773
Doc 362534, score is 11.203213
Doc 1178551, score is 11.174843
Doc 362535, score is 11.17198
Doc 1184140, score is 11.16831
Doc 970087, score is 11.16831
Doc 857427, score is 11.110881
Doc 694597, score is 11.052099
arw49944
Total documents: 10000
Doc 7317, score is 20.949793
Doc 942354, score is 20.436937
Doc 624744, score is 19.993721
Doc 1173677, score is 19.758795
Doc 1480002, score is 19.615366
Doc 362460, score is 19.473494
Doc 1339324, score is 19.33974
Doc 1309622, score is 19.226002
Doc 1369472, score is 19.144844
Doc 1296174, score is 19.084396
Doc

arw50231
Total documents: 8
Doc 1225583, score is 14.690051
Doc 774111, score is 14.279586
Doc 1285919, score is 14.153366
Doc 623607, score is 13.208949
Doc 44774, score is 8.222673
Doc 877305, score is 6.9237366
Doc 768002, score is 6.9237366
Doc 38757, score is 6.9237366
arw50276
Total documents: 10000
Doc 4053, score is 22.949055
Doc 675632, score is 19.641657
Doc 878847, score is 19.247711
Doc 771209, score is 19.166359
Doc 422593, score is 18.929113
Doc 26891, score is 18.830061
Doc 47141, score is 18.696009
Doc 649450, score is 18.668884
Doc 1449392, score is 18.485468
Doc 815863, score is 18.419012
Doc 417668, score is 18.243214
Doc 1365208, score is 18.068588
Doc 679077, score is 17.922443
Doc 988276, score is 17.8506
Doc 280888, score is 17.714481
Doc 968919, score is 17.714481
Doc 41888, score is 17.666801
Doc 1253795, score is 17.597158
Doc 474809, score is 17.428158
Doc 522355, score is 17.425522
arw50280
Total documents: 9877
Doc 858190, score is 19.791473
Doc 969597, sco

arw50632
Total documents: 10000
Doc 24817, score is 21.761023
Doc 485757, score is 21.694103
Doc 257703, score is 21.353956
Doc 1402977, score is 20.872845
Doc 375369, score is 20.75485
Doc 1300095, score is 20.236612
Doc 489207, score is 20.236612
Doc 1329866, score is 20.090866
Doc 258258, score is 19.309658
Doc 293657, score is 19.295757
Doc 565729, score is 19.29205
Doc 1437819, score is 19.105576
Doc 494515, score is 18.657513
Doc 513816, score is 18.608042
Doc 1189020, score is 18.546522
Doc 1436220, score is 18.482197
Doc 1518370, score is 18.437813
Doc 1301645, score is 18.40222
Doc 943357, score is 18.383879
Doc 822492, score is 18.367775
arw50640
Total documents: 10000
Doc 659122, score is 24.24823
Doc 522523, score is 24.189165
Doc 112621, score is 24.086151
Doc 624498, score is 23.89555
Doc 665356, score is 23.714025
Doc 1467824, score is 23.695723
Doc 350749, score is 23.679964
Doc 1083510, score is 23.577042
Doc 463550, score is 23.558205
Doc 1358897, score is 23.46224
Do

arw50992
Total documents: 10000
Doc 1486127, score is 19.565475
Doc 1089808, score is 19.132927
Doc 334467, score is 15.8987
Doc 801873, score is 14.812623
Doc 542998, score is 14.812623
Doc 536864, score is 14.433851
Doc 1137755, score is 13.6394005
Doc 1189188, score is 13.6394005
Doc 98479, score is 12.620554
Doc 1290210, score is 12.620554
Doc 801484, score is 12.551125
Doc 272140, score is 8.664051
Doc 1445097, score is 5.631472
Doc 1356545, score is 5.550957
Doc 543380, score is 5.5301948
Doc 1040615, score is 5.5301948
Doc 106557, score is 5.527626
Doc 639087, score is 5.527626
Doc 71973, score is 5.524686
Doc 1319632, score is 5.524686
arw51002
Total documents: 4237
Doc 1185833, score is 16.427021
Doc 1230185, score is 15.08635
Doc 1030052, score is 14.435724
Doc 669017, score is 12.603048
Doc 8599, score is 12.269979
Doc 524586, score is 11.966372
Doc 826408, score is 11.139467
Doc 446178, score is 11.13147
Doc 1238116, score is 11.117227
Doc 671795, score is 11.003828
Doc 109

arw51349
Total documents: 10000
Doc 541402, score is 25.554665
Doc 354813, score is 25.554665
Doc 896613, score is 25.4735
Doc 814745, score is 25.371973
Doc 1050860, score is 24.86758
Doc 1249277, score is 24.819841
Doc 1518543, score is 24.686565
Doc 1515926, score is 24.663258
Doc 484424, score is 24.323034
Doc 909967, score is 24.267609
Doc 1226632, score is 24.18976
Doc 440527, score is 23.81388
Doc 1166257, score is 23.736593
Doc 355728, score is 23.73097
Doc 648121, score is 23.727432
Doc 539311, score is 23.657415
Doc 1452063, score is 23.614717
Doc 32824, score is 22.971523
Doc 778209, score is 22.933285
Doc 550755, score is 22.789783
arw51424
Total documents: 79
Doc 769095, score is 15.068143
Doc 797336, score is 14.752416
Doc 5272, score is 14.663641
Doc 774553, score is 14.529088
Doc 1127008, score is 14.529088
Doc 574814, score is 14.425873
Doc 1127150, score is 14.425873
Doc 312153, score is 14.425873
Doc 418551, score is 14.124842
Doc 572440, score is 14.124842
Doc 12662

arw51844
Total documents: 10000
Doc 283844, score is 20.656525
Doc 1036895, score is 20.56541
Doc 707211, score is 19.891476
Doc 307027, score is 19.802536
Doc 9119, score is 19.74901
Doc 284803, score is 19.694746
Doc 1136520, score is 19.01958
Doc 687759, score is 17.054613
Doc 1142025, score is 16.944338
Doc 12468, score is 16.896263
Doc 284793, score is 16.835588
Doc 306018, score is 16.835588
Doc 12231, score is 16.728334
Doc 1142033, score is 16.648512
Doc 1035903, score is 16.622538
Doc 1036900, score is 16.622538
Doc 12467, score is 16.610743
Doc 1035908, score is 16.610743
Doc 1035916, score is 16.610743
Doc 1470357, score is 16.41521
arw51855
Total documents: 10000
Doc 1171944, score is 11.543243
Doc 1170549, score is 11.519812
Doc 1212350, score is 11.433384
Doc 1403978, score is 11.374843
Doc 1357476, score is 11.340864
Doc 67144, score is 11.316428
Doc 1417998, score is 11.31015
Doc 850960, score is 11.30537
Doc 1170664, score is 11.281133
Doc 1445783, score is 11.256857
D

arw52125
Total documents: 3316
Doc 261838, score is 14.896479
Doc 1227844, score is 14.736691
Doc 1225588, score is 14.439638
Doc 1100542, score is 13.993925
Doc 667525, score is 13.146842
Doc 1506536, score is 12.821537
Doc 1376937, score is 12.794041
Doc 1385812, score is 12.691777
Doc 453319, score is 12.532604
Doc 664704, score is 12.485756
Doc 1101715, score is 12.485756
Doc 1273645, score is 12.253114
Doc 898929, score is 12.241103
Doc 84873, score is 12.227117
Doc 1273557, score is 12.221868
Doc 83211, score is 12.221203
Doc 688036, score is 12.217633
Doc 1478666, score is 12.216946
Doc 546301, score is 12.216207
Doc 1425779, score is 12.215445
arw52130
Total documents: 23
Doc 1305188, score is 17.766418
Doc 1008317, score is 17.205988
Doc 1214185, score is 14.578485
Doc 724306, score is 12.380008
Doc 24615, score is 12.218867
Doc 1028017, score is 12.218867
Doc 1231145, score is 12.218867
Doc 512355, score is 12.218867
Doc 25790, score is 12.218867
Doc 1007549, score is 12.0618

arw52490
Total documents: 10000
Doc 320801, score is 2.696746
Doc 1153649, score is 2.696456
Doc 1234647, score is 2.691239
Doc 1279604, score is 2.6868446
Doc 1219475, score is 2.6436272
Doc 1310659, score is 2.6386907
Doc 1479695, score is 2.6386907
Doc 1509458, score is 2.631515
Doc 833499, score is 2.631515
Doc 988104, score is 2.631515
Doc 1509922, score is 2.631515
Doc 845065, score is 2.6278067
Doc 612082, score is 2.625306
Doc 531583, score is 2.625306
Doc 1509591, score is 2.6243782
Doc 1510760, score is 2.6243782
Doc 1510270, score is 2.6243782
Doc 846067, score is 2.6243782
Doc 1510066, score is 2.6243782
Doc 1176640, score is 2.6243782
arw52503
Total documents: 10000
Doc 28001, score is 17.93972
Doc 691220, score is 17.935331
Doc 1030702, score is 17.235563
Doc 703709, score is 16.71531
Doc 688532, score is 16.52903
Doc 278631, score is 16.309853
Doc 1364509, score is 16.225624
Doc 265034, score is 16.10484
Doc 1080294, score is 16.09547
Doc 1081638, score is 16.09547
Doc 1

arw52825
Total documents: 10000
Doc 288543, score is 14.531702
Doc 1518195, score is 14.472599
Doc 462094, score is 14.452526
Doc 1237714, score is 14.085773
Doc 288482, score is 14.070376
Doc 1365089, score is 13.944577
Doc 1206903, score is 13.872034
Doc 823108, score is 13.753472
Doc 377105, score is 13.329754
Doc 49782, score is 13.162613
Doc 1143489, score is 13.075568
Doc 1272205, score is 12.648964
Doc 72642, score is 12.490864
Doc 1332956, score is 12.342947
Doc 593618, score is 12.331185
Doc 286752, score is 12.008669
Doc 1331400, score is 12.008669
Doc 285916, score is 12.008669
Doc 869853, score is 11.908874
Doc 610184, score is 11.863944
arw52834
Total documents: 10000
Doc 345476, score is 28.598526
Doc 807641, score is 27.912489
Doc 624697, score is 27.534595
Doc 426362, score is 26.587685
Doc 1503723, score is 26.386587
Doc 583955, score is 26.119305
Doc 19775, score is 25.57252
Doc 883557, score is 25.078926
Doc 772506, score is 25.078926
Doc 470525, score is 24.995953
D

arw53202
Total documents: 10000
Doc 1179266, score is 15.871748
Doc 252211, score is 15.743658
Doc 957839, score is 15.606548
Doc 964732, score is 15.457533
Doc 890680, score is 15.445134
Doc 110519, score is 15.424434
Doc 491764, score is 15.414047
Doc 1281358, score is 15.402998
Doc 1246318, score is 15.286074
Doc 1059764, score is 15.165575
Doc 553854, score is 15.079774
Doc 855361, score is 14.927911
Doc 365814, score is 14.927911
Doc 658900, score is 14.801195
Doc 374603, score is 14.777653
Doc 299311, score is 14.543029
Doc 34112, score is 14.525992
Doc 1051806, score is 14.471552
Doc 972894, score is 14.4515705
Doc 94172, score is 14.437599
arw53207
Total documents: 10000
Doc 82135, score is 14.379086
Doc 1225746, score is 13.890604
Doc 428042, score is 12.768988
Doc 391149, score is 12.105948
Doc 1056685, score is 12.086409
Doc 1474358, score is 12.086409
Doc 1456018, score is 12.078672
Doc 15150, score is 12.005146
Doc 519948, score is 11.998784
Doc 632994, score is 11.895075


arw53594
Total documents: 10000
Doc 887067, score is 18.742044
Doc 1143662, score is 18.69343
Doc 1014947, score is 18.399586
Doc 478074, score is 18.193825
Doc 891644, score is 17.681221
Doc 416615, score is 17.48112
Doc 631012, score is 17.264503
Doc 299755, score is 17.254484
Doc 1273343, score is 17.246012
Doc 1082001, score is 16.82966
Doc 1232727, score is 16.71434
Doc 885722, score is 16.71434
Doc 1462044, score is 16.562183
Doc 29028, score is 16.519894
Doc 916333, score is 16.417727
Doc 54824, score is 16.258955
Doc 950909, score is 16.17251
Doc 1014954, score is 16.069464
Doc 618530, score is 16.02259
Doc 1014948, score is 15.759073
arw53610
Total documents: 2164
Doc 617818, score is 18.056519
Doc 1190205, score is 17.898804
Doc 1297910, score is 17.151234
Doc 807554, score is 17.117218
Doc 1258456, score is 16.885162
Doc 638017, score is 16.227814
Doc 1502005, score is 16.027704
Doc 1277709, score is 16.026857
Doc 1276309, score is 16.026857
Doc 626081, score is 15.110659
Do

arw53874
Total documents: 10000
Doc 1243992, score is 15.1493435
Doc 363194, score is 14.330003
Doc 1082011, score is 14.247816
Doc 1384168, score is 13.161551
Doc 413648, score is 13.161551
Doc 888436, score is 12.72921
Doc 1323148, score is 12.584463
Doc 814253, score is 12.352543
Doc 624489, score is 12.314097
Doc 23662, score is 12.28207
Doc 698939, score is 12.138827
Doc 904734, score is 12.133778
Doc 337040, score is 11.889465
Doc 251837, score is 11.888153
Doc 447004, score is 11.847031
Doc 1029665, score is 11.824385
Doc 1301101, score is 11.798386
Doc 918205, score is 11.695078
Doc 1194336, score is 11.6236515
Doc 698927, score is 11.608623
arw53876
Total documents: 10000
Doc 554392, score is 23.624575
Doc 678744, score is 20.030088
Doc 1369847, score is 19.903555
Doc 1461280, score is 19.903555
Doc 363797, score is 19.903555
Doc 855105, score is 19.724104
Doc 1214419, score is 19.557959
Doc 292590, score is 18.94745
Doc 478939, score is 18.786568
Doc 855899, score is 18.60909

arw54238
Total documents: 4264
Doc 1505359, score is 18.87997
Doc 497748, score is 17.740402
Doc 1524367, score is 17.176983
Doc 268587, score is 17.017044
Doc 981327, score is 17.005535
Doc 267892, score is 16.76175
Doc 1419767, score is 16.6264
Doc 782228, score is 16.58788
Doc 1000040, score is 16.58788
Doc 986343, score is 16.557121
Doc 1083332, score is 16.333414
Doc 449460, score is 16.131054
Doc 301766, score is 15.955451
Doc 979176, score is 15.7656765
Doc 1013946, score is 15.759623
Doc 55593, score is 15.614643
Doc 584607, score is 15.442999
Doc 1013964, score is 15.132832
Doc 1459879, score is 15.037942
Doc 945237, score is 14.4169235
arw54264
Total documents: 5631
Doc 1047380, score is 18.664583
Doc 1081990, score is 18.664583
Doc 481881, score is 18.035116
Doc 1071454, score is 18.035116
Doc 932644, score is 17.485239
Doc 423331, score is 17.252821
Doc 1359716, score is 17.252821
Doc 464815, score is 17.252821
Doc 1503691, score is 17.252821
Doc 677778, score is 17.252821


arw54584
Total documents: 10000
Doc 613412, score is 19.046606
Doc 306927, score is 18.436026
Doc 1108610, score is 17.722757
Doc 868160, score is 17.377474
Doc 628972, score is 16.521915
Doc 296099, score is 16.240162
Doc 306930, score is 15.083913
Doc 1472176, score is 14.953192
Doc 300024, score is 14.783103
Doc 1218891, score is 14.526531
Doc 1150901, score is 14.504645
Doc 708620, score is 14.388118
Doc 1294194, score is 14.280734
Doc 1490298, score is 14.07993
Doc 1522186, score is 14.025143
Doc 65389, score is 13.8825245
Doc 1217978, score is 13.821773
Doc 1490439, score is 13.793146
Doc 1481638, score is 13.764902
Doc 396990, score is 13.722465
arw54588
Total documents: 10000
Doc 562653, score is 18.594326
Doc 818227, score is 13.822156
Doc 1482597, score is 13.440811
Doc 543417, score is 13.286085
Doc 874482, score is 12.609288
Doc 112463, score is 12.31119
Doc 1517136, score is 11.99952
Doc 264644, score is 11.547705
Doc 1302457, score is 11.547705
Doc 659744, score is 11.145

arw54862
Total documents: 10000
Doc 374968, score is 15.148007
Doc 1328953, score is 14.877043
Doc 528143, score is 14.867136
Doc 530071, score is 14.801147
Doc 871084, score is 14.779827
Doc 1370460, score is 14.703693
Doc 1460630, score is 14.6133795
Doc 314928, score is 14.603029
Doc 528566, score is 14.56616
Doc 1145609, score is 14.49601
Doc 951349, score is 14.474516
Doc 374711, score is 14.369734
Doc 1272198, score is 14.255692
Doc 1100027, score is 14.181939
Doc 364275, score is 14.051626
Doc 1473519, score is 13.971083
Doc 649091, score is 13.905801
Doc 1109090, score is 13.894958
Doc 888405, score is 13.884058
Doc 802202, score is 13.871987
arw54881
Total documents: 1466
Doc 473539, score is 17.834948
Doc 711160, score is 14.328013
Doc 1469951, score is 13.264419
Doc 657910, score is 12.613114
Doc 511651, score is 10.326382
Doc 511647, score is 10.326382
Doc 724978, score is 10.22026
Doc 510826, score is 10.212114
Doc 24519, score is 10.212114
Doc 1418793, score is 10.19719
D

arw55221
Total documents: 1135
Doc 427282, score is 24.92881
Doc 1435523, score is 21.125961
Doc 858190, score is 20.863605
Doc 793249, score is 20.157042
Doc 629099, score is 19.368465
Doc 1455930, score is 16.631586
Doc 1489436, score is 14.46824
Doc 1116227, score is 13.637297
Doc 1500149, score is 13.598094
Doc 615755, score is 13.285145
Doc 1419667, score is 13.079536
Doc 535633, score is 12.861548
Doc 280554, score is 12.057722
Doc 892580, score is 11.984461
Doc 1033519, score is 10.895615
Doc 47426, score is 10.8689575
Doc 85381, score is 10.68723
Doc 83594, score is 10.440433
Doc 1392373, score is 10.411607
Doc 549727, score is 10.313179
arw55274
Total documents: 10000
Doc 535520, score is 16.643396
Doc 950415, score is 16.359968
Doc 968524, score is 16.280754
Doc 1510267, score is 14.935328
Doc 1487548, score is 14.395701
Doc 1372566, score is 13.766995
Doc 1402806, score is 13.766995
Doc 257830, score is 13.618306
Doc 1101286, score is 13.472795
Doc 1373762, score is 13.44934

arw55673
Total documents: 10000
Doc 1132006, score is 76.845
Doc 1169835, score is 76.57254
Doc 410696, score is 76.17692
Doc 274438, score is 74.32992
Doc 412106, score is 72.32881
Doc 412271, score is 70.95854
Doc 1416166, score is 70.89632
Doc 1418447, score is 70.812546
Doc 277139, score is 70.24293
Doc 36604, score is 68.54698
Doc 1134027, score is 68.48841
Doc 1413856, score is 67.53546
Doc 1004089, score is 66.690674
Doc 1418458, score is 66.30113
Doc 820202, score is 66.19211
Doc 779640, score is 65.33687
Doc 1448460, score is 64.77015
Doc 910595, score is 64.75357
Doc 410691, score is 64.711334
Doc 274655, score is 64.70859
arw55684
Total documents: 3139
Doc 652767, score is 14.749732
Doc 1219990, score is 13.303504
Doc 1297896, score is 12.740871
Doc 571068, score is 12.740871
Doc 45112, score is 12.740871
Doc 1387413, score is 12.646253
Doc 648531, score is 12.646253
Doc 345319, score is 12.553028
Doc 356020, score is 12.46117
Doc 254301, score is 12.3706455
Doc 539881, scor

arw56114
Total documents: 10000
Doc 794247, score is 12.821131
Doc 886473, score is 12.819908
Doc 1012218, score is 12.789115
Doc 1267915, score is 12.785375
Doc 597916, score is 12.72398
Doc 596580, score is 12.6220455
Doc 1011625, score is 12.574554
Doc 883988, score is 12.558659
Doc 1411759, score is 12.084833
Doc 549725, score is 11.824541
Doc 428202, score is 11.824541
Doc 62773, score is 11.814307
Doc 638773, score is 11.794165
Doc 580288, score is 11.771905
Doc 973417, score is 11.764775
Doc 1187629, score is 11.661938
Doc 87096, score is 11.596517
Doc 1516874, score is 11.58624
Doc 625186, score is 11.58624
Doc 928972, score is 11.525633
arw56131
Total documents: 4
Doc 811534, score is 15.245528
Doc 1116842, score is 14.534079
Doc 96744, score is 13.4049
Doc 667861, score is 7.7736864
arw56143
Total documents: 10000
Doc 1323032, score is 22.81255
Doc 918836, score is 21.268724
Doc 1264434, score is 20.882956
Doc 405244, score is 20.521042
Doc 1350157, score is 20.356344
Doc 357

arw56645
Total documents: 15
Doc 1213181, score is 13.122666
Doc 1295584, score is 12.148546
Doc 766762, score is 11.855202
Doc 869235, score is 10.354983
Doc 20057, score is 9.935871
Doc 355940, score is 9.549367
Doc 1480370, score is 8.551416
Doc 396873, score is 8.551416
Doc 1176501, score is 8.551416
Doc 1339760, score is 8.551416
Doc 1511348, score is 8.263557
Doc 1317959, score is 6.1824217
Doc 446389, score is 3.8533492
Doc 403499, score is 3.0798457
Doc 1170086, score is 1.0973514
arw56656
Total documents: 6
Doc 972455, score is 9.32278
Doc 1193751, score is 9.32278
Doc 973247, score is 9.32278
Doc 1519880, score is 9.32278
Doc 1052812, score is 8.441898
Doc 809352, score is 7.5064793
arw56686
Total documents: 10000
Doc 1132195, score is 28.733358
Doc 1489835, score is 23.076305
Doc 1261352, score is 22.944237
Doc 605048, score is 22.118538
Doc 61046, score is 22.118538
Doc 823570, score is 21.95973
Doc 1084474, score is 21.53808
Doc 1483833, score is 21.140429
Doc 1209565, sco

arw56982
Total documents: 3878
Doc 1283497, score is 8.61468
Doc 116714, score is 8.550953
Doc 304337, score is 8.462748
Doc 707159, score is 8.462748
Doc 8709, score is 8.462748
Doc 1033175, score is 8.462748
Doc 306537, score is 8.462748
Doc 1467567, score is 8.462748
Doc 12424, score is 8.462748
Doc 386185, score is 8.462748
Doc 462679, score is 8.462748
Doc 281090, score is 8.462748
Doc 1410419, score is 8.462748
Doc 281097, score is 8.462748
Doc 284751, score is 8.462748
Doc 1142952, score is 8.462748
Doc 1033176, score is 8.455419
Doc 1035741, score is 8.455419
Doc 1142041, score is 8.432699
Doc 443429, score is 8.429588
arw57015
Total documents: 10000
Doc 1350157, score is 20.856085
Doc 487039, score is 19.766352
Doc 964134, score is 19.766352
Doc 26062, score is 19.766352
Doc 1299717, score is 19.292267
Doc 1110630, score is 18.975859
Doc 650625, score is 18.951672
Doc 11583, score is 18.59583
Doc 912032, score is 18.542833
Doc 1412613, score is 18.270765
Doc 296990, score is 1

arw57417
Total documents: 10000
Doc 1250172, score is 17.13884
Doc 270748, score is 17.13884
Doc 529180, score is 15.173453
Doc 117275, score is 14.925998
Doc 345359, score is 14.206284
Doc 1446165, score is 14.139196
Doc 1373279, score is 13.975466
Doc 1205903, score is 13.844126
Doc 1290902, score is 13.732079
Doc 799066, score is 13.6098585
Doc 1500936, score is 13.497383
Doc 938740, score is 13.087098
Doc 404319, score is 13.030219
Doc 1463635, score is 12.729358
Doc 1481180, score is 12.5147
Doc 918333, score is 12.500273
Doc 406685, score is 12.500273
Doc 1204381, score is 12.32405
Doc 1128686, score is 12.03842
Doc 1031262, score is 12.03842
arw57445
Total documents: 7571
Doc 918145, score is 25.650448
Doc 1323042, score is 24.675495
Doc 1049729, score is 24.30177
Doc 1031693, score is 24.301134
Doc 357376, score is 24.186193
Doc 702462, score is 23.840342
Doc 1250683, score is 23.543264
Doc 1323051, score is 22.437233
Doc 916219, score is 22.328756
Doc 540715, score is 21.52824

arw57724
Total documents: 2792
Doc 1477991, score is 15.483676
Doc 967070, score is 15.035362
Doc 777035, score is 13.091854
Doc 406471, score is 12.325357
Doc 807096, score is 12.041454
Doc 894758, score is 11.507176
Doc 707390, score is 11.2631445
Doc 304930, score is 11.2631445
Doc 599248, score is 10.834952
Doc 1351926, score is 10.58208
Doc 1507844, score is 10.565641
Doc 466206, score is 10.499098
Doc 1228731, score is 10.335282
Doc 22621, score is 10.2893915
Doc 982586, score is 10.2893915
Doc 91774, score is 10.2893915
Doc 1179215, score is 10.211017
Doc 1008040, score is 10.170642
Doc 681290, score is 10.109232
Doc 891479, score is 9.971743
arw57730
Total documents: 10000
Doc 1174018, score is 17.245808
Doc 336222, score is 17.23945
Doc 288198, score is 16.525717
Doc 1489835, score is 16.346264
Doc 1138109, score is 16.075714
Doc 1060671, score is 15.757256
Doc 1138110, score is 15.6366825
Doc 898099, score is 15.520697
Doc 1113800, score is 15.520697
Doc 1059714, score is 15.

arw58008
Total documents: 10000
Doc 106734, score is 50.02405
Doc 1218649, score is 49.218796
Doc 1275021, score is 49.027893
Doc 399971, score is 49.012314
Doc 608213, score is 47.69914
Doc 636588, score is 47.055447
Doc 71886, score is 44.96726
Doc 698433, score is 44.280148
Doc 106019, score is 43.675537
Doc 1043267, score is 43.012436
Doc 1040807, score is 42.084705
Doc 1227273, score is 41.90063
Doc 841626, score is 41.11454
Doc 1261746, score is 41.06623
Doc 441721, score is 40.335087
Doc 839993, score is 40.138596
Doc 1401139, score is 40.023357
Doc 1466732, score is 39.94716
Doc 599482, score is 39.898407
Doc 713313, score is 39.697487
arw58015
Total documents: 8109
Doc 1510576, score is 15.233423
Doc 1043401, score is 14.550827
Doc 1261909, score is 14.284216
Doc 988682, score is 13.822187
Doc 109418, score is 13.503086
Doc 395807, score is 13.443358
Doc 845077, score is 13.443358
Doc 833277, score is 13.32873
Doc 833395, score is 13.283126
Doc 1480707, score is 13.283126
Doc 

arw58392
Total documents: 10000
Doc 842188, score is 16.175564
Doc 636641, score is 16.175564
Doc 1015717, score is 15.46916
Doc 556287, score is 15.44821
Doc 15574, score is 15.324477
Doc 494193, score is 15.2632885
Doc 933653, score is 15.254921
Doc 496012, score is 15.088171
Doc 301744, score is 15.067539
Doc 1147735, score is 14.893277
Doc 652888, score is 14.801023
Doc 40309, score is 14.567307
Doc 636921, score is 14.34304
Doc 439543, score is 14.34304
Doc 1280088, score is 14.2866
Doc 288535, score is 14.127954
Doc 986645, score is 14.058027
Doc 512814, score is 13.908368
Doc 999239, score is 13.805296
Doc 639955, score is 13.756645
arw58403
Total documents: 10000
Doc 555898, score is 16.700348
Doc 28674, score is 16.700348
Doc 1191441, score is 16.22716
Doc 37808, score is 15.7009945
Doc 608433, score is 15.668861
Doc 629042, score is 15.558431
Doc 1296191, score is 15.546085
Doc 1084744, score is 15.473769
Doc 1181581, score is 15.412655
Doc 1206719, score is 15.391674
Doc 982

arw58766
Total documents: 10000
Doc 833525, score is 15.141186
Doc 350611, score is 12.648222
Doc 350286, score is 12.648222
Doc 1500235, score is 12.354792
Doc 549746, score is 12.081139
Doc 1501284, score is 11.924647
Doc 1489452, score is 11.924647
Doc 386749, score is 11.849148
Doc 1500346, score is 11.663614
Doc 1307895, score is 11.609453
Doc 1472176, score is 11.609453
Doc 428326, score is 11.609453
Doc 29359, score is 11.600216
Doc 1218054, score is 11.478485
Doc 1307111, score is 11.399263
Doc 1186164, score is 11.380814
Doc 1307839, score is 11.369194
Doc 1472050, score is 11.282434
Doc 1287826, score is 11.21966
Doc 665602, score is 11.152847
arw58809
Total documents: 10000
Doc 269524, score is 15.53888
Doc 1455933, score is 14.422829
Doc 860339, score is 14.175802
Doc 417770, score is 14.122268
Doc 1284785, score is 14.122268
Doc 249810, score is 14.066691
Doc 418973, score is 14.066691
Doc 694658, score is 13.93699
Doc 676271, score is 13.519023
Doc 773859, score is 13.519

arw59134
Total documents: 2098
Doc 1118630, score is 20.464365
Doc 1495348, score is 20.4038
Doc 365768, score is 18.87925
Doc 852863, score is 18.025393
Doc 1264509, score is 18.021502
Doc 1495347, score is 16.646132
Doc 1115155, score is 16.099173
Doc 371132, score is 14.649252
Doc 1180875, score is 14.649252
Doc 1482378, score is 14.003223
Doc 294087, score is 13.989494
Doc 505232, score is 13.885551
Doc 46091, score is 13.769841
Doc 31534, score is 13.697264
Doc 1118627, score is 13.587566
Doc 1309070, score is 13.434001
Doc 943816, score is 13.046581
Doc 1512569, score is 12.633415
Doc 501967, score is 12.18092
Doc 55478, score is 12.18092
arw59172
Total documents: 2797
Doc 369692, score is 9.168687
Doc 277937, score is 8.970928
Doc 863925, score is 8.937612
Doc 83248, score is 8.84964
Doc 1382640, score is 8.800354
Doc 59200, score is 8.768355
Doc 1020646, score is 8.731795
Doc 1290989, score is 8.724941
Doc 1060014, score is 8.705168
Doc 602377, score is 8.682893
Doc 1456605, sc

In [54]:
lemma_res

r = 0.2535468230071016
p = 0.3577696526508227
r_precision = 0.21191923008395347
MAP = 0.38558780597371284

In [58]:
find_diff_metrics(quality_checker, lemma_quality_checker)

[('arw58304', 0.884148104862114, r = 0.05555555555555555
  p = 0.05
  r_precision = 0.05555555555555555
  MAP = 0.06322031619051743, r = 1.0
  p = 0.9
  r_precision = 1.0
  MAP = 0.9473684210526315),
 ('arw52186', 0.734162416745621, r = 0.02631578947368421
  p = 0.05
  r_precision = 0.02631578947368421
  MAP = 0.0359385701587714, r = 0.39473684210526316
  p = 0.75
  r_precision = 0.39473684210526316
  MAP = 0.7701009869043924),
 ('arw53079', 0.7096398559730602, r = 0.0625
  p = 0.2
  r_precision = 0.0625
  MAP = 0.20178999492079988, r = 0.265625
  p = 0.85
  r_precision = 0.265625
  MAP = 0.9114298508938601),
 ('arw55168', 0.6842088800002887, r = 0.08333333333333333
  p = 0.1
  r_precision = 0.08333333333333333
  MAP = 0.04060674404714652, r = 0.375
  p = 0.45
  r_precision = 0.375
  MAP = 0.7248156240474352),
 ('arw56114', 0.6739408044226155, r = 0.0
  p = 0.0
  r_precision = 0.0
  MAP = 0.0, r = 0.375
  p = 0.45
  r_precision = 0.375
  MAP = 0.6739408044226155),
 ('arw53029', 0.65659

In [None]:
id_to_pagerank = {}
with open('res/pagerank.txt','r') as f:
    for line in f:
        docId, docURL, rank = line.split()
        id_to_pagerank[int(docId)] = float(rank)

In [None]:
len(id_to_pagerank)


In [None]:
 for doc_name in tqdm(os.listdir("data/json_filtered_tokens_texts")):
        with open(f"data/json_filtered_tokens_texts/{doc_name}", "r+", encoding="utf-8") as inf:
            doc_id = int(''.join(list(filter(str.isdigit, doc_name))))
            doc = json.load(inf)
            try:
                doc["pagerank"] = id_to_pagerank.get(doc_id)
            except:
                pass
            inf.seek(0)        # <--- should reset file position to the beginning.
            json.dump(doc, inf, indent=4, ensure_ascii=False)
            inf.truncate()

In [None]:
settings_with_pagerank = {
    "mappings": {
        "properties": {
            "text": {
                "type": "text"
            },
            "pagerank": {
                "type": "rank_feature"
            }
        }
    }
}

In [None]:
pr_index = Index("pagerank_index", settings_with_pagerank)

In [None]:
pr_index.add_documents("data/json_filtered_tokens_texts")

In [None]:
pr_quality_checker = SearchQualityChecker(queries_lemmas, pr_index)
pr_res = pr_quality_checker.get_results(pagerank_query)

In [None]:
pr_res

In [None]:
pr_index.get_doc_by_id(1000039)

In [None]:
settings_titles = {
        'mappings': {
            'properties': {
                'content': {
                    'type': 'text',
                },
                'title': {
                    'type': 'text'
                }
            }
            
        }
    }