# evaluate_model.py
"""
Extract sessions, split into training and test sets, then train
a dynamic bayesian network to predict what users click on last.
We assume that the user continued clicking on things until they
found what they were looking for.
"""
import pandas as pd
import numpy as np
import sys
import os
import logging

from database import setup_database, get_searches, get_content_items, get_clicked_urls, get_skipped_urls
from sklearn.model_selection import train_test_split
from checks import SeriesProperties, DataFrameChecker
from uncertainty import product_relative_error, ratio_relative_error, sum_error
from clean_data_from_bigquery import normalise_search_terms
from estimate_relevance import SimplifiedDBNModel

from pyclick.click_models import SDBN

logging.basicConfig(filename='estimate_relevance.log', level=logging.INFO)

class PyClickModelAdapter:
    """
    Takes a pyclick model and wraps it in a class that can calculate
    relevance estimates from the model's params.
    """
    @staticmethod
    def from_json(json_file):
        model = SDBN()
        with open(json_file) as f:
            json_str = f.read()
        model.from_json(json_str)
        return PyClickModelAdapter(model)

    def __init__(self, model):
        self.model = model

    def relevance(self, query):
        attr_param = self.model.param_names.attr
        documents = self.model.params[attr_param]._container[query].keys()

        # Filter out any document that has been examined less than 10 times
        # I could do this earlier on, but this might mess up some of the sessions and test data
        documents = [d for d in documents if self.model.params[attr_param]._container[query][d]._denominator >= 10]

        return self.predict_relevance(query, documents)

    def predict_relevance(self, query, documents):
        return pd.Series(
            (self.model.predict_relevance(query, document) for document in documents),
            index=documents
        ).sort_values(ascending=False)

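# Illustrative usage (the file path and query string below are placeholders):
#
#   adapter = PyClickModelAdapter.from_json('data/some_sdbn_model.json')
#   adapter.relevance('self assessment')
#   # -> pandas Series indexed by document, holding the model's relevance
#   #    estimates sorted highest first, skipping rarely-examined documents
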
class QueryDocumentRanker:
    """
    Generates new rankings for queries based on a model that estimates relevance
    of each document. All rankings are cached in memory.
    """
    def __init__(self, trained_model):
        self.model = trained_model
        self.query_rankings = {}

    def rank(self, query):
        """
        Rank all results for a query by relevance
        """
        try:
            return self.query_rankings[query]
        except KeyError:
            ranking = self.model.relevance(query).rank(method='min', ascending=False)
            self.query_rankings[query] = ranking
            return ranking

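# A small worked example of what rank() produces (hypothetical numbers):
# relevance estimates of {'/a': 0.9, '/b': 0.4, '/c': 0.4} become ranks
# {'/a': 1.0, '/b': 2.0, '/c': 2.0}, because method='min' gives tied
# documents the best (smallest) rank in their tie group.
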
class ModelTester:
    def __init__(self, ranker):
        self.ranker = ranker

    def evaluate(self, test_set):
        """
        Evaluate how well our model describes the test data.

        This calculates two metrics:
        - The number of saved clicks (known-bad results that are avoided in the new ranking)
        - The change in rank of the user's preferred document (showing it higher reduces
          the number of URLs the user has to examine to find it)
        """
        return self._evaluate(test_set)

    def _evaluate(self, test_set):
        # TODO: make sure the training set contains the same queries as the test set(!)
        test_set['saved_clicks'] = test_set.apply(self.count_saved_clicks, axis=1)
        test_set['change_in_rank'] = test_set.apply(self.change_in_rank_of_preferred_document, axis=1)
        return test_set

    def count_saved_clicks(self, test_row):
        """
        Count the known-bad results that would be avoided if the model's preferred
        ranking were used (because those documents are now ranked below the final
        clicked one).

        In this model, every click except the last one is assumed to be bad.
        If a user clicked results 1, 3 and 4, and the model's new order is 1, 4, 3,
        then the number of saved clicks is 1, because the user would not have needed
        to click document 3.

        This doesn't go negative even if the model ranks the preferred document at
        the bottom, because we assume the user keeps scanning through the results
        until they find a document they are satisfied with.
        """
        query = test_row.search_term_lowercase
        new_ranking = self.ranker.rank(query)

        final_click_url = test_row.final_click_url
        try:
            final_click_new_ranking = new_ranking[final_click_url]
        except KeyError:
            #print(f"User clicked on something that wasn't in the training set (query={query}, doc={test_row.final_click_url})")
            return 0

        rubbish_urls = [url for url in test_row.clicked_urls if url != final_click_url]

        saved_clicks_count = 0
        for url in rubbish_urls:
            try:
                rank = new_ranking[url]
            except KeyError:
                #print(f"User clicked on something that wasn't in the training set (query={query}, doc={url})")
                continue

            if rank > final_click_new_ranking:
                saved_clicks_count += 1

        return saved_clicks_count

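    # Worked example of count_saved_clicks, following the docstring's scenario
    # (hypothetical documents identified by their old ranks): the user clicked
    # the results at ranks 1, 3 and 4, so the final click (the old rank-4 doc)
    # is the preferred document and the rank-1 and rank-3 docs are rubbish
    # clicks. Under the new order 1, 4, 3, the preferred doc sits at rank 2 and
    # only the old rank-3 doc (now rank 3) falls below it, so saved_clicks == 1.
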
    def change_in_rank_of_preferred_document(self, test_row):
        """
        Work out how far the preferred document has moved up or down.

        A positive value indicates that the doc is closer to the top
        (so the user is assumed to examine fewer results before finding it).
        A negative value indicates that the doc is further down the page.

        But:
        - More of the test set have seen the results closer to the top
        - Fewer of the test set have seen the results closer to the bottom
        - The distribution of final clicks is biased by this
        """
        query = test_row.search_term_lowercase
        new_ranking = self.ranker.rank(query)

        old_rank = test_row.final_click_rank
        try:
            new_rank = new_ranking[test_row.final_click_url]
        except KeyError:
            #print(f"User clicked on something that wasn't in the training set (query={query}, doc={test_row.final_click_url})")
            # Since the doc didn't appear in the training set, we are not
            # really saying anything about its new rank. So just ignore it.
            return 0

        return (old_rank - new_rank)

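# Hedged usage sketch ('test' here stands for a hypothetical DataFrame with one
# row per search session, carrying at least the columns this class reads:
# search_term_lowercase, clicked_urls, final_click_url and final_click_rank):
#
#   tester = ModelTester(QueryDocumentRanker(my_model))
#   evaluated = tester.evaluate(test)
#   # evaluated gains 'saved_clicks' and 'change_in_rank' columns; for example,
#   # a change_in_rank of +3 means the preferred document moved from rank 5 to
#   # rank 2 under the model's ordering.
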
if __name__ == '__main__':
    conn = setup_database()
    content_items = get_content_items(conn)

    pyclick_model = PyClickModelAdapter.from_json('data/june10/sdbn_model2.json')

    # NOTE: this comes from an earlier version of the db because I broke the code
    # by changing the schema
    my_model = SimplifiedDBNModel.from_csv('data/june10/my-sdbn_model.csv')

    pyclick_ranker = QueryDocumentRanker(pyclick_model)
    my_model_ranker = QueryDocumentRanker(my_model)

    queries = my_model.document_params.index.get_level_values(0).unique()
    print(f'loaded {len(queries)} queries')

    for query in queries:
        pyclick_rank = pyclick_ranker.rank(query)
        my_rank = my_model_ranker.rank(query)

        if len(pyclick_rank) == 0:
            continue

        comparison = []
        for content_item, rank in my_rank.iteritems():
            try:
                title = content_items.loc[content_item].title
            except Exception:
                # Titles come from current results, so they may not be there
                # for results lower down
                title = content_item

            print(title)

            try:
                rank2 = pyclick_rank[content_item]
                examined = pyclick_model.model.params[pyclick_model.model.param_names.attr].get(query, content_item)._denominator
            except Exception:
                rank2 = '?'
                examined = '?'

            print(f'Mine: {rank}; Pyclick: {rank2}')
            print(f'Examined: {examined}')

        df1 = my_rank.to_frame()
        df2 = pyclick_rank.to_frame()

        results = df1.join(df2, lsuffix='mine', rsuffix='pyclick', how='outer').join(content_items)
        results.to_csv(f'data/june10/queries/{query}.csv', index=False)

        # Drop into the debugger so each query's comparison can be inspected interactively
        import pdb; pdb.set_trace()

    # # How does the new ranker do against the saved-effort metrics?
    # tester = ModelTester(ranker)
    # evaluation = tester.evaluate(test)
    # print(f'Median change in rank: {evaluation.change_in_rank.median()}')
    # print(f'Median saved clicks: {evaluation.saved_clicks.median()}')
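
    # The commented-out block above refers to 'ranker' and 'test', which are not
    # defined in this script. A rough sketch of how they might be produced,
    # assuming get_searches(conn) returns a DataFrame of sessions with the
    # columns ModelTester reads (this is an assumption, not something this
    # script guarantees):
    #
    #   searches = get_searches(conn)
    #   train, test = train_test_split(searches, test_size=0.25)
    #   ranker = my_model_ranker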