In [3]:
from whoosh.index import create_in
from whoosh.fields import *

from tqdm import tqdm_notebook as tqdm

In [6]:
# Build a fresh Whoosh index in "indexdir".
# `id` is a stored numeric field (so it can be read back from search hits);
# `content` is indexed as text.
schema = Schema(id=NUMERIC(stored=True), content=TEXT)
ix = create_in("indexdir", schema)

# Each non-blank line of the corpus file is parsed as "<integer id> <document text>".
# The writer is opened *after* the file and used as a context manager: it
# commits when the block finishes and cancels (releasing the index write lock)
# if anything raises, so a failed run does not leave the index locked.
with open('pri_cfc.txt', 'r') as f, ix.writer() as writer:
    for l in tqdm(f):
        if not l.strip():
            # Blank lines carry no document and would break the unpacking below.
            continue
        idx, doc = l.split(' ', 1)
        writer.add_document(id=int(idx), content=doc)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
from whoosh.index import open_dir
from whoosh.qparser import *
# Rebind `ix` to the index opened from the "indexdir" directory on disk.
ix = open_dir("indexdir")

In [37]:
def query(q, limit=100):
    """Return the ids of the documents matching the free-text query ``q``.

    The query string is parsed against the ``content`` field of the
    module-level index ``ix`` using ``OrGroup``, and the stored ``id`` of each
    hit is collected in the order the searcher returns the hits.

    Args:
        q: Query string handed to the Whoosh ``QueryParser``.
        limit: Maximum number of hits requested from the searcher
            (defaults to 100, the value previously hard-coded).

    Returns:
        A list with the stored ``id`` value of every returned hit.
    """
    parser = QueryParser(fieldname="content", schema=ix.schema, group=OrGroup)
    parsed = parser.parse(q)
    with ix.searcher() as searcher:
        hits = searcher.search(parsed, limit=limit)
        # Read the `id` field by name rather than flattening all stored
        # values, so adding another stored field to the schema cannot leak
        # non-id values into the result.
        return [hit["id"] for hit in hits]

print(query('second document'))

[387, 227, 140, 1130, 494, 817, 872, 590, 1087, 507, 768, 717, 1035, 225, 470, 762, 452, 822, 990, 1160, 180, 842, 349, 58, 303, 437, 597, 744, 242, 272, 627, 421, 719, 1110, 519, 1067, 1099, 315, 424, 137, 643, 1038, 1206, 258]


In [42]:
def stats(predictions, labels):
    """Compute set-based precision, recall and F1 for one query.

    Both inputs are converted to sets, so duplicates and ordering are ignored.

    Args:
        predictions: Iterable of predicted (retrieved) item ids.
        labels: Iterable of relevant (ground-truth) item ids.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. A ratio whose
        denominator would be zero (no predictions, or no labels) is reported
        as 0.0 instead of raising ``ZeroDivisionError``; F1 is 0.0 when
        precision and recall are both zero.
    """
    predictions = set(predictions)
    labels = set(labels)
    both = predictions & labels

    # A query that retrieves nothing (or has no relevant documents) must not
    # abort the whole evaluation run.
    precision = len(both) / len(predictions) if predictions else 0.0
    recall = len(both) / len(labels) if labels else 0.0

    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1

stats([1, 2, 3, 4], [2])

(0.25, 1.0, 0.4)

In [46]:
import numpy as np

# Per-query scores, averaged after the loop.
precisions, recalls, f1s = [], [], []

# The queries file is read two lines at a time: a query string, followed by a
# line of whitespace-separated integer ids used as the labels for that query.
with open('pri_queries.txt', 'r') as f:
    # zip(f, f) draws two consecutive lines per iteration and stops cleanly at
    # the end of the file; a trailing unpaired line is ignored instead of
    # raising StopIteration from a bare next(f).
    for q, label_line in zip(f, f):
        q = q.strip()
        label = [int(val) for val in label_line.split()]

        pred = query(q)
        precision, recall, f1 = stats(predictions=pred, labels=label)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        print(f'Precision: {precision:.2f}\t Recall: {recall:.2f}\t F1: {f1:.2f}')

# Mean precision, recall and F1 over all queries, in that order.
print(np.mean(precisions), np.mean(recalls), np.mean(f1s))

Precision: 0.18	 Recall: 0.53	 F1: 0.27
Precision: 0.03	 Recall: 0.43	 F1: 0.06
Precision: 0.11	 Recall: 0.26	 F1: 0.15
Precision: 0.04	 Recall: 0.44	 F1: 0.07
Precision: 0.35	 Recall: 0.27	 F1: 0.30
Precision: 0.12	 Recall: 0.50	 F1: 0.19
Precision: 0.06	 Recall: 0.21	 F1: 0.09
Precision: 0.03	 Recall: 0.14	 F1: 0.05
Precision: 0.07	 Recall: 0.70	 F1: 0.13
Precision: 0.13	 Recall: 0.52	 F1: 0.21
Precision: 0.16	 Recall: 0.73	 F1: 0.26
Precision: 0.04	 Recall: 0.57	 F1: 0.07
Precision: 0.09	 Recall: 0.38	 F1: 0.15
Precision: 0.17	 Recall: 0.31	 F1: 0.22
Precision: 0.25	 Recall: 0.24	 F1: 0.25
Precision: 0.36	 Recall: 0.20	 F1: 0.26
Precision: 0.17	 Recall: 0.31	 F1: 0.22
Precision: 0.16	 Recall: 0.76	 F1: 0.26
Precision: 0.08	 Recall: 0.36	 F1: 0.13
Precision: 0.26	 Recall: 0.57	 F1: 0.36
Precision: 0.07	 Recall: 0.28	 F1: 0.11
Precision: 0.17	 Recall: 0.24	 F1: 0.20
Precision: 0.07	 Recall: 0.20	 F1: 0.10
Precision: 0.08	 Recall: 0.26	 F1: 0.12
Precision: 0.14	 Recall: 0.27	 F1: 0.19


In [48]:
# Two parallel lists assigning a 0/1 label to the same ten items (position i in
# both lists refers to item i).
# NOTE(review): presumably D is the reference assignment and C the clustering
# being evaluated — confirm against the exercise statement.
D = [0, 0, 1, 1, 0, 0, 1, 1, 0, 1]
C = [0, 1, 1, 1, 0, 0, 0, 1, 1, 1]

In [51]:
# Pair-counting agreement between the two labelings D and C (Rand index):
# every unordered pair of items (i, j) with i < j is classified by whether the
# two items share a label in C and whether they share a label in D.
#   same in C, same in D      -> tp
#   same in C, different in D -> fp
#   different in C, same in D -> fn
#   different in C and in D   -> tn
assert len(D) == len(C), "D and C must label the same items"

tp, tn, fp, fn = 0, 0, 0, 0
for i, d1 in enumerate(D):
    # Start at i + 1 so each unordered pair is visited exactly once.
    for j in range(i + 1, len(D)):
        d2 = D[j]

        # Use the module-level list `C`; the lowercase `c` used previously is
        # not defined anywhere in the notebook and fails on a fresh kernel.
        if C[i] == C[j]:
            if d1 == d2:
                tp += 1
            else:
                fp += 1
        else:
            if d1 == d2:
                fn += 1
            else:
                tn += 1

# Fraction of pairs on which the two labelings agree.
print((tp + tn) / (tp + tn + fp + fn))

0.5333333333333333
