In [6]:
from datetime import datetime, timedelta

# Timing

## Estimation

In [45]:
# Progress read off the running job; edit these three values before re-running the cell.
current_iter = 228

max_iter = 354
start_time = datetime.strptime("Oct 19 2022, 14:18:54", "%b %d %Y, %H:%M:%S")

# The extrapolation divides by the number of finished iterations, so it is
# undefined at zero, and an iteration count past max_iter would yield a
# negative remaining time.
if not 0 < current_iter <= max_iter:
    raise ValueError(
        f"current_iter must be in 1..{max_iter} to extrapolate, got {current_iter}"
    )

cur_time = datetime.now()
time_since_start = cur_time - start_time

# Average throughput so far, assumed to stay constant for the rest of the run.
iter_per_sec = current_iter/time_since_start.total_seconds()
seconds_total = max_iter/iter_per_sec
time_total = timedelta(seconds=seconds_total)
seconds_left = (max_iter-current_iter)/iter_per_sec
time_left = timedelta(seconds=seconds_left)

print( f"current progress: {current_iter/max_iter*100}%")
# This line reports the whole run, not what remains, so it carries no "left" suffix.
print( f"expected total duration: {time_total.days} days {time_total.seconds//3600} hours {(time_total.seconds//60)%60} minutes {time_total.seconds%60} seconds" )
print( f"expected duration: {time_left.days} days {time_left.seconds//3600} hours {(time_left.seconds//60)%60} minutes {time_left.seconds%60} seconds left" )
# Projected wall-clock finish time.
print( (cur_time + time_left).strftime("%b %d %Y, %H:%M:%S") )

current progress: 64.40677966101694%
expected total duration: 0 days 4 hours 42 minutes 50 seconds left
expected duration: 0 days 1 hours 40 minutes 40 seconds left
Oct 19 2022, 19:01:44


## Duration

In [39]:
def _format_duration(delta):
    """Render a timedelta as ``D days H hours M minutes S seconds``.

    A negative span is shown as its magnitude with a leading minus sign
    instead of timedelta's normalised form (``-1 days`` plus a positive
    remainder), which is easy to misread.
    """
    sign = "-" if delta < timedelta(0) else ""
    magnitude = abs(delta)
    minutes, seconds = divmod(magnitude.seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{sign}{magnitude.days} days {hours} hours {minutes} minutes {seconds} seconds"


def print_time_metrics(max_iter, start_ts, end_ts, ts_format="%b %d %Y, %H:%M:%S", resume_iter=0):
    """Print timing statistics for a finished run.

    Three lines are printed: the average seconds per iteration, the measured
    wall-clock duration, and the duration extrapolated to all ``max_iter``
    iterations (this differs from the measured one only for resumed runs).

    Args:
        max_iter: Total number of iterations in a complete run.
        start_ts: Timestamp at which the measured run started.
        end_ts: Timestamp at which the measured run finished.
        ts_format: ``strptime`` format shared by ``start_ts`` and ``end_ts``.
        resume_iter: Iteration the measured run resumed from; only
            ``max_iter - resume_iter`` iterations fall inside the measured span.

    Raises:
        ValueError: If ``max_iter`` is not greater than ``resume_iter``, or if
            a timestamp does not match ``ts_format``.
    """
    measured_iters = max_iter - resume_iter
    if measured_iters <= 0:
        raise ValueError(
            f"max_iter ({max_iter}) must be greater than resume_iter ({resume_iter})"
        )

    start_time = datetime.strptime(start_ts, ts_format)
    finish_time = datetime.strptime(end_ts, ts_format)
    duration = finish_time - start_time

    # Only the iterations after the resume point were timed.
    time_per_chunk = duration.total_seconds() / measured_iters
    print(f"{time_per_chunk} seconds per iteration")
    print(f"total taken duration: {_format_duration(duration)}")

    # Scale the per-iteration cost back up to a full, non-resumed run.
    full_duration = timedelta(seconds=time_per_chunk * max_iter)
    print(f"expected total duration: {_format_duration(full_duration)}")

## Local

### Train

In [32]:
# Local training run: 80,000 iterations, not resumed.
print_time_metrics(
    max_iter=80000,
    start_ts="Oct 17 2022, 19:26:21",
    end_ts="Oct 18 2022, 05:43:48",
    resume_iter=0,
)

0.4630875 seconds per iteration
total taken duration: 0 days 10 hours 17 minutes 27 seconds left
expected total duration: 0 days 10 hours 17 minutes 27 seconds left


In [33]:
# Local training run: 500,000 iterations, not resumed.
print_time_metrics(
    max_iter=500000,
    start_ts="Sep 26 2022, 19:55:00",
    end_ts="Sep 29 2022, 19:05:00",
    resume_iter=0,
)

0.5124 seconds per iteration
total taken duration: 2 days 23 hours 10 minutes 0 seconds left
expected total duration: 2 days 23 hours 10 minutes 0 seconds left


### Index

In [24]:
# Local indexing run, resumed from iteration 1 of 354.
# The timestamps carry an explicit year so the default format applies: parsing
# a day-of-month without a year silently falls back to 1900 and is deprecated
# from Python 3.13 on.
# NOTE(review): the year 2022 is inferred from the neighbouring runs — confirm.
print_time_metrics(
    resume_iter = 1,
    max_iter = 354,
    start_ts = "Oct 18 2022, 11:39:40",
    end_ts = "Oct 18 2022, 15:56:10",
)

43.59773371104816 seconds per chunk
total taken duration: 0 days 4 hours 16 minutes 30 seconds left
expected total duration: 0 days 4 hours 17 minutes 13 seconds left


In [42]:
# Local indexing run: all 354 iterations, not resumed.
print_time_metrics(
    max_iter=354,
    start_ts="Oct 18 2022, 22:42:31",
    end_ts="Oct 19 2022, 02:48:09",
    resume_iter=0,
)

41.632768361581924 seconds per iteration
total taken duration: 0 days 4 hours 5 minutes 38 seconds left
expected total duration: 0 days 4 hours 5 minutes 38 seconds left


In [42]:
# Local indexing run: all 354 iterations, not resumed.
# NOTE(review): end_ts (02:48:09) is earlier than start_ts (08:17:55) on the
# same day and is identical to the end_ts of the preceding indexing cell, so it
# looks copy-pasted. The recorded output under this cell also repeats the
# preceding cell's numbers, i.e. it was not produced by these arguments.
# TODO: fill in the real finish time of this run and re-execute.
print_time_metrics(
    resume_iter = 0, 
    max_iter = 354,
    start_ts = "Oct 19 2022, 08:17:55",
    end_ts = "Oct 19 2022, 02:48:09",
)

41.632768361581924 seconds per iteration
total taken duration: 0 days 4 hours 5 minutes 38 seconds left
expected total duration: 0 days 4 hours 5 minutes 38 seconds left


### Retrieval

In [27]:
# Local retrieval run over 101,093 iterations (resume_iter left at its default of 0).
print_time_metrics(
    max_iter=101093,
    start_ts="Oct 19 2022, 06:32:36",
    end_ts="Oct 19 2022, 07:26:26",
)

0.03195077799649827 seconds per iteration
total taken duration: 0 days 0 hours 53 minutes 50 seconds left
expected total duration: 0 days 0 hours 53 minutes 50 seconds left


In [40]:
# Earlier local retrieval run over the same 101,093 iterations.
print_time_metrics(
    max_iter=101093,
    start_ts="Oct 18 2022, 18:48:51",
    end_ts="Oct 18 2022, 19:41:02",
)

0.030971481704964735 seconds per iteration
total taken duration: 0 days 0 hours 52 minutes 11 seconds left
expected total duration: 0 days 0 hours 52 minutes 11 seconds left


## EC2

### Index

In [28]:
# EC2 indexing run, resumed from iteration 207 of 354; only the remaining
# iterations fall inside the measured time span.
print_time_metrics(
    max_iter=354,
    start_ts="Oct 08 2022, 02:06:22",
    end_ts="Oct 08 2022, 07:14:33",
    resume_iter=207,
)

125.78911564625851 seconds per chunk
total taken duration: 0 days 5 hours 8 minutes 11 seconds left
expected total duration: 0 days 12 hours 22 minutes 9 seconds left


### Retrieval

In [41]:
# EC2 retrieval run over 101,093 iterations, not resumed.
print_time_metrics(
    max_iter=101093,
    start_ts="Oct 08 2022, 07:33:26",
    end_ts="Oct 08 2022, 08:15:08",
    resume_iter=0,
)

0.024749488095120335 seconds per iteration
total taken duration: 0 days 0 hours 41 minutes 42 seconds left
expected total duration: 0 days 0 hours 41 minutes 42 seconds left


# Remove Unused Dev Queries

In [6]:
from collections import defaultdict
from tqdm.auto import tqdm

In [7]:
# Per-query containers. Only qid2positives is filled in this cell; the others
# are created empty here.
qid2positives = defaultdict(list)
qid2ranking = defaultdict(list)
qid2mrr = {}
# One empty mapping per recall cutoff; the deeper cutoffs 5000 and 10000 are disabled.
qid2recall = {depth: {} for depth in (50, 100, 200, 1000)}

# Each qrels row holds four whitespace-separated integers; the second is ignored.
with open(r'data/qrels.dev.tsv') as qrels_file:
    for row in tqdm(qrels_file):
        fields = [int(token) for token in row.strip().split()]
        qid, _, pid, label = fields
        # Every judgement in the file is expected to be a positive one.
        assert label == 1

        qid2positives[qid].append(pid)

# Number of distinct queries that have at least one positive passage.
len(qid2positives)

0it [00:00, ?it/s]

55578

In [9]:
# Load the dev queries into a qid -> query text mapping.
queries = {}
with open(r"data/queries.dev.tsv", encoding='utf-8') as queries_file:
    for row in queries_file:
        # Tab-separated: query id, query text; any further columns are ignored.
        qid_text, query, *_ = row.strip().split('\t')
        qid = int(qid_text)

        # A query id must not occur twice in the file.
        assert (qid not in queries), ("Query QID", qid, "is repeated!")
        queries[qid] = query

# Total number of dev queries read.
len(queries)

101093

In [12]:
# Keep only the queries whose id appears as a key in qid2positives.
queries_cleaned = {
    qid: text
    for qid, text in queries.items()
    if qid in qid2positives
}
len(queries_cleaned)

55578

In [15]:
# Write the filtered queries back out, one "<qid>\t<query>" line per entry.
with open(r"data/queries.dev_clean.tsv", 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        f'{qid}\t{text}\n' for qid, text in queries_cleaned.items()
    )