In [None]:
from tqdm import tqdm

from typing import Dict, List, Tuple
ArticleName = str
Text = str
Term = str
CollectionData = None
RankingParams = None

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install fake-useragent

Collecting fake-useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
Building wheels for collected packages: fake-useragent
  Building wheel for fake-useragent (setup.py) ... [?25l[?25hdone
  Created wheel for fake-useragent: filename=fake_useragent-0.1.11-py3-none-any.whl size=13502 sha256=280372894d4cb5541c7b98f29d4fe2f2c36e04e8f9b01c014642599c567ac5e7
  Stored in directory: /root/.cache/pip/wheels/ed/f7/62/50ab6c9a0b5567267ab76a9daa9d06315704209b2c5d032031
Successfully built fake-useragent
Installing collected packages: fake-useragent
Successfully installed fake-useragent-0.1.11


In [None]:
import gzip
import os
import pandas as pd
import pickle
import re
import requests

from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent
from hashlib import md5
from tqdm.notebook import tqdm

### 1. Поиск в BOW модели (1 балл)
#### 1.1 Датасет
В качестве документов для поиска, требуется использовать статьи википедии, скачанные в одном из предыдущих домашних заданий.
Реализуйте получение содержимого документа по названию статьи в соответствии с выбранной вами схемы хранения документов. Также, преобразуйте содержимое в текст (не html) любым разумным способом. Постарайтесь при этом вырезать заведомо бесполезные для поиска данные со страницы.

In [None]:
directory = "/content/drive/MyDrive/wiki_"
pref = "https://simple.wikipedia.org/"

In [None]:
def hmd5(s):
    return md5(s.encode("utf-8")).hexdigest().lower()

In [None]:
def make_dirs(path):
    os.makedirs(os.path.dirname(path), exist_ok=True)

In [None]:
def normalize(parsed):
    parts = []
    for p in parsed.find_all("p"):
        parts.append(re.sub("\[\d*\]|\s+", ' ',p.text).strip())
    lis = set()
    for p in parsed.find_all("span", {"class", "mw-headline"}):
        ul = p.find_next("ul")
        if ul:
            for li in ul.find_all("li"):
                res = li.find("a", {"class":False})
                if res and res.text not in lis and "wiki" not in res.text and res.get("title") and "Special" not in res.get("title"):
                    parts.append(res.text)
                    lis.add(res.text)
    text = " ".join(parts)
    return text

In [None]:
def get_article_text(article_name: ArticleName) -> Text:
    article_hash = hmd5(article_name)
    dump_path = directory + "/" + article_hash[:2] + "/" + article_hash[2:4] + "/" + article_hash[4:]
    if not os.path.exists(dump_path):
        url = pref + "wiki/" + article
        response = requests.get(url, headers={'User-Agent': agent})
        if response.status_code != 404:
            while response.status_code != 200:
                time.sleep(0.1)
                response = requests.get(url, headers={'User-Agent': agent})
                
            make_dirs(dump_path)
            with gzip.open(dump_path, "wb") as f:
                f.write(response.text.encode("utf-8"))
            parsed = bs(response.text, 'html.parser')
    else:
        with gzip.open(dump_path, "rb") as f:
            parsed = bs(f.read().decode("utf-8"), "html.parser")
    return normalize(parsed)
    
for article_name in tqdm(["Software_Development_Kit", "Gangrene", "COVID-19_pandemic_in_Belarus", "Guitar_Hero:_Aerosmith"]):
    print(f'Article: {article_name}\n{get_article_text(article_name)}\n\n')

  0%|          | 0/4 [00:00<?, ?it/s]

Article: Software_Development_Kit
A software development kit (SDK or "devkit") is usually a set of development tools that allows a software developer to create applications for a certain software package, software framework, hardware platform, computer system, video game console, operating system, or similar platform. SDKs vary greatly between a simple application programming interface to hardware used to simulate a system.


Article: Gangrene
Gangrene is a serious medical condition that causes the decay and death of body tissue, usually in the extremities such as the fingers, hands, toes, and feet. The two main types of gangrene are dry gangrene and wet gangrene. A third less common type is a form of wet gangrene known as gas gangrene. A very rare type which affects the internal organs is known as internal gangrene. Dry gangrene is usually caused by a loss of blood supply to the affected area, such as may happen following an injury which damages the blood vessels to the affected area.

Чтобы процесс занимал разумное время, поиск требуется производить только по некоторому заданному набору документов. Список целевых статей описан в файле selected_docs.tsv по одной на строку.

In [None]:
from multiprocessing import Pool

def load_docs(selected_docs_fn: ArticleName, threads: int = 4) -> Dict[ArticleName, Text]:    
    docs = {}
    pool = Pool(threads)
    tasks = []
    if os.path.exists("/content/drive/MyDrive/wiki_/docs"):
        with open("/content/drive/MyDrive/wiki_/docs", "rb") as f:
            docs = pickle.load(f)
        return docs
    tasks = []
    for line in tqdm(open(selected_docs_fn)):
        article_name = line.strip()
        tasks.append((article_name, pool.apply_async(get_article_text, (article_name, ))))
    for article_name, task in tqdm(tasks):
        docs[article_name] = task.get(10**6)
    return docs
    
docs = load_docs("./selected_docs.tsv", 32)
print(f'{len(docs)} docs loaded')

15190 docs loaded


#### 1.2. Поиск
Реализуйте разбиение текста на термы.

In [None]:
import nltk

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [None]:
def remove_stop_words(text):
    tokenizer = RegexpTokenizer(r'\w+|\d+')
    words = tokenizer.tokenize(text)
    ps = nltk.stem.PorterStemmer()
    filtered_words = [ps.stem(word) for word in words if word not in stopwords.words('english') and len(word) > 1]
    return filtered_words

In [None]:
docs["Linkin_Park"]

'Linkin Park is an American rock band from Agoura Hills, California. They started making music in 1996, the band became very famous with their first album, Hybrid Theory, which was labeled with the Diamond Award of more than 10 million copies by the RIAA and multi-platinum in several other countries.  Their second album, Meteora, continued the band\'s popularity, reaching number one at the Billboard 200 album chart in 2003. It was promoted with concerts and giving for charities around the world.  In 2003, MTV2 called Linkin Park the sixth greatest band of the music video time and the third best band of the new millennium, only beaten by Oasis and Coldplay.  Billboard ranked Linkin Park #19 on the Best Artists of the Decade (Past 10 Years) chart. The band used the genres nu metal and rap metal to make it suitable for radio yet heavy in layers in Hybrid Theory and Meteora.    The band used other types of music in their next studio album Minutes to Midnight, which was released in 2007.   

In [None]:
remove_stop_words(docs["Linkin_Park"])

['linkin',
 'park',
 'american',
 'rock',
 'band',
 'agoura',
 'hill',
 'california',
 'they',
 'start',
 'make',
 'music',
 '1996',
 'band',
 'becam',
 'famou',
 'first',
 'album',
 'hybrid',
 'theori',
 'label',
 'diamond',
 'award',
 '10',
 'million',
 'copi',
 'riaa',
 'multi',
 'platinum',
 'sever',
 'countri',
 'their',
 'second',
 'album',
 'meteora',
 'continu',
 'band',
 'popular',
 'reach',
 'number',
 'one',
 'billboard',
 '200',
 'album',
 'chart',
 '2003',
 'It',
 'promot',
 'concert',
 'give',
 'chariti',
 'around',
 'world',
 'In',
 '2003',
 'mtv2',
 'call',
 'linkin',
 'park',
 'sixth',
 'greatest',
 'band',
 'music',
 'video',
 'time',
 'third',
 'best',
 'band',
 'new',
 'millennium',
 'beaten',
 'oasi',
 'coldplay',
 'billboard',
 'rank',
 'linkin',
 'park',
 '19',
 'best',
 'artist',
 'decad',
 'past',
 '10',
 'year',
 'chart',
 'the',
 'band',
 'use',
 'genr',
 'nu',
 'metal',
 'rap',
 'metal',
 'make',
 'suitabl',
 'radio',
 'yet',
 'heavi',
 'layer',
 'hybrid',
 

In [None]:
def make_terms(text: Text) -> List[Term]:
    return remove_stop_words(text)

for text in ["Hello, world!", 
             "Guitar Hero: Aerosmith", 
             "So Far, So Good... So What!", 
             "such as mantı, döner, kebabs, Turkish delight, baklava, börek, köfte, and other foods",
            ]:
    print(f'{text} -> {make_terms(text)}')

Hello, world! -> ['hello', 'world']
Guitar Hero: Aerosmith -> ['guitar', 'hero', 'aerosmith']
So Far, So Good... So What! -> ['So', 'far', 'So', 'good', 'So', 'what']
such as mantı, döner, kebabs, Turkish delight, baklava, börek, köfte, and other foods -> ['mantı', 'döner', 'kebab', 'turkish', 'delight', 'baklava', 'börek', 'köfte', 'food']


Рассчитайте статистики термов и документов, которые понадобятся для реализации моделей tfidf и BM25: частота терма для документа, обратная документная частота терма и прочие.

In [None]:
import numpy as np

In [None]:
terms_from_doc = dict()
term_in_docs = dict()
terms = set()
f_ij = dict()
n_j = dict()
doc_len = dict()
for doc in tqdm(docs):
    terms_from_doc[doc] = Counter(make_terms(docs[doc]))
    doc_len[doc] = len(terms_from_doc[doc])
    for term in terms_from_doc[doc]:
        f_ij[(doc, term)] = terms_from_doc[doc][term]
        n_j[term] = n_j.get(term, 0) + 1
        terms.add(term)
        if term not in term_in_docs:
            term_in_docs[term] = set()
        term_in_docs[term].add(doc)

terms = list(terms)
average_doc_len = sum(doc_len[doc] for doc in doc_len) / len(doc_len)
vec = dict()
inv_index = dict()

for i, term in enumerate(terms):
    inv_index[term] = i

for doc in tqdm(docs):
    v = np.zeros(len(terms), dtype=np.float16)
    for term in terms_from_doc[doc]:
        v[inv_index[term]] = f_ij[(doc, term)] * np.log((len(docs) + 1) / n_j[term])
    vec[doc] = v

  0%|          | 0/15190 [00:00<?, ?it/s]

  0%|          | 0/15190 [00:00<?, ?it/s]

In [None]:
def prepare_collection_data(docs: Dict[ArticleName, Text]) -> CollectionData:
    terms_from_doc = dict()
    term_in_docs = dict()
    terms = set()
    f_ij = dict()
    n_j = dict()
    doc_len = dict()
    for doc in tqdm(docs):
        terms_from_doc[doc] = Counter(make_terms(docs[doc]))
        doc_len[doc] = len(terms_from_doc[doc])
        for term in terms_from_doc[doc]:
            f_ij[(doc, term)] = terms_from_doc[doc][term]
            n_j[term] = n_j.get(term, 0) + 1
            terms.add(term)
            if term not in term_in_docs:
                term_in_docs[term] = set()
            term_in_docs[term].add(doc)
    
    terms = list(terms)
    average_doc_len = sum(doc_len[doc] for doc in doc_len) / len(doc_len)
    vec = dict()
    inv_index = dict()

    for i, term in enumerate(terms):
        inv_index[term] = i
    
    for doc in tqdm(docs):
        v = np.zeros(len(terms), dtype=np.float16)
        for term in terms_from_doc[doc]:
            v[inv_index[term]] = f_ij[(doc, term)] * np.log((len(docs) + 1) / n_j[term])
        vec[doc] = v
    
    return {"terms_from_doc" : terms_from_doc, 
                   "term_in_docs" : term_in_docs,
                   "terms" : terms,
                   "f_ij": f_ij, 
                   "n_j" : n_j, 
                   "doc_len" : doc_len,
                   "average_doc_len": average_doc_len,
                   "vec" : vec,
                   "inv_index" : inv_index}
    
collection_data = prepare_collection_data(docs)

  0%|          | 0/15190 [00:00<?, ?it/s]

  0%|          | 0/15190 [00:00<?, ?it/s]

In [None]:
collection_data = {"terms_from_doc" : terms_from_doc, 
                   "term_in_docs" : term_in_docs,
                   "terms" : terms,
                   "f_ij": f_ij, 
                   "n_j" : n_j, 
                   "doc_len" : doc_len,
                   "average_doc_len": average_doc_len,
                   "vec" : vec,
                   "inv_index" : inv_index}

Реализуйте поиск лучших 10 документов в модели tfidf и BM25 с параметрами b = 1, k1 = 1, k2=1.
Строить инвертированный индекс не требуется (но и не запрещается).

In [None]:
def get_relevant_docs(f, term_in_docs):
    relevant_docs = set()
    for term in f:
        if term in term_in_docs:
            relevant_docs.update(term_in_docs[term])
    return relevant_docs

In [None]:
def simple_search(query: Text, collection_data: CollectionData, ranking_params: RankingParams) -> List[Tuple[ArticleName, float]]:
    result = []    

    f = Counter(make_terms(query))
    relevant_docs = get_relevant_docs(f, collection_data["term_in_docs"])
    return [(doc, 0) for doc in relevant_docs]

In [None]:
def TF_IDF(query: Text, collection_data: CollectionData, ranking_params: RankingParams) -> List[Tuple[ArticleName, float]]:
    result = []

    dlen = len(collection_data["terms_from_doc"])
    f_ij = collection_data["f_ij"]
    n_j = collection_data["n_j"]
    doc_len = collection_data["doc_len"]
           

    f = Counter(make_terms(query))
    relevant_docs = get_relevant_docs(f, collection_data["term_in_docs"])

    for doc in relevant_docs:
        value = 0
        for j in f:
            if j in n_j and (doc, j) in f_ij:
                value += f_ij[(doc, j)] * np.log((dlen + 1) / n_j[j])
        result.append((doc, value))
    result.sort(key=lambda x: x[1], reverse=True)
    return result


In [None]:
def BM25(query: Text, collection_data: CollectionData, ranking_params: RankingParams) -> List[Tuple[ArticleName, float]]:
    result = []

    dlen = len(collection_data["terms_from_doc"])
    f_ij = collection_data["f_ij"]
    n_j = collection_data["n_j"]
    doc_len = collection_data["doc_len"]
    average_doc_len = collection_data["average_doc_len"]
    
    k1 = ranking_params["k1"]
    k2 = ranking_params["k2"]
    b = ranking_params["b"]    

    f = Counter(make_terms(query))
    relevant_docs = get_relevant_docs(f, collection_data["term_in_docs"])

    for doc in relevant_docs:
        K = k1 * ((1 - b) + b * doc_len[doc] / average_doc_len)
        value = 0
        for j in f:
            if j in n_j and (doc, j) in f_ij:
                value += np.log((dlen + 1) / n_j[j]) * ((k1 + 1) * f_ij[(doc, j)] / (K + f_ij[(doc, j)])) * ((k2 + 1) * f[j] / (k2 + f[j]))
        result.append((doc, value))
    result.sort(key=lambda x: x[1], reverse=True)
    return result

In [None]:
def search(query: Text, collection_data: CollectionData, ranking_params: RankingParams) -> List[Tuple[ArticleName, float]]:
    if ranking_params["type"] == "BM25":
        return BM25(query, collection_data, ranking_params)
    elif ranking_params["type"] == "TF_IDF":
        return TF_IDF(query, collection_data, ranking_params)
    else:
        return simple_search(query, collection_data, ranking_params)


ranking_params = {"type": "TF_IDF",
                  "k1": 1,
                  "k2": 1,
                  "b": 1}
for query in ["coronovirus in belarus",
              "who won junior eurovision in 2005",
              "science about full-text search",
             ]:
    result = search(query, collection_data, ranking_params)[:5]
    print(f"[{query}]")
    for article_name, score in result:
        print(f"{score:7.2f}  {article_name}")
    print("\n")

[coronovirus in belarus]
  34.67  COVID-19_pandemic_in_Belarus
  28.89  Time_in_Belarus
  17.33  Nuclear_accident
  17.33  Daugava_River
  17.33  Poland


[who won junior eurovision in 2005]
  77.88  Dokkyo_Saitama_High_School
  75.99  Eurovision_Song_Contest_2011
  69.66  Eurovision:_Europe_Shine_a_Light
  60.17  Middle_school
  54.70  List_of_ice_hockey_leagues


[science about full-text search]
 540.45  Warsaw_Uprising
 446.16  Google_Search
 160.94  Science_fiction
 138.07  Shiva
 116.61  Philosophy_of_science




In [None]:
ranking_params = {"type": "BM25",
                  "k1": 1,
                  "k2": 1,
                  "b": 1}
for query in ["coronovirus in belarus",
              "who won junior eurovision in 2005",
              "science about full-text search",
             ]:
    result = search(query, collection_data, ranking_params)[:5]
    print(f"[{query}]")
    for article_name, score in result:
        print(f"{score:7.2f}  {article_name}")
    print("\n")

[coronovirus in belarus]
  10.68  Time_in_Belarus
  10.57  COVID-19_pandemic_in_Belarus
  10.36  Daugava_River
   8.87  Bug_River
   8.58  Eurasian_Union


[who won junior eurovision in 2005]
  18.11  Junior_Eurovision_Song_Contest_2014
  17.82  Junior_Eurovision_Song_Contest_2015
  14.91  Junior_Eurovision_Song_Contest_2004
  14.70  Junior_Eurovision_Song_Contest_2019
  14.06  Katherine_Hansen


[science about full-text search]
  17.59  Information_retrieval
  12.71  Computer_vision
  12.57  Google_Search
  11.26  Binary_search
  11.24  The_Massacre_at_Paris




#### 1.3. Оптимизация качества
Для измерения качества поиска вам предоставляется список из пар (запрос, название статьи), которая означает, что по данному запросу данная статья является релевантной (а остальные – нерелевантны). Пары описаны в файле queries.tsv по одной на строку.

In [None]:
def load_queries(queries_fn: ArticleName) -> List[Tuple[Text, ArticleName]]:
    queries = []
    for line in open(queries_fn):
        query, answer = line.rstrip().split('\t', 1)
        queries.append((query, answer))
    return queries

queries = load_queries("./queries.tsv")
for query, answer in queries:
    assert answer in docs
    
print(f'{len(queries)} queries loaded')
for query, article_name in queries[:5]:
    print(f'{query} -> {article_name}')

200 queries loaded
animals that have shells and live in water -> Shell_(zoology)
how many different types of scorpions are there -> Scorpion
describe the structure of a scientific name for a species -> Binomial_nomenclature
what are the 3 types of plastids in plant cells -> Plastid
who named the cell and how did he come up with that name -> Cell_theory


Оценим поиск по нескольким метрикам: accuracy – доля запросов, где на первой позиции был найден релевантный документ; accuracy@10 – доля запросов, где релевантный документ попал в первую десятку, mrr@10 – средняя обратная позиция релевантного документа в первой десятке.

In [None]:
def run(title, queries: List[Tuple[Text, ArticleName]], collection_data: CollectionData, ranking_params: RankingParams) -> None:
    accuracy = 0.0
    accuracy10 = 0.0
    rr = 0.0
    processed = 0
    with tqdm(queries) as progress:
        for query, answer in progress:
            result = search(query, collection_data, ranking_params)[:10]
            
            rank = None
            for position, (article_name, score) in enumerate(result):
                if article_name == answer:
                    rank = position + 1
                    break
                
            if rank is not None:
                accuracy += (rank == 1)
                accuracy10 += (rank <= 10)
                rr += 1.0 / rank
                
            processed += 1
            progress.set_description(f'Acc: {accuracy/processed:0.2f}, Acc10: {accuracy10/processed:0.2f}, RR: {rr/processed:0.2f}')
    print(f'{title}\n  Accuracy: {accuracy/processed:0.2f}\n  Accuracy10: {accuracy10/processed:0.2f}\n  RR: {rr/processed:0.2f}')
    return accuracy/processed
    
ranking_params = {"type": "no_ranking"} 
run("JustRun", queries, collection_data, ranking_params)

  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.00
  Accuracy10: 0.00
  RR: 0.00


0.0

Сравните реализованные Вами алгоритмы tfidf и BM25 по этим метрикам.

In [None]:
ranking_params = ranking_params = {"type": "TF_IDF",
                  "k1": 1,
                  "k2": 1,
                  "b": 1}
run("JustRun", queries, collection_data, ranking_params)

  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.05
  Accuracy10: 0.23
  RR: 0.10


0.05

In [None]:
ranking_params = ranking_params = {"type": "BM25",
                  "k1": 1,
                  "k2": 1,
                  "b": 1}
run("JustRun", queries, collection_data, ranking_params)

  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.20
  Accuracy10: 0.52
  RR: 0.29


0.195

Подберите оптимальные параметры BM25 для этого набора запросов и документов.

In [None]:
best_params = {}
best_score = 0.0

for b in list(np.linspace(0, 1, 5)) + list(np.linspace(2, 10, 5)):
    for k1 in list(np.linspace(0, 1, 5)) + list(np.linspace(2, 10, 5)):
        for k2 in list(np.linspace(0, 1, 5)) + list(np.linspace(2, 10, 5)):
            ranking_params = ranking_params = {"type": "BM25",
                  "b": b,
                  "k1": k1,
                  "k2": k2}
            score = run("JustRun", queries, collection_data, ranking_params)
            if score > best_score:
                best_params = ranking_params
                best_score = score

  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.22
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.22
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.22
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.21
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.21
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.21
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.21
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.21
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.21
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.06
  Accuracy10: 0.21
  RR: 0.11


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.09
  Accuracy10: 0.28
  RR: 0.14


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.09
  Accuracy10: 0.28
  RR: 0.14


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.09
  Accuracy10: 0.28
  RR: 0.14


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.09
  Accuracy10: 0.28
  RR: 0.14


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.28
  RR: 0.15


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.28
  RR: 0.15


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.28
  RR: 0.15


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.28
  RR: 0.15


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.28
  RR: 0.15


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.28
  RR: 0.15


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.32
  RR: 0.16


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.32
  RR: 0.16


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.10
  Accuracy10: 0.32
  RR: 0.16


  0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
for b in list(np.linspace(0.5, 1, 17)):
    ranking_params = ranking_params = {"type": "BM25",
          "b": b,
          "k1": 1,
          "k2": 1}
    score = run("JustRun", queries, collection_data, ranking_params)
    print(ranking_params)

  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.21
  Accuracy10: 0.50
  RR: 0.29
{'type': 'BM25', 'b': 0.5, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.21
  Accuracy10: 0.51
  RR: 0.30
{'type': 'BM25', 'b': 0.53125, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.51
  RR: 0.30
{'type': 'BM25', 'b': 0.5625, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.51
  RR: 0.31
{'type': 'BM25', 'b': 0.59375, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.51
  RR: 0.30
{'type': 'BM25', 'b': 0.625, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.52
  RR: 0.30
{'type': 'BM25', 'b': 0.65625, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.51
  RR: 0.30
{'type': 'BM25', 'b': 0.6875, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.52
  RR: 0.30
{'type': 'BM25', 'b': 0.71875, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.52
  RR: 0.30
{'type': 'BM25', 'b': 0.75, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.52
  RR: 0.30
{'type': 'BM25', 'b': 0.78125, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.53
  RR: 0.30
{'type': 'BM25', 'b': 0.8125, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.54
  RR: 0.30
{'type': 'BM25', 'b': 0.84375, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.23
  Accuracy10: 0.54
  RR: 0.31
{'type': 'BM25', 'b': 0.875, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.20
  Accuracy10: 0.53
  RR: 0.29
{'type': 'BM25', 'b': 0.90625, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.20
  Accuracy10: 0.54
  RR: 0.29
{'type': 'BM25', 'b': 0.9375, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.20
  Accuracy10: 0.53
  RR: 0.29
{'type': 'BM25', 'b': 0.96875, 'k1': 1, 'k2': 1}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.20
  Accuracy10: 0.52
  RR: 0.29
{'type': 'BM25', 'b': 1.0, 'k1': 1, 'k2': 1}


In [None]:
for k2 in list(np.linspace(0, 1, 5)) + list(np.linspace(2, 10, 5)):
    ranking_params = ranking_params = {"type": "BM25",
          "b": 0.84375,
          "k1": 8.0,
          "k2": k2}
    score = run("JustRun", queries, collection_data, ranking_params)
    print(ranking_params)

  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 0.0}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 0.25}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 0.75}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 1.0}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.28
  Accuracy10: 0.58
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 2.0}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.28
  Accuracy10: 0.58
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 4.0}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.28
  Accuracy10: 0.58
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 6.0}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.58
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 8.0}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.58
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 10.0}


In [None]:
for b in list(np.linspace(0.75, 0.9, 17)):
    ranking_params = ranking_params = {"type": "BM25",
          "b": b,
          "k1": 8.0,
          "k2": 0.5}
    score = run("JustRun", queries, collection_data, ranking_params)
    print(ranking_params)

  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.75, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.759375, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.76875, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.778125, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.7875, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.796875, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.80625, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.815625, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.825, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.834375, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.59
  RR: 0.38
{'type': 'BM25', 'b': 0.84375, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.57
  RR: 0.38
{'type': 'BM25', 'b': 0.853125, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.57
  RR: 0.38
{'type': 'BM25', 'b': 0.8625, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.8812500000000001, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.55
  RR: 0.37
{'type': 'BM25', 'b': 0.890625, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.29
  Accuracy10: 0.55
  RR: 0.37
{'type': 'BM25', 'b': 0.9, 'k1': 8.0, 'k2': 0.5}


In [None]:
for k2 in list(np.linspace(0.4, 0.6, 17)):
    ranking_params = ranking_params = {"type": "BM25",
          "b": 0.871875,
          "k1": 8.0,
          "k2": k2}
    score = run("JustRun", queries, collection_data, ranking_params)
    print(ranking_params)

  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.4}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.41250000000000003}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.42500000000000004}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.4375}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.45}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.4625}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.475}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.4875}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.5}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.5125}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.525}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.5375}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.55}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.5625}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.575}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.5874999999999999}


  0%|          | 0/200 [00:00<?, ?it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38
{'type': 'BM25', 'b': 0.871875, 'k1': 8.0, 'k2': 0.6}


In [None]:
ranking_params = ranking_params = {"type": "BM25",
          "b": 0.871875,
          "k1": 8.0,
          "k2": 0.5}
score = run("JustRun", queries, collection_data, ranking_params)

Acc: 0.30, Acc10: 0.56, RR: 0.38: 100%|██████████| 200/200 [00:09<00:00, 21.07it/s]

JustRun
  Accuracy: 0.30
  Accuracy10: 0.56
  RR: 0.38





### 2.Поиск в векторной модели (1 балл)

Используйте готовые эмбединги или энкодер нейронной модели, и преобразуйте с помощью него запросы и документы в вектора небольшой размерности.
Реализуйте поиск лучших документов по косинусной мере или скалярному произведению между векторами запроса и документа. Сравните результаты с моделями из предыдущих пунктов по метрикам и позапросно – выигрывает ли векторный вариант на тех примерах, где предыдущие модели не справляются? 

In [None]:
len(collection_data["terms_from_doc"])

15190

In [None]:
def cosine_similarity(query: Text, collection_data: CollectionData, ranking_params: RankingParams) -> List[Tuple[ArticleName, float]]:
    result = []

    dlen = len(collection_data["terms_from_doc"])
    f_ij = collection_data["f_ij"]
    n_j = collection_data["n_j"]
    inv_index = collection_data["inv_index"]

    qv = np.zeros(len(collection_date["terms"]), dtype=np.float16)  
    f = Counter(make_terms(query))
    relevant_docs = get_relevant_docs(f, collection_data["term_in_docs"])

    for term in f:
        if term in inv_index:
            qv[inv_index[term]] = f[term] * log((dlen + 1) / n_j[term])

    for doc in relevant_docs:
        v = collection_date["vec"][doc]
        value = np.dot(qv, v) / (np.linalg.norm(qv) * np.linalg.norm(v))
        result.append((doc, value))
    result.sort(key=lambda x: x[1], reverse=True)
    return result

In [None]:
!wget "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"

--2021-10-23 19:32:53--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2021-10-23 19:33:11 (35.6 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]



In [None]:
!unzip wiki-news-300d-1M.vec.zip

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [None]:
def get_coefs(word, *arr): 
    return word, np.asarray(arr, dtype='float32')

In [None]:
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open('/content/wiki-news-300d-1M.vec'))

In [None]:
def text_to_vec(text):
    tvec = np.zeros(300, dtype=np.float16)
    terms = make_terms(text)[:300]
    for term in terms:
        if term in embeddings_index:
            tvec += embeddings_index[term]
    tvec /= len(terms)
    return tvec

In [None]:
vec = dict()
for doc in tqdm(docs):
    vec[doc] = text_to_vec(docs[doc])

  0%|          | 0/15190 [00:00<?, ?it/s]

  import sys


In [None]:
def embedding(query: Text, vec: CollectionData) -> List[Tuple[ArticleName, float]]:
    result = []

    qvec = text_to_vec(query)
    
    for doc in vec:
        v = vec[doc]
        value = np.dot(qvec, v) / (np.linalg.norm(qvec) * np.linalg.norm(v))
        result.append((doc, value))
    result.sort(key=lambda x: x[1], reverse=True)
    return result


In [None]:
def run_embedding(title, queries: List[Tuple[Text, ArticleName]], vec: CollectionData) -> None:
    accuracy = 0.0
    accuracy10 = 0.0
    rr = 0.0
    processed = 0
    with tqdm(queries) as progress:
        for query, answer in progress:
            result = embedding(query, vec)[:10]
            
            rank = None
            for position, (article_name, score) in enumerate(result):
                if article_name == answer:
                    rank = position + 1
                    break
                
            if rank is not None:
                accuracy += (rank == 1)
                accuracy10 += (rank <= 10)
                rr += 1.0 / rank
                
            processed += 1
            progress.set_description(f'Acc: {accuracy/processed:0.2f}, Acc10: {accuracy10/processed:0.2f}, RR: {rr/processed:0.2f}')
    print(f'{title}\n  Accuracy: {accuracy/processed:0.2f}\n  Accuracy10: {accuracy10/processed:0.2f}\n  RR: {rr/processed:0.2f}')
    return accuracy/processed
    
run_embedding("JustRun", queries, vec)

  0%|          | 0/200 [00:00<?, ?it/s]

  


JustRun
  Accuracy: 0.09
  Accuracy10: 0.23
  RR: 0.13


0.09

In [None]:
collection_data = {"terms_from_doc" : terms_from_doc, 
                   "term_in_docs" : term_in_docs,
                   "terms" : terms,
                   "f_ij": f_ij, 
                   "n_j" : n_j, 
                   "doc_len" : doc_len,
                   "average_doc_len": average_doc_len,
                   "vec" : vec,
                   "inv_index" : inv_index}