In [27]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import re
from multiprocessing import Pool, Lock, Value
from time import sleep
import os
from tqdm import tqdm
from pymystem3 import Mystem
from _collections import defaultdict

In [2]:
# Input tables.
# `queries` and `urls` are header-less, tab-separated files; in `queries`
# column 0 is the query id and column 1 the query text.
# The separator is spelled as the escape '\t' rather than a literal TAB character,
# which editors and formatters tend to replace with spaces without any warning.
queries = pd.read_csv('queries.numerate.txt', sep='\t', header=None)
urls = pd.read_csv('urls.numerate.txt', sep='\t', header=None)
# Sample submission: provides the (QueryId, DocumentId) pairs that have to be ranked.
samples = pd.read_csv('sample.technosphere.ir1.textrelevance.submission.txt')

In [14]:
# DocumentId -> DocumentName lookup table: the id column becomes the index
# and is removed from the regular columns in the same step.
docid_df = pd.read_csv('docids.txt').set_index('DocumentId')
docid_df.head()

Unnamed: 0_level_0,DocumentName
DocumentId,Unnamed: 1_level_1
1,20170707/doc.2351.dat
2,20170707/doc.2661.dat
3,20170707/doc.1883.dat
4,20170707/doc.0713.dat
5,20170707/doc.0996.dat


### Document processing

In [16]:
# Shared Mystem lemmatizer instance used by the document parser and by query processing.
stem = Mystem()
# Token pattern: runs of Latin letters, Cyrillic letters and digits.
# 'Ё'/'ё' are listed explicitly because they lie outside the contiguous А-я range;
# without them words such as "всё" were split into fragments ("вс").
PATTERN = re.compile(r'[A-Za-zА-Яа-яЁё0-9]+')

In [17]:
def _lemmatize_text(text, lemma_cache):
    """Lowercase `text`, tokenize it with PATTERN and return the lemmas joined by spaces.

    `lemma_cache` maps an already seen token to its lemma, so every distinct token
    is passed to Mystem only once per document instead of once per occurrence.
    """
    lemmas = []
    for word in PATTERN.findall(text.lower()):
        lemma = lemma_cache.get(word)
        if lemma is None:
            lemma = lemma_cache[word] = stem.lemmatize(word)[0]
        lemmas.append(lemma)
    return ' '.join(lemmas)


def doc2words(doc_id):
    """Parse one raw document and store its lemmatized fields in parsed/<doc_id>.txt.

    The output file has three parts: line 1 is the title, line 2 the text of all
    <h1>..<h6> headers, and the rest is the remaining visible text of the page.

    Args:
        doc_id: a DocumentId, i.e. a label of `docid_df`'s index.
    """
    header_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    # Look the file name up by label: the frame is indexed by DocumentId, so this
    # does not depend on the ids being 1..N in file order.
    doc_name = docid_df.loc[doc_id, 'DocumentName']
    with open('content/content/' + doc_name, errors='ignore') as read_file:
        lines = list(read_file)
    # The first line of the file is not part of the markup that gets parsed
    # (presumably a metadata line such as the page URL -- TODO confirm).
    html = "".join(lines[1:])

    # NOTE(review): no parser is named, so bs4 picks the best one installed; pin it
    # explicitly once the parser used for the existing parsed/ files is confirmed.
    soup = BeautifulSoup(html)

    # Read title and headers first, then strip them (and scripts/styles) from the
    # same tree -- one parse instead of two.
    title_text = ' '.join(e.get_text() for e in soup.find_all('title'))
    headers_text = ' '.join(e.get_text() for e in soup.find_all(header_tags))
    for tag in soup(['script', 'style', 'title'] + header_tags):
        tag.extract()
    body_text = soup.get_text('\n', True)

    lemma_cache = {}
    title = _lemmatize_text(title_text, lemma_cache)
    headers = _lemmatize_text(headers_text, lemma_cache)
    body = _lemmatize_text(body_text, lemma_cache)

    with open('parsed/{}.txt'.format(doc_id), 'w') as f:
        f.write(title + '\n')
        f.write(headers + '\n')
        f.write(body)

In [25]:
# Lock guarding the progress counter and the progress print-out in func_wrapper.
mutex = Lock()
# Integer counter ('i' typecode) of documents processed so far.
n_processed = Value('i', 0)

def func_wrapper(doc_id):
    """Parse one document, then bump and occasionally print the progress counter."""
    doc2words(doc_id)
    # Objects shared between the processes may be modified safely inside this block.
    with mutex:
        n_processed.value += 1
        done = n_processed.value
        if done % 10 == 0:
            print(f"\r{done} objects are processed...", end='', flush=True)
    
# Parse all documents in parallel with 12 worker processes; every DocumentId in
# docid_df's index is handed to func_wrapper.
with Pool(processes=12) as pool:
    pool.map(func_wrapper, docid_df.index)

38110 objects are processed...

### BM-25

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f9d93756035e66c406a96470c7bf801b5161e238"/>

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c652b6871ce4872c8e924ff0f806bc8b06dc94ed"/>

$f(q_{i},D)$ — частота слова (term frequency, TF) $q_{i}$ в документе D

$|D|$ — длина документа (количество слов в нём)

$avgdl$ — средняя длина документа в коллекции

$k_{1}$ и $b$ — свободные коэффициенты, обычно их выбирают как $k_{1}=2.0$ и $b=0.75$

$N$ — общее количество документов в коллекции

$n(q_{i})$ — количество документов, содержащих $q_{i}$ 

In [28]:
# query id -> list of candidate DocumentIds, in the order they appear in `samples`.
query2docs = defaultdict(list)

# Group the submission rows once instead of re-filtering `samples` for every query
# and walking the matches with iterrows().
docs_by_query = {
    query_id: doc_ids.tolist()
    for query_id, doc_ids in samples.groupby('QueryId', sort=False)['DocumentId']
}

# Keys are inserted in the order of `queries`; queries without candidate rows
# get no entry.
for query_id in queries[0]:
    if query_id in docs_by_query:
        query2docs[query_id].extend(docs_by_query[query_id])

In [36]:
class DocStats:
    """Per-field collection statistics used for BM25 scoring.

    Reads ``parsed/<doc_id>.txt`` for every document listed in the module-level
    ``query2docs`` (line 1 = title, line 2 = headers, rest = body) and collects,
    separately for each of the three fields, the average length in tokens and an
    IDF value per token.

    Attributes:
        collection_len: number of documents read (a document listed under several
            queries is counted once per listing).
        title_len, headers_len, body_len: average field length in tokens.
        title2idf, headers2idf, body2idf: token -> IDF mappings.
    """

    def __init__(self):
        self.collection_len = 0
        self.title_len = 0
        self.headers_len = 0
        self.body_len = 0
        self.title2idf = defaultdict(float)
        self.headers2idf = defaultdict(float)
        self.body2idf = defaultdict(float)
        for docs in tqdm(query2docs.values()):
            for doc_id in docs:
                # Only the reads need the file; it is closed before the statistics work.
                with open('parsed/{}.txt'.format(doc_id), errors='ignore') as f:
                    title = PATTERN.findall(f.readline().lower())
                    headers = PATTERN.findall(f.readline().lower())
                    body = PATTERN.findall(f.read().lower())
                self.collection_len += 1
                self.title_len += len(title)
                self.wordStats(title, self.title2idf)
                self.headers_len += len(headers)
                self.wordStats(headers, self.headers2idf)
                self.body_len += len(body)
                self.wordStats(body, self.body2idf)
        # Up to here the *2idf dicts hold document frequencies; turn them into IDF.
        self.countIDF(self.title2idf)
        self.countIDF(self.headers2idf)
        self.countIDF(self.body2idf)
        # Turn total lengths into averages; an empty collection keeps them at 0
        # instead of raising ZeroDivisionError.
        if self.collection_len:
            self.title_len /= self.collection_len
            self.headers_len /= self.collection_len
            self.body_len /= self.collection_len

    def wordStats(self, text, IDFdict):
        """Add 1 to the document frequency of every distinct token of `text`.

        Args:
            text: list of tokens of one field of one document.
            IDFdict: token -> document-frequency mapping, updated in place.
        """
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
        # insertion order of IDFdict stays deterministic.
        for word in dict.fromkeys(text):
            IDFdict[word] = IDFdict.get(word, 0.0) + 1

    def countIDF(self, IDFdict):
        """Replace document frequencies in `IDFdict` by IDF values, in place.

        IDF(w) = log(N - n(w) + 0.5) - log(n(w) + 0.5) with N = ``collection_len``.
        Negative values (tokens present in more than half of the documents) are
        replaced by ``eps = 0.25 * mean IDF`` of the field.

        Args:
            IDFdict: token -> document-frequency mapping; overwritten with IDF values.
        """
        if not IDFdict:
            # Empty vocabulary: nothing to convert, and the mean below would be 0 / 0.
            return
        idf_sum = 0
        neg_idf = []
        for word, doc_freq in list(IDFdict.items()):
            idf = np.log(self.collection_len - doc_freq + 0.5) - np.log(doc_freq + 0.5)
            IDFdict[word] = idf
            idf_sum += idf
            if idf < 0:
                neg_idf.append(word)
        eps = 0.25 * idf_sum / len(IDFdict)
        for word in neg_idf:
            IDFdict[word] = eps

In [45]:
class BM25:
    """BM25 ranker that scores title, headers and body separately and mixes them.

    Args:
        k: term-frequency saturation parameter (k1 in the formula).
        b: length-normalization parameter.
        weights: (title, headers, body) multipliers applied to the per-field
            scores; the default reproduces the previously hard-coded 2.7 / 1.0 / 1.5.
    """

    def __init__(self, k=2.0, b=0.75, weights=(2.7, 1.0, 1.5)):
        self.k = k
        self.b = b
        self.weights = weights
        self.doc_stats = DocStats()
        # query id -> lemmatized query tokens, so a query is lemmatized once
        # rather than once per candidate document.
        self._query_terms_cache = {}

    def getBest(self, query_id):
        """Return [(doc_id, score), ...] for the query's candidates, best first."""
        doc2score = {}
        # .get avoids inserting an empty list into query2docs for unknown ids.
        for doc_id in query2docs.get(query_id, ()):
            doc2score[doc_id] = self.getScore(query_id, doc_id)
        return sorted(doc2score.items(), key=lambda x: x[1], reverse=True)

    def getScore(self, query_id, doc_id):
        """Return the weighted sum of the per-field BM25 scores of one document.

        Args:
            query_id: id of the query (value of column 0 of `queries`).
            doc_id: id of a document that has a parsed/<doc_id>.txt file.
        """
        with open('parsed/{}.txt'.format(doc_id), errors='ignore') as f:
            title = PATTERN.findall(f.readline().lower())
            headers = PATTERN.findall(f.readline().lower())
            body = PATTERN.findall(f.read().lower())

        query = self._query_terms(query_id)
        stats = self.doc_stats
        title_score = self._field_score(query, title, stats.title2idf, stats.title_len)
        headers_score = self._field_score(query, headers, stats.headers2idf, stats.headers_len)
        body_score = self._field_score(query, body, stats.body2idf, stats.body_len)

        title_weight, headers_weight, body_weight = self.weights
        return title_weight * title_score + headers_weight * headers_score + body_weight * body_score

    def _query_terms(self, query_id):
        """Return the lemmatized tokens of a query, computing them on first use."""
        terms = self._query_terms_cache.get(query_id)
        if terms is None:
            # Look the text up by id (column 0) rather than by row position, so the
            # result does not depend on ids being 1..N in file order.
            query = queries.loc[queries[0] == query_id, 1].iloc[0]
            terms = PATTERN.findall(' '.join(stem.lemmatize(query.lower())))
            self._query_terms_cache[query_id] = terms
        return terms

    def _field_score(self, query_terms, tokens, idf, avg_len):
        """BM25 score of one field: sum over the query terms of IDF * saturated TF."""
        if not tokens:
            # No tokens means every term frequency is 0, hence a zero score.
            return 0.0
        tf = {}
        for token in tokens:
            tf[token] = tf.get(token, 0) + 1
        # With no average length available (0) fall back to neutral normalization
        # instead of dividing by zero.
        length_ratio = len(tokens) / avg_len if avg_len else 1.0
        norm = self.k * (1 - self.b + self.b * length_ratio)
        score = 0.0
        for term in query_terms:
            freq = tf.get(term, 0)
            if freq:
                # .get keeps the shared IDF tables from growing with unseen terms.
                score += idf.get(term, 0.0) * (freq * (self.k + 1)) / (freq + norm)
        return score

### Make result

In [47]:
# Constructing the model builds DocStats, which reads every parsed candidate document
# (this is what the progress bar below tracks).
model = BM25()

100%|██████████| 399/399 [02:54<00:00,  2.28it/s]


In [48]:
# Flattened ranking: QueryId[i] is the query that docsId[i] was ranked for;
# documents of one query are listed from the highest to the lowest score.
QueryId = []
docsId = []
# Iterate over the ids present in `queries` (column 0) instead of a hard-coded
# range(1, 400), so the cell keeps working if the number of queries changes.
for query_id in tqdm(queries[0]):
    best = model.getBest(query_id)
    QueryId.extend([query_id] * len(best))
    docsId.extend(doc_id for doc_id, _ in best)

100%|██████████| 399/399 [03:19<00:00,  2.00it/s]


In [49]:
# Two-column submission table built from the parallel QueryId / docsId lists.
result_df = pd.DataFrame({
    'QueryId' : QueryId,
    'DocumentId' : docsId
})

In [50]:
# Quick look at the first ranked rows.
result_df.head()

Unnamed: 0,QueryId,DocumentId
0,1,78
1,1,28
2,1,51
3,1,53
4,1,83


In [51]:
# Write the ranking to result.txt as CSV without the row index column.
result_df.to_csv('result.txt', index=False)