In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import re
from multiprocessing import Pool, Lock, Value
from time import sleep
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer 
import json
from tqdm import tqdm

In [2]:
# The query and URL tables are header-less and tab-separated, so pandas labels
# their columns 0 and 1.
queries, urls = (
    pd.read_csv(path, sep='\t', header=None)
    for path in ('queries.numerate.txt', 'urls.numerate.txt')
)
samples = pd.read_csv('sample.technosphere.ir1.textrelevance.submission.txt')
# Index the URL table by its column 1 (the column itself is kept) so that rows
# can be looked up by that value.
urls = urls.set_index(1, drop=False)

In [None]:
# Collect every document path relative to the corpus root as "<subdir>/<file>".
docnames = []

listdir = os.listdir('content/content/')

for d in listdir:
    # File names are sorted inside each subdirectory; the subdirectories
    # themselves are visited in whatever order os.listdir returned them.
    listdocs = sorted(os.listdir('content/content/' + d))
    docnames.extend(d + '/' + name for name in listdocs)
docnames[-5:]

In [None]:
# Peek at the first document: everything after its first line is parsed as
# HTML and the extracted text is displayed.
sample_path = 'content/content/' + docnames[0]
with open(sample_path, errors='ignore') as read_file:
    lines = read_file.readlines()
html = "".join(lines[1:])
soup = BeautifulSoup(html)
soup.get_text()

In [None]:
# doc_name | doc_id
# The first line of every document file is stripped and looked up in `urls`
# (indexed by its column 1); the value in column 0 of that row is the doc id.

doc_ids = []

for docname in docnames:
    with open('content/content/' + docname, errors='ignore') as read_file:
        # Only the first line is needed, so do not load the whole document.
        url = read_file.readline().strip()
    # Raises KeyError if the first line is not present in the `urls` index.
    doc_id = urls.at[url, 0]
    doc_ids.append(doc_id)

In [None]:
# One row per document file: its relative path and the id resolved for it.
docid_df = pd.DataFrame(dict(DocumentName=docnames, DocumentId=doc_ids))

In [None]:
# Preview the first five rows of the name/id table.
docid_df.head(5)

In [None]:
# Order the table by ascending document id.
docid_df = docid_df.sort_values(by='DocumentId', ascending=True)

In [None]:
# Save the table to disk without the row index, so later runs can reload it
# instead of rebuilding it.
docid_df.to_csv(path_or_buf='docids.txt', index=False)

In [3]:
# Reload the saved name/id table.
docid_df = pd.read_csv(filepath_or_buffer='docids.txt')

In [4]:
# Use the document id as the row index; the column itself is kept.
docid_df = docid_df.set_index('DocumentId', drop=False)

In [None]:
# Remove the id columns (the id remains available as the index).
# 'DocumentId.1' only exists when the CSV carried a duplicated column, so
# missing labels are ignored instead of raising KeyError.
docid_df = docid_df.drop(columns=['DocumentId', 'DocumentId.1'], errors='ignore')

In [None]:
# Preview the first five rows after re-indexing.
docid_df.head(5)

In [None]:
# File name stored in the first row of the table.
docid_df['DocumentName'].iloc[0]

In [None]:
# Any run of characters other than Latin letters, Cyrillic letters and digits
# is collapsed into one space. 'Ё'/'ё' are listed explicitly because they lie
# outside the 'А-Я' / 'а-я' code point ranges.
_NON_WORD_RE = re.compile(r'[^A-Za-zА-Яа-яЁё0-9]+')


def doc2words(doc_id):
    """Return the searchable text of one document as a single string.

    The document file is found through ``docid_df`` and everything after its
    first line is parsed as HTML. The result concatenates, in this order:
    the ``title`` attributes of all ``<a>`` tags, the page ``<title>`` text
    and the full page text, each with non-alphanumeric runs replaced by a
    space.

    Parameters
    ----------
    doc_id
        Document id. It is looked up by label, so ``docid_df`` must be
        indexed by ``DocumentId``; a positional lookup would return the wrong
        row whenever ids do not coincide with row positions.

    Returns
    -------
    str
        Space-separated text; empty if the document has no text at all.

    Raises
    ------
    KeyError
        If ``doc_id`` is not in the ``docid_df`` index.
    """
    doc_name = docid_df.loc[doc_id, 'DocumentName']
    with open('content/content/' + doc_name, errors='ignore') as read_file:
        read_file.readline()  # skip the first line; it is not part of the HTML
        html = read_file.read()
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
    soup = BeautifulSoup(html)

    parts = [a['title'] for a in soup.find_all('a') if a.get('title')]
    if soup.title and soup.title.text:
        parts.append(soup.title.text)
    if soup.text:
        parts.append(soup.text)
    return ' '.join(_NON_WORD_RE.sub(' ', part) for part in parts)

### index + tf-idf

In [None]:
# Stop-word list: one entry per line, surrounding whitespace removed.
with open('stop_words.txt', "r") as rf:
    lines = list(map(str.strip, rf))
stop_words = set(lines)

Index format: `{term1: [[doc_id1, term_freq1], [doc_id2, term_freq2], ...], term2: ...}`

In [None]:
stemmer = SnowballStemmer("russian")


def create_index(doc_id):
    """Build the term-frequency index of a single document.

    The document text comes from ``doc2words(doc_id)``. It is split on
    whitespace, lower-cased, filtered against ``stop_words`` and stemmed with
    the Russian Snowball stemmer.

    Parameters
    ----------
    doc_id
        Id of the document to index; it is also the id stored in each posting.

    Returns
    -------
    dict
        ``{term: [[doc_id, tf]]}`` where ``tf`` is the number of occurrences
        of ``term`` divided by the number of terms kept for the document.
        Empty when no term survives filtering.
    """
    text = doc2words(doc_id)
    terms = [t.lower().strip() for t in str(text).split()]
    terms = [stemmer.stem(t) for t in terms if t not in stop_words]

    counts = {}
    for term in terms:
        counts[term] = counts.get(term, 0) + 1

    # Normalise every count exactly once. Dividing while iterating over the
    # token list would divide a repeated term once per occurrence.
    total = len(terms)
    return {term: [[doc_id, count / total]] for term, count in counts.items()}

### поисковые расширения

In [None]:
# Query expansion table: a lower-cased query word maps to extra words that are
# appended to the query. Multi-word expansions are listed as separate words.
synonims = {
    "применить": ["использовать"],
    "инстаграм": ["instagram"],
    "кап": ["капитальный"],
    "биос": ["bios"],
    "майнкрафт": ["minecraft"],
    "авто": ["автомобиль", "машина"],
    "гта": ["gta"],
    "опфр": ["пенсионный", "фонд", "российской", "федерации"],
    "ифнс": ["инспекция", "федеральной", "налоговой", "службы"],
    "бесишь": ["раздражаешь", "злишь", "нервируешь"],
    "вай": ["wi"],
    "фай": ["fi"],
    "соц": ["социальный"],
    "вк": ["vk", "vkontakte", "вконтакте"],
    "кс": ["cs", "counter", "strike"],
    "дискорд": ["discord"],
    "киви": ["kiwi"],
    "трейнз": ["trainz"],
    "мерседес": ["mercedes"],
    "симс": ["sims"],
    # A missing comma would join adjacent literals into "playstationportable".
    "псп": ["playstation", "portable", "psp"],
    "мод": ["mode"],
    "одн": ["общедомовые", "нужды"],
    "мегафон": ["megafon"],
    "асти": ["asti"],
    "пдф": ["pdf"],
    "бмп": ["bmp"],
    "ммр": ["mmr"],
    "ккал": ["калория"],
    "поу": ["pou"],
}

### Make result

In [None]:
# Submission column: ids 1..399, each repeated 10 times in a row
# (10 * 399 = 3990 entries).
QueryId = [query_id for query_id in range(1, 400) for _ in range(10)]

In [None]:
# Preview the first five queries.
queries.head(5)

In [None]:
# Preview the first five rows of the sample submission.
samples.head(5)

* term-frequency

$tf = \frac{n_t}{\sum_k n_k}$

$n_t$ - число вхождений слова $t$ в документ

$\sum_k n_k$ - общее число слов в данном документе

* inverse document frequency

$idf = \log \frac{|D|}{|\{d_i \in D : t \in d_i\}|}$

$|D|$ - число документов в коллекции

$|\{d_i \in D : t \in d_i\}|$ - число документов из коллекции $D$, в которых встречается $t$

text[0] - QueryId

text[1] - query text

docs_list = [[doc_id1, term_freq1], [doc_id2, term_freq2], ...]

In [None]:
stemmer = SnowballStemmer("russian")


def get_prediction(query_row, top_k=10):
    """Rank the candidate documents of one query by summed tf-idf.

    Parameters
    ----------
    query_row
        ``(index, row)`` pair as produced by ``queries.iterrows()``;
        ``row[0]`` is the query id and ``row[1]`` the query text.
    top_k : int, default 10
        Maximum number of document ids to return.

    Returns
    -------
    tuple
        ``(query_id, numpy.ndarray of document ids)`` ordered by decreasing
        score. Every candidate listed for the query in ``samples`` takes part
        in the ranking, including those that match no query word (score 0).
    """
    _, text = query_row
    # The query id is the value in column 0, not the positional row index
    # that iterrows() yields.
    query_id = text[0]

    candidate_ids = samples.loc[samples['QueryId'] == query_id, 'DocumentId'].tolist()
    D = len(candidate_ids)

    # Merge the per-document indexes into {term: [[doc_id, tf], ...]}.
    index = {}
    for doc_id in candidate_ids:
        for term, postings in create_index(doc_id).items():
            index.setdefault(term, []).extend(postings)

    # Normalise the query the same way as documents; synonyms are looked up
    # on the unstemmed words and appended before stemming.
    words = [w.lower().strip() for w in text[1].split()]
    words = [w for w in words if w not in stop_words]
    expansions = [extra for w in words for extra in synonims.get(w, [])]
    words = [stemmer.stem(w) for w in words + expansions]

    # Start every candidate at 0 so that non-matching documents are still
    # returned and the output length does not depend on the match count.
    scores = {doc_id: 0.0 for doc_id in candidate_ids}
    for word in words:
        postings = index.get(word, [])
        if not postings:
            continue
        idf = np.log(D / len(postings))
        for posting_doc_id, tf in postings:
            scores[posting_doc_id] = scores.get(posting_doc_id, 0.0) + tf * idf

    # sorted() is stable, so ties keep the order the candidates had in `samples`.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    doc_list = [doc_id for doc_id, _ in ranked[:top_k]]
    return (query_id, np.array(doc_list))
    

In [None]:
mutex = Lock()
n_processed = Value('i', 0)


def func_wrapper(query_row):
    """Run ``get_prediction`` for one query and update the progress counter.

    Returns whatever ``get_prediction(query_row)`` returns.
    """
    result = get_prediction(query_row)
    with mutex:
        # Objects shared between processes can be modified safely inside
        # this block.
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return result


# Every query is processed: the submission built afterwards expects a block of
# rows for each query id, so a truncated `queries.head()` run cannot be used.
with Pool(processes=12) as pool:
    result = pool.map(func_wrapper, queries.iterrows())

In [None]:
# Show only the first few (query_id, doc_ids) pairs instead of the whole list.
result[:5]

In [None]:
# Order the predictions by query id, then flatten the per-query document ids
# into one column for the submission.
result = sorted(result, key=lambda pair: pair[0])

docsId = []
# The loop variable is not called `id`: at notebook top level that would
# rebind the builtin for every cell executed afterwards.
for _query_id, docs in result:
    docsId.extend(docs)

In [None]:
# Submission table: one row per (query id, document id) pair. Both inputs must
# have the same length.
result_df = pd.DataFrame(dict(QueryId=QueryId, DocumentId=docsId))

In [None]:
# Preview the first five rows of the submission.
result_df.head(5)

In [None]:
# Write the submission file without the row index.
result_df.to_csv(path_or_buf='result.txt', index=False)