In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import bs4
import re
from multiprocessing import Pool, Lock, Value
from time import sleep
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer 
import json
from tqdm import tqdm

In [5]:
queries = pd.read_csv('queries.numerate.txt', sep='	', header=None)
urls = pd.read_csv('urls.numerate.txt', sep='	', header=None)
urls.index = urls[1]

In [3]:
docnames = []

listdir = os.listdir('content/content/')

for d in listdir:
    listdocs = os.listdir('content/content/' + d)
    listdocs.sort()
    docnames += list(map(lambda x: d + '/' + x, listdocs))
docnames[-5:]

['20170711/doc.4873.dat',
 '20170711/doc.4874.dat',
 '20170711/doc.4875.dat',
 '20170711/doc.4876.dat',
 '20170711/doc.4877.dat']

In [4]:
def doc2words(docname):
    with open('content/content/' + docname, errors='ignore') as read_file:
        lines = list(read_file)
    url = lines[0].strip()
    html = "".join(lines[1:])
    soup = BeautifulSoup(html)
    doc_id = urls.at[url, 0]
    if soup.text:
        text = re.sub(r'[^A-Za-zА-Яа-я0-9]+', ' ', soup.text)
    else:
        text = ""
    if soup.title:
        if soup.title.text:
            title = re.sub(r'[^A-Za-zА-Яа-я0-9]+', ' ', soup.title.text)
        else:
            title = ""
    else:
        title = ""

    return (title + text, doc_id)

In [5]:
mutex = Lock()
n_processed = Value('i', 0)

def func_wrapper(docname):
    res = doc2words(docname) 
    with mutex:
        # в этом блоке можно безопасно менять общие объекты для процессов
        global n_processed
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return res

with Pool(processes=10) as pool:
    docs = pool.map(func_wrapper, docnames)

38110 objects are processed...

In [6]:
docs_text = list(map(lambda x: x[0], docs))
docs_id = list(map(lambda x: x[1], docs))

doc_df = pd.DataFrame({'id': docs_id, 'text': docs_text})
doc_df = doc_df.sort_values(by=['id'])
doc_df.head()

Unnamed: 0,id,text
21640,1,Хорватия Хорватия EVA RU Вход Европа Черногори...
21950,2,Три недели в Европе Начало Три недели в Европе...
21172,3,Как правильно сделать визу в Польшу шенген Q A...
20002,4,Новости Новости О нас Отрасли Страны Новости О...
20285,5,Аллергия на Магические Грибы fb2 КулЛиб Классн...


In [7]:
doc_df.to_csv("text_relevance.csv")

### index + tf-idf

In [2]:
doc_df = pd.read_csv("text_relevance.csv")

In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words |= set(stopwords.words('russian'))
with open('stop_words.txt', "r") as rf:
    lines = [line.strip() for line in rf]
stop_words |= set(lines)
stop_words |= set(['.', '...', '-'])

[nltk_data] Downloading package stopwords to /home/anse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
doc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38114 entries, 0 to 38113
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  38114 non-null  int64 
 1   id          38114 non-null  int64 
 2   text        38114 non-null  object
dtypes: int64(2), object(1)
memory usage: 893.4+ KB


In [5]:
doc_df.index = doc_df['id']

In [6]:
doc_df.head()

Unnamed: 0_level_0,Unnamed: 0,id,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,21640,1,Хорватия Хорватия EVA RU Вход Европа Черногори...
2,21950,2,Три недели в Европе Начало Три недели в Европе...
3,21172,3,Как правильно сделать визу в Польшу шенген Q A...
4,20002,4,Новости Новости О нас Отрасли Страны Новости О...
5,20285,5,Аллергия на Магические Грибы fb2 КулЛиб Классн...


{term1: [[doc_id1, term_freq1], [doc_id2, term_freq2], ...], term2: ...}

In [7]:
stemmer = SnowballStemmer("russian")

# for id, doc in tqdm(doc_df.iterrows(), total=38114, position=0):
def create_index(doc_row):
    id, doc = doc_row
    loc_index = {}
    terms = doc['text'].split()
    terms = [t.lower() for t in terms]
    terms = [t for t in terms if not t in stop_words]
    terms = [stemmer.stem(term) for term in terms]
    for term in terms:
        if term in loc_index.keys():
            doc_id, num = loc_index[term][-1]
            if doc_id == id:
                loc_index[term][-1][1] += 1
            else:
                loc_index[term].append([id, 1])
        else:
            loc_index[term] = [[id, 1]]
            
    # computing tf
    for term in terms:
        loc_index[term][-1][1] /= len(terms)
    return loc_index

In [8]:
mutex = Lock()
n_processed = Value('i', 0)

def func_wrapper(doc_row):
    res = create_index(doc_row) 
    with mutex:
        # в этом блоке можно безопасно менять общие объекты для процессов
        global n_processed
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return res

with Pool(processes=12) as pool:
    res = pool.map(func_wrapper, doc_df.loc[30000:].iterrows())

8110 objects are processed...

In [9]:
# unite  indexes

index = {}

for i in res:
    for key, value in i.items():
        if key in index:
            index[key] += value
        else:
            index[key] = value

In [10]:
with open('index4.json', 'w') as f:
    json.dump(index, f)

In [2]:
with open('index1.json', 'r') as f:
    index = json.load(f)

In [3]:
for i in range(2, 5):
    with open('index{}.json'.format(i), 'r') as f:
        index2 = json.load(f)
    for key, value in index2.items():
        if key in index:
            index[key] += value
        else:
            index[key] = value

In [4]:
with open('index.json', 'w') as f:
    json.dump(index, f)

### Make result

In [None]:
with open('index.json', 'r') as f:
    index = json.load(f)

In [6]:
queries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       399 non-null    int64 
 1   1       399 non-null    object
dtypes: int64(1), object(1)
memory usage: 6.4+ KB


In [7]:
QueryId = []

for i in range(1, 400):
    QueryId += [i] * 10
    
# size = 10 * 399

* term-frequency

$tf = \frac{n_t}{\sum{n_k}}$

$n_t$ - число вхождений слова t в документ

$\sum{n_k}$ - общее число слов в данном документе

* inverse document frequency

$idf = log \frac{|D|}{|d_i : t \in d_i|}$ 

$|D|$ - число документов в коллекции

$|d_i : t \in d_i|$ - число документов из коллекции D, в которых встречается t

text[0] - QueryId

text[1] - queries text

docs_list = [[doc_id1, term_freq1], [doc_id2, term_freq2], ...]

In [10]:
stemmer = SnowballStemmer("russian") 

D = len(index)
docsId = np.ones(10 * 399, dtype=int)

for id, text in tqdm(queries.iterrows(), total=399, position=0):
    words = text[1].split()
    words = [w for w in words if not w in stop_words]
    words = [stemmer.stem(word) for word in words]
    
    words_info = [] # list of lists with docs numbers and number of word entries
    
    for i in range(len(words)):
        word = words[i]
        if word in index.keys():
            words_info.append(index[word])
        else:
            words_info.append([])
    # id of all docs which contain one of query's word
    all_docs = list(set([doc_info[0] for word_info in words_info 
                         for doc_info in word_info]))
    # word's counter for each document
    docs_cnt = {el:0 for el in all_docs}
    
    # computing sum tf-idf for each document
    for word_info in words_info:
        if word_info:
            idf = np.log(D / len(word_info))
        else:
            idf = 0
        for doc_info in word_info: # doc_info = [doc_id, term_freq]
            docs_cnt[doc_info[0]] += doc_info[1] * idf

    # sort docs by it's frequency for query's words
    docs_cnt = [item for item in docs_cnt.items()]
    docs_cnt = sorted(docs_cnt, key=lambda x: x[1], reverse=True) # [(doc_id1, freq1), (doc_id2, freq2), ...]
    i = 0
    
    # docsId[id - 1 + i]
    for item in docs_cnt: # item[0] = doc_id
        docsId[(text[0] - 1) * 10 + i] = item[0]
        i += 1
        if i == 10:
            break

100%|██████████| 399/399 [00:14<00:00, 28.17it/s]


In [11]:
result_df = pd.DataFrame({
    'QueryId' : QueryId,
    'DocumentId' : docsId
})

In [12]:
result_df.head()

Unnamed: 0,QueryId,DocumentId
0,1,34821
1,1,25494
2,1,2118
3,1,1808
4,1,21108


In [13]:
result_df.to_csv('result.txt', index=False)