In [1]:
from tqdm import tqdm
import pandas as pd
import os
import gzip
import shutil
from time import sleep
import pymorphy2
from pyaspeller import YandexSpeller
from lru import LRU
import pickle
from time import sleep
from concurrent.futures import ThreadPoolExecutor
import tarfile
import bz2

In [2]:
# Load the judged (query, document) pairs and the sample-submission pairs,
# then build two orderings of their union.
df_marks = pd.read_csv('train.marks.tsv', delimiter='\t')
# NOTE(review): the column names "1", "2", "135041" look like a data row that
# was consumed as the header, which would drop that record — confirm.
df_marks = df_marks.rename(columns={"2": "QueryId", "135041": "DocumentId"})
df_marks = df_marks.drop(columns=['1'])
df_example = pd.read_csv('sample.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
all_groups = pd.concat([df_marks, df_example])
# Sorted by query first, with a fresh 0..n-1 index.
all_groups = all_groups.sort_values(by=['QueryId', 'DocumentId']).reset_index(drop=True)
# Same rows sorted by document first, for the reverse (document -> queries) mapping.
rev_frame = all_groups.sort_values(by=['DocumentId', 'QueryId']).reset_index(drop=True)

In [3]:
# Group document ids by query id, preserving the row order of all_groups.
query = all_groups['QueryId'].values
doc = all_groups['DocumentId'].values
q_to_doc = {}
for query_id, doc_id in tqdm(zip(query, doc), total=query.shape[0]):
    q_to_doc.setdefault(query_id, []).append(doc_id)

100%|██████████| 606049/606049 [00:00<00:00, 1258008.69it/s]


In [4]:
# Group query ids by document id, preserving the row order of rev_frame.
query_r = rev_frame['QueryId'].values
doc_r = rev_frame['DocumentId'].values
doc_to_q = {}
for doc_id, query_id in tqdm(zip(doc_r, query_r), total=query_r.shape[0]):
    doc_to_q.setdefault(doc_id, []).append(query_id)

100%|██████████| 606049/606049 [00:00<00:00, 688981.47it/s]


In [5]:
# Root output folders. exist_ok=True keeps the cell re-runnable: without it a
# second run raises FileExistsError.
os.makedirs('clean_title_collection', exist_ok=True)
os.makedirs('norm_title_collection', exist_ok=True)

In [6]:
# One sub-folder per distinct query id inside each collection folder.
# exist_ok=True makes the cell idempotent, so re-running it does not raise
# FileExistsError for folders created by a previous run.
q_ind = all_groups['QueryId'].unique()
for i in tqdm(q_ind):
    os.makedirs(os.path.join('norm_title_collection', str(i)), exist_ok=True)
    os.makedirs(os.path.join('clean_title_collection', str(i)), exist_ok=True)

100%|██████████| 6311/6311 [00:02<00:00, 2928.05it/s]


In [7]:
# Maps integer document id -> title; filled by the cells below.
title_data = {}

In [11]:
# Collect the lower-cased title of every record in the archive, keyed by the
# integer id found in the first field.
with gzip.open("docs.tsv.gz") as file:
    for line in tqdm(file):
        # Each record has three tab-separated fields: id, title and a third
        # field that is not used here. maxsplit=2 keeps any tabs inside that
        # third field from breaking the unpacking, and only the title is
        # lower-cased instead of the whole line.
        doc_id, title, _unused = (
            line.decode('utf-8', errors='ignore').strip('\n').split('\t', 2)
        )
        title_data[int(doc_id)] = title.lower()

582167it [15:15, 636.06it/s]  


In [12]:
# Persist the id -> title dictionary so later sessions can skip the archive scan.
with open('title_data.pickle', 'wb') as pickle_out:
    pickle.dump(title_data, pickle_out)

In [4]:
# Spot-check: display the stored title for document id 135041.
title_data[135041]

'наша деятельность'

In [9]:
# Flatten the title dictionary into a list of (id, title) pairs.
raw_data = [(doc_id, title) for doc_id, title in tqdm(title_data.items())]

100%|██████████| 582167/582167 [00:00<00:00, 1916356.76it/s]


In [10]:
def check(x):
    """Spell-correct the text of one ``(key, text)`` pair.

    A fresh ``YandexSpeller`` is created for every attempt. If the first
    attempt raises, the function waits one second and tries exactly once more;
    an exception from the second attempt propagates to the caller.

    Args:
        x: Indexable pair whose item 0 is the key and item 1 is the text.

    Returns:
        A tuple ``(x[0], corrected_text)``.
    """
    try:
        spellchecker2 = YandexSpeller()
        a = spellchecker2.spelled(x[1])
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still stop a long-running batch.
        sleep(1)
        spellchecker2 = YandexSpeller()
        a = spellchecker2.spelled(x[1])
    return (x[0], a)

In [11]:
# Spell-check every (id, title) pair on 7 worker threads; executor.map keeps
# the results in input order. The progress-bar total is taken from the data
# instead of a hard-coded record count.
with ThreadPoolExecutor(7) as executor:
    results = list(tqdm(executor.map(check, raw_data), total=len(raw_data)))

100%|██████████| 582167/582167 [10:13:45<00:00, 15.81it/s]   


In [12]:
# Maps key -> spell-corrected title; filled from `results` below.
spelled_title_data = {}

In [13]:
# Store each (key, corrected title) result under its key.
for doc_id, spelled_title in tqdm(results):
    spelled_title_data[doc_id] = spelled_title

100%|██████████| 582167/582167 [00:00<00:00, 1672266.75it/s]


In [14]:
# Persist the spell-corrected titles.
with open('spelled_title_data.pickle', 'wb') as pickle_out:
    pickle.dump(spelled_title_data, pickle_out)

In [6]:
# Spot-check: display the spell-corrected title stored under key 135041.
spelled_title_data[135041]

'наша деятельность'

In [3]:
# Prepare the spell-corrected titles for lemmatisation.
# The source is `spelled_title_data`, the dictionary built earlier in the
# notebook; the previous `tmp` is not defined at this point on a fresh kernel.
spell_morph_title = dict()
raw_data = []
for key in tqdm(spelled_title_data):
    raw_data.append((key, spelled_title_data[key]))

100%|██████████| 582167/582167 [00:00<00:00, 1748581.30it/s]


In [5]:
morph = pymorphy2.MorphAnalyzer()


def pymorphy_tokenizer(text):
    """Return `text` with each whitespace-separated word replaced by its normal form.

    The normal form is taken from the first parse that ``morph.parse`` returns
    for the word; the results are joined with single spaces.
    """
    return ' '.join(morph.parse(token)[0].normal_form for token in text.split())

In [6]:
# Lemmatise every text in raw_data and store it under its key.
for doc_id, text in tqdm(raw_data):
    spell_morph_title[doc_id] = pymorphy_tokenizer(text)

100%|██████████| 582167/582167 [18:11<00:00, 533.45it/s] 


In [7]:
# Persist the lemmatised titles held in spell_morph_title.
with open('clean_title_data.pickle', 'wb') as pickle_out:
    pickle.dump(spell_morph_title, pickle_out)

In [8]:
# Spot-check the lemmatised title for key 135041. It is looked up in
# spell_morph_title because `tmp` is not defined at this point on a fresh kernel.
spell_morph_title[135041]

'наш деятельность'

In [9]:
# Start an empty result dictionary and flatten the raw titles into (id, title) pairs.
norm_title = {}
raw_data = [(doc_id, title) for doc_id, title in tqdm(title_data.items())]

100%|██████████| 582167/582167 [00:00<00:00, 1398716.40it/s]


In [10]:
# Lemmatise every text in raw_data and store it under its key.
for doc_id, text in tqdm(raw_data):
    norm_title[doc_id] = pymorphy_tokenizer(text)

100%|██████████| 582167/582167 [18:19<00:00, 529.48it/s] 


In [11]:
# Persist the lemmatised titles held in norm_title.
with open('norm_title_data.pickle', 'wb') as pickle_out:
    pickle.dump(norm_title, pickle_out)

In [9]:
# Reload a pickled title dictionary for inspection.
# NOTE(review): this reads from 'title_dict/', whereas the dump cell writes
# 'norm_title_data.pickle' to the working directory — confirm the file was moved.
with open('title_dict/norm_title_data.pickle', 'rb') as pickle_in:
    tmp = pickle.load(pickle_in)

In [10]:
# Spot-check: display the reloaded entry for key 135041.
tmp[135041]

'наш деятельность'

In [None]:
# The titles all look fine.

In [12]:
# Read the URLs and site ids into two lookup tables (id -> url and url -> id).
id_url = {}
url_id = {}
with open("url.data", "r") as file:
    for record in tqdm(file):
        raw_id, url = record.strip('\n').split('\t')
        site_id = int(raw_id)
        id_url[site_id] = url
        url_id[url] = site_id

582167it [00:00, 607553.63it/s]


In [26]:
# Persist the id -> url table.
with open('id_url.pickle', 'wb') as pickle_out:
    pickle.dump(id_url, pickle_out)

In [27]:
# Persist the url -> id table.
with open('url_id.pickle', 'wb') as pickle_out:
    pickle.dump(url_id, pickle_out)

In [13]:
# Load the header-less, tab-separated query table.
df_q = pd.read_csv('queries.tsv', sep='\t', header=None)

In [20]:
# Build both directions of the query lookup from columns 0 and 1 of the
# header-less query table; for repeated keys the last row wins.
df_q = pd.read_csv('queries.tsv', sep='\t', header=None)
ids = df_q[0].values
querrys = df_q[1].values
id_querry = dict(zip(ids, querrys))
querry_id = dict(zip(querrys, ids))

In [21]:
# Persist both query lookup tables, one pickle file each.
for pickle_name, mapping in (
    ('id_querry.pickle', id_querry),
    ('querry_id.pickle', querry_id),
):
    with open(pickle_name, 'wb') as pickle_out:
        pickle.dump(mapping, pickle_out)

In [24]:
# Shared speller and morphological analyser for the query-processing cells.
spellchecker2 = YandexSpeller()
morph = pymorphy2.MorphAnalyzer()


def pymorphy_tokenizer(text):
    """Lemmatise `text` word by word.

    Splits on whitespace, replaces every word with the normal form of the
    first parse returned by ``morph.parse``, and joins the words with spaces.
    """
    lemmas = [morph.parse(token)[0].normal_form for token in text.split()]
    return ' '.join(lemmas)

In [25]:
# For every query build: the spell-corrected text, the reverse lookup from
# corrected text to id, the lemmatised corrected text ("clean"), and the
# lemmatised original text ("norm").
id_querry_spelled = {}
id_querry_clean = {}
id_querry_norm = {}
querry_id_spelled = {}
for query_id, raw_query in tqdm(id_querry.items()):
    spelled_query = spellchecker2.spelled(raw_query)
    id_querry_spelled[query_id] = spelled_query
    querry_id_spelled[spelled_query] = query_id
    id_querry_clean[query_id] = pymorphy_tokenizer(spelled_query)
    id_querry_norm[query_id] = pymorphy_tokenizer(raw_query)

100%|██████████| 6311/6311 [40:15<00:00,  2.61it/s] 


In [26]:
# Persist the four query dictionaries, one pickle file each.
for pickle_name, mapping in (
    ('id_querry_spelled.pickle', id_querry_spelled),
    ('id_querry_clean.pickle', id_querry_clean),
    ('id_querry_norm.pickle', id_querry_norm),
    ('querry_id_spelled.pickle', querry_id_spelled),
):
    with open(pickle_name, 'wb') as pickle_out:
        pickle.dump(mapping, pickle_out)

In [29]:
# Reload the original and spell-corrected query dictionaries from disk.
with open('id_querry.pickle', 'rb') as pickle_in:
    id_querry = pickle.load(pickle_in)
with open('id_querry_spelled.pickle', 'rb') as pickle_in:
    id_querry_spelled = pickle.load(pickle_in)

In [30]:
# Map every query text — original and spell-corrected — to its query id.
# When the two texts are equal the second assignment rewrites the same entry,
# so no branch is needed.
enlarged_querrys = {}
for query_id, raw_query in tqdm(id_querry.items()):
    enlarged_querrys[raw_query] = query_id
    enlarged_querrys[id_querry_spelled[query_id]] = query_id

100%|██████████| 6311/6311 [00:00<00:00, 839312.97it/s]


In [31]:
# Persist the text -> query id lookup.
with open('enlarged_querrys.pickle', 'wb') as pickle_out:
    pickle.dump(enlarged_querrys, pickle_out)

In [10]:
# Read the data from the gzip file: fill the raw / lemmatised / spell-corrected
# title dictionaries and, for every document that appears in doc_to_q, write its
# titles into the folder of each query it is paired with.
def _write_title_copies(collection_dir, doc_id, payload, query_ids):
    """Write `payload` gzip-compressed as `<doc_id>.gz` into `collection_dir/<query_id>/` for each query id."""
    for query_id in query_ids:
        target = os.path.join(collection_dir, str(query_id), doc_id + '.gz')
        with gzip.open(target, 'wb') as f:
            f.write(payload)


# These two dictionaries are pickled by a later cell but were never created.
norm_title_data = dict()
clean_title_data = dict()
with gzip.open("docs.tsv.gz") as file:
    for line in tqdm(file):
        # Three tab-separated fields: id, title and a third field unused here.
        # maxsplit=2 tolerates tabs inside that third field.
        a, b, _unused = line.decode('utf-8', errors='ignore').strip('\n').split('\t', 2)
        doc_id = int(a)
        title = b.lower()
        title_data[doc_id] = title

        # pymorphy_tokenizer takes a string and returns the space-joined normal
        # forms, so the title is passed as-is rather than pre-split into a list.
        norm_title = pymorphy_tokenizer(title)
        norm_title_data[doc_id] = norm_title

        # "Clean" title: spell-correct first, then lemmatise. Uses spellchecker2,
        # the speller instance the notebook defines.
        clean_title = pymorphy_tokenizer(spellchecker2.spelled(title))
        clean_title_data[doc_id] = clean_title

        if doc_id in doc_to_q:
            q_list = doc_to_q[doc_id]
            # Write straight into each query folder. This removes the temporary
            # file in the working directory together with the copy / remove /
            # sleep-and-retry steps that file required.
            _write_title_copies('norm_title_collection', a, norm_title.encode('utf-8'), q_list)
            _write_title_copies('clean_title_collection', a, clean_title.encode('utf-8'), q_list)

1442it [09:35,  2.50it/s]


KeyboardInterrupt: 

In [None]:
# Persist the three title dictionaries, one pickle file each.
for pickle_name, mapping in (
    ('title_data.pickle', title_data),
    ('clean_title_data.pickle', clean_title_data),
    ('norm_title_data.pickle', norm_title_data),
):
    with open(pickle_name, 'wb') as pickle_out:
        pickle.dump(mapping, pickle_out)