In [1]:
from tqdm import tqdm
import pandas as pd
import os
import gzip
import zipfile
import shutil
from time import sleep
import pymorphy2
from pyaspeller import YandexSpeller
from lru import LRU
import pickle
from time import sleep
from concurrent.futures import ThreadPoolExecutor
import tarfile
import bz2
import sys
from rank_bm25 import BM25Plus, BM25L, BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the pre-built query dictionary. Later cells index it by QueryId and call
# .split() on the value, so it presumably maps query id -> cleaned query text
# (TODO confirm against the notebook that produced the pickle).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('query_dict/id_querry_clean.pickle', 'rb') as query_dict_file:
    id_querry_clean = pickle.load(query_dict_file)

In [3]:
# Training marks: tab-separated file without a header row. Columns 0 and 1 are
# renamed to QueryId / DocumentId and column 2 is dropped.
df_marks = (
    pd.read_csv('train.marks.tsv', delimiter='\t', header=None)
    .rename(columns={0: "QueryId", 1: "DocumentId"})
    .drop(columns=[2])
)

# Sample file; presumably it carries the same QueryId / DocumentId columns
# (TODO confirm), since both frames are stacked and sorted on them below.
df_example = pd.read_csv('sample.csv')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two frames the same way and works on every version.
# reset_index(drop=True) replaces the reset_index() + drop(columns=['index']) pairs.
all_groups = (
    pd.concat([df_marks, df_example])
    .sort_values(by=['QueryId', 'DocumentId'])
    .reset_index(drop=True)
)

# Same rows ordered by document first, then query.
rev_frame = (
    all_groups
    .sort_values(by=['DocumentId', 'QueryId'])
    .reset_index(drop=True)
)

In [4]:
# Display the combined QueryId / DocumentId frame (the notebook renders a truncated preview).
all_groups

Unnamed: 0,QueryId,DocumentId
0,0,1443
1,0,5912
2,0,5963
3,0,6096
4,0,6230
...,...,...
606045,6310,497350
606046,6310,512263
606047,6310,529857
606048,6310,551291


In [8]:
# One empty score list per (BM25 variant, document-truncation) feature.
# Each key gets its own list object; insertion order matches the listing below.
doc_classic_feats = {
    feature_name: []
    for feature_name in (
        'BM25Plus_all', 'BM25L_all', 'BM25Okapi_all',
        'BM25Plus_500', 'BM25L_500', 'BM25Okapi_500',
        'BM25Plus_1000', 'BM25L_1000', 'BM25Okapi_1000',
    )
}

In [9]:
def calc_cos_sim(vect, corp, query):
    """Cosine similarity between a query and every document of a corpus.

    Args:
        vect: vectorizer object exposing ``fit_transform`` and ``transform``
            (presumably a ``TfidfVectorizer`` -- confirm at the call site).
        corp: collection of documents the vectorizer is fitted on.
        query: a single query; it is wrapped in a one-element list before
            being transformed with the fitted vectorizer.

    Returns:
        list: the flattened ``cosine_similarity`` result between the corpus
        matrix and the query matrix (presumably one value per document in
        ``corp``, in corpus order).

    Note:
        ``vect`` is re-fitted on ``corp`` on every call.
    """
    doc_matrix = vect.fit_transform(corp)
    query_matrix = vect.transform([query])
    similarities = cosine_similarity(doc_matrix, query_matrix)
    return list(similarities.flatten())

In [10]:
def _read_doc_text(query_id, doc_id):
    """Return the lower-cased first tab-separated field of a cleaned document.

    The file ``clean_doc_collection/<query_id>/<doc_id>.gz`` is read line by
    line; only the LAST line survives the loop, and its first tab-separated
    field is returned. An empty file yields ``''``.
    NOTE(review): presumably every file holds exactly one line -- confirm,
    otherwise all earlier lines are silently ignored.
    """
    with gzip.open("clean_doc_collection/{}/{}.gz".format(query_id, doc_id)) as doc_file:
        fields = ['']
        for raw_line in doc_file:
            fields = raw_line.decode('utf-8', errors='ignore').lower().strip('\n').split('\t')
    return fields[0]


# For every query: build a corpus from the documents paired with it, fit the
# three BM25 variants on the full / first-500-token / first-1000-token versions
# of that corpus, and append the per-document scores of the query to the
# matching list in doc_classic_feats.
#
# groupby(sort=False) walks the queries in order of first appearance (the same
# order as .unique()) and keeps the row order inside each group, so the scores
# are appended in all_groups row order without re-masking the whole frame for
# every query.
q_ids = all_groups['QueryId'].unique()
for q1, query_rows in tqdm(all_groups.groupby('QueryId', sort=False), total=len(q_ids)):
    corpus_tokens = [
        _read_doc_text(q1, doc_id).split()
        for doc_id in query_rows['DocumentId'].values
    ]
    query_tokens = id_querry_clean[q1].split()

    # Score everything for this query first and extend the shared lists only
    # afterwards, so a failure mid-query cannot leave the lists misaligned.
    query_scores = {}
    for suffix, token_limit in (('all', None), ('500', 500), ('1000', 1000)):
        # tokens[:None] keeps the whole document.
        truncated_corpus = [tokens[:token_limit] for tokens in corpus_tokens]
        for model_name, model_cls in (('BM25Plus', BM25Plus),
                                      ('BM25L', BM25L),
                                      ('BM25Okapi', BM25Okapi)):
            model = model_cls(truncated_corpus)
            feature_key = '{}_{}'.format(model_name, suffix)
            query_scores[feature_key] = list(model.get_scores(query_tokens))

    for feature_key, scores in query_scores.items():
        doc_classic_feats[feature_key].extend(scores)

# The lists are only appended to, so running this cell twice silently doubles
# them. Fail loudly instead of saving features that no longer line up with
# the rows of all_groups.
for feature_key, scores in doc_classic_feats.items():
    assert len(scores) == len(all_groups), (
        "{} has {} scores for {} rows; re-create doc_classic_feats and re-run".format(
            feature_key, len(scores), len(all_groups))
    )

100%|██████████| 6311/6311 [3:53:32<00:00,  2.22s/it]   


In [12]:
# Persist the BM25 feature dictionary (feature name -> list of scores) to disk.
with open('bm_feats_upd.pickle', 'wb') as feats_file:
    pickle.dump(doc_classic_feats, feats_file)