<a href="https://colab.research.google.com/github/MitPitt/IR/blob/master/TextRelevancePiterkin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install stanza
!pip install spacy_stanza
!pip install pymorphy2==0.8
!pip install transliterate
!pip install googletrans
!pip install yandex-translater
!pip install pyaspeller

In [0]:
import googletrans
import yandex.Translater
import pyaspeller
from transliterate import translit
import stanza
from spacy_stanza import StanzaLanguage

stanza.download('ru') 

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 116kB [00:00, 10.3MB/s]                    
2020-05-20 01:03:46 INFO: Downloading default packages for language: ru (Russian)...
Downloading http://nlp.stanford.edu/software/stanza/1.0.0/ru/default.zip: 100%|██████████| 591M/591M [01:47<00:00, 5.52MB/s]
2020-05-20 01:05:42 INFO: Finished downloading models and saved to /root/stanza_resources.


In [0]:
class QueryProcessor():
    def __init__(self):
        # layouts
        self._rus_chars = "ёйцукенгшщзхъфывапролджэячсмитьбю.Ё!\"№;%:?*()_+ЙЦУКЕНГШЩЗХЪ/ФЫВАПРОЛДЖЭЯЧСМИТЬБЮ," # note \"
        self._eng_chars = "`qwertyuiop[]asdfghjkl;'zxcvbnm,./~!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?" # note \"
        self._trans_ru2en = str.maketrans(self._rus_chars, self._eng_chars)
        self._trans_en2ru = str.maketrans(self._eng_chars, self._rus_chars)
        #translator 1
        self.yandex_tr = yandex.Translater.Translater()
        self.yandex_tr.set_key('trnsl.1.1.20200508T141638Z.ad4b0fa3731d6994.9453309fdac5d82ddf65137156a065683eeb4fe9')
        #translator 2
        self.google_tr = googletrans.Translator()
        #spellchecker
        self.spellchecker = pyaspeller.YandexSpeller(lang=['ru','en'], ignore_capitalization=True, ignore_urls=True, ignore_digits=True,)
        #lemmatizer
        self.nlp = StanzaLanguage(stanza.Pipeline(lang="ru"))

    def _check_layout(self, query):
        det = self.google_tr.detect(query)
        query_ru = query.translate(self._trans_en2ru)
        det_ru = self.google_tr.detect(query_ru)
        #print(det_ru.confidence, det.confidence)
        if det.lang!='ru' and det.lang!='uk' and det_ru.confidence > det.confidence:
            query = query_ru 
        return query

    def _translate(self, query):
        alternatives = []
        if self.google_tr.detect(query).lang != 'ru':
            self.yandex_tr.set_text(query)
            self.yandex_tr.set_from_lang(self.yandex_tr.detect_lang())
            self.yandex_tr.set_to_lang('ru')
            alternatives.append(self.yandex_tr.translate().lower())
            #gtr = self.google_tr.translate(query, dest='ru').text.lower()
            #if gtr not in alternatives:
            #    alternatives.append(gtr)
        else:
            alternatives.append(query)
        return alternatives

    def _spellcheck(self, query):
        for error in self.spellchecker.spell(query):
            #print(error)
            if len(error['s'])>0:
                query = query.replace(error['word'], error['s'][0], 1)
            else:
                word = translit(error['word'], language_code='ru', reversed=True)
                try:
                    www = next(self.spellchecker.spell(word))['s']
                    if len(www) > 0:
                        query = query.replace(error['word'], www[0], 1)
                except StopIteration:
                    pass
        return query

    def _alts(self, query, swaplang='en'):
        """
        get synonymous queries
        """
        self.yandex_tr.set_text(query)
        self.yandex_tr.set_from_lang('ru')
        self.yandex_tr.set_to_lang(swaplang)
        self.yandex_tr.set_text(self.yandex_tr.translate())
        self.yandex_tr.set_from_lang(swaplang)
        self.yandex_tr.set_to_lang('ru')
        alts = [
                query,
                self.yandex_tr.translate().lower(),
                self.google_tr.translate((self.google_tr.translate(query, dest=swaplang).text), dest='ru').text.lower(),
                ]
        return alts

    def _lemmatize(self, query):
        lemmas = self.nlp(query)
        #for word in lemmas:
        #    print(word.lemma_, word.pos_)
        query = [word.lemma_ for word in lemmas if (word.pos_ != 'PUNCT' and word.pos_ !='ADP' and word.pos_ !='ADV' and word.pos_ !='PART' and word.pos_ !='CCONJ')] # delete punctuation and stopwords
        return query

    def process(self, query):
        query = self._check_layout(query)
        alternatives = self._translate(query)
        alternatives = [self._spellcheck(query) for query in alternatives]
        #print(alternatives)
        newalts = []
        languages = ['en'] # ,es,de
        for lang in languages:
            alternatives_trick =  [self._alts(query, swaplang=lang) for query in alternatives]
            alternatives_trick = {item for sublist in alternatives_trick for item in sublist} #flatten and remove dupes
            #print(alternatives_trick)
            newalts += list(alternatives_trick)
        alternatives = newalts
        #print(alternatives)
        alternatives = [self._lemmatize(query) for query in alternatives]
        #query = {item for sublist in alternatives for item in sublist} # merge all uqique lemmas into one query
        #return ' '.join(query)
        return alternatives

In [0]:
Processor = QueryProcessor()

2020-05-20 01:05:49 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |
| ner       | wikiner   |

2020-05-20 01:05:49 INFO: Use device: cpu
2020-05-20 01:05:49 INFO: Loading: tokenize
2020-05-20 01:05:49 INFO: Loading: pos
2020-05-20 01:05:50 INFO: Loading: lemma
2020-05-20 01:05:50 INFO: Loading: depparse
2020-05-20 01:05:51 INFO: Loading: ner
2020-05-20 01:05:53 INFO: Done loading processors!


In [0]:
Processor.process('как файл пдф перевести в бмп') # странно лемматизирует "пдф" но это норм, в документах лемматизирует так же

[['как', 'файл', 'пдф', 'перевести', 'бмп'],
 ['как', 'файл', 'п+f', 'конвертировать', 'bmp'],
 ['как', 'файл', 'формат', 'п+f', 'конвертировать', 'bmp']]

In [0]:
i=0

with open('/content/drive/My Drive/text relevance/lemmatized_queries.txt', mode='a') as  lemqueries:
    with open('/content/drive/My Drive/text relevance/queries.numerate.txt', encoding='utf8') as queries:
        for line in queries:
            query_id, query = line.strip().split('\t')
            print("({0})\t{1}".format(query_id, query))
            lemd = Processor.process(query)
            print(lemd)
            for lem in lemd:
                lemqueries.write(query_id +'\t'+ ' '.join(lem) +'\n')
            i+=1
i

In [0]:
nlp = StanzaLanguage(stanza.Pipeline(lang="ru", use_gpu=True, processors='tokenize,pos,lemma'))

2020-05-18 10:43:17 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |

2020-05-18 10:43:17 INFO: Use device: gpu
2020-05-18 10:43:17 INFO: Loading: tokenize
2020-05-18 10:43:27 INFO: Loading: pos
2020-05-18 10:43:28 INFO: Loading: lemma
2020-05-18 10:43:29 INFO: Done loading processors!


In [0]:
for i in nlp('куда поехать отдыхать'):
  print(i.lemma_, i.pos_)

куда ADV
поехать VERB
отдыхать VERB


In [0]:
from bs4 import BeautifulSoup
import re
import warnings

regex = re.compile('[^а-яА-Яa-zA-Z0-9]')

def parse_html(filepath):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with open(filepath, "rb") as f:
            
            contents = f.read()

            soup = BeautifulSoup(contents, 'html.parser')

            try:
                title = soup.title.text.lower()
            except AttributeError:
                return [''], ['']
            
            data = soup.findAll(text=re.compile('[а-яА-Я]'))
            def visible(element):
                #print(str(element.encode('utf-8')))
                if element.parent.name in ['style', 'script', '[document]']:
                    return False
                elif re.search('<.*>', str(element.encode('utf-8',errors='surrogatepass'))):
                    return False
                return True
            other_text = filter(visible, data)
            other_text = [x.strip() for x in other_text]
            other_textt = []
            for x in other_text:
                x = regex.sub(' ', x).strip()
                if len(x)>1:
                    other_textt.extend([word.lemma_.lower() for word in nlp(x) if len(word.lemma_)>1 and (word.pos_ != 'PUNCT' and word.pos_ !='ADP' and word.pos_ !='ADV' and word.pos_ !='PART' and word.pos_ !='CCONJ')])
            
            
            title = [word.lemma_.lower() for word in nlp(title) if len(word.lemma_)>1 and (word.pos_ != 'PUNCT' and word.pos_ !='ADP' and word.pos_ !='ADV' and word.pos_ !='PART' and word.pos_ !='CCONJ')]
    return title, other_textt

In [0]:
parse_html("/content/drive/My Drive/text relevance/content/content/20170702/doc.0000.dat")

In [0]:
import os
import os.path

In [0]:
import pandas as pd


id_to_url = pd.read_csv("/content/drive/My Drive/text relevance/urls.numerate.txt", sep='\t', encoding='utf-8', header=None, names=['id','url','lemmatized_text_path'])
id_to_url.head()

Unnamed: 0,id,url,lemmatized_text_path
0,1,eva.ru/forum/topic-tree-mobile.htm?messageId=8...,
1,2,e1.ru/articles/travel/page_4/009/994/article_9...,
2,3,bikepost.ru/qa/post/9065/Kak-pravilno-sdelat-v...,
3,4,rutax.polpred.com/?ns=1&page=2,
4,5,coollib.com/b/259649/read,


In [0]:
i=0
for root, dirs, files in os.walk("/content/drive/My Drive/text relevance/content/content"):
  if root == '/content/drive/My Drive/text relevance/content/content/20170711':
    for filee in files:
        i+=1
        filepath = root+'/'+filee

        lemmatized_path = root + '_lem/'+ filee + '.lemmatized.txt'
        
        #if os.path.getsize(filepath) > 1000000:
        #  continue

        if not os.path.isfile(lemmatized_path):
          
            with open(filepath, 'rb') as f:
                url = f.readline().rstrip()
            print(i)
            print('URL:', url)
            print(lemmatized_path)
            lemmatized_title, lemmatized_text = parse_html(filepath)
            print('title:', ' '.join(lemmatized_title))
            with open(lemmatized_path, 'w') as the_file:
                the_file.write(' '.join(lemmatized_title) + '\n' + ' '.join(lemmatized_text))
        else:
            continue

        id_to_url.loc[id_to_url['url'] == url, 'lemmatized_text_path'] = lemmatized_path

sss

In [0]:
with open('/content/drive/My Drive/text relevance/content/content/20170702/doc.4084.dat.lemmatized.txt') as fil:
    for line in fil:
      print(line)

нестандартный династия вопрос предложение [архив страница prosims новость обзор дополнение файл код объект скина скриншот the sims the sims сима форева

prosims новость обзор дополнение файл код объект скина скриншот the sims the sims сима форева творчество династия нестандартный династия нестандартный династия вопрос предложение pda просмотр полный версия нестандартный династия вопрос предложение страница arnae 11.02.2010 22 14 ограничить способ учеба книга разговор профессор доз никакой комп курсовой фимка 11.02.2010 22 57 тьма ээть компа мочь быть этот время елен все изучать старинка развлечение быть муза инструмент книга электроника телефон быть крайний мера он звонить один раз когда выбирать специальность arnae 11.02.2010 23 45 фимка ой помнить универ казаться закон действовать отсутствие техника фимка 11.02.2010 23 47 тьма правило это нет отсутствие техника казаться логичный год революция хотя стоить бы внести это правило arnae 11.02.2010 23 50 фимка они рассмотреть отчим приезжа

In [0]:
id_to_url = pd.read_csv("/content/drive/My Drive/text relevance/urls.numerate.txt", sep='\t', encoding='utf-8', header=None, names=['id','url','lemmatized_text_path'])
id_to_url.head()

Unnamed: 0,id,url,lemmatized_text_path
0,1,eva.ru/forum/topic-tree-mobile.htm?messageId=8...,
1,2,e1.ru/articles/travel/page_4/009/994/article_9...,
2,3,bikepost.ru/qa/post/9065/Kak-pravilno-sdelat-v...,
3,4,rutax.polpred.com/?ns=1&page=2,
4,5,coollib.com/b/259649/read,


In [0]:
i=0
for root, dirs, files in os.walk("/content/drive/My Drive/text relevance/content/content"):
  if not '_lem' in root:
    for filee in files:
        i+=1
        if i %100 == 0:
            print(i)
        filepath = root+'/'+filee
        
        lemmatized_path = root + '_lem/'+ filee + '.lemmatized.txt'
        
        if os.path.isfile(lemmatized_path): # if lemmatized version exists
            with open(filepath, 'r', encoding='latin1') as f:
                url = f.readline().rstrip()
            id_to_url.loc[id_to_url['url'] == url, 'lemmatized_text_path'] = lemmatized_path
            #print(lemmatized_path, url)
        else:
            continue

In [0]:
import pandas as pd
id_to_url = pd.read_csv('/content/drive/My Drive/text relevance/main.csv', sep='\t', encoding='latin1')

In [0]:
id_to_url.to_csv('/content/drive/My Drive/text relevance/main.csv', sep='\t')


In [0]:
id_to_url.info()

In [0]:
id_to_url.head(5)

In [0]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
import nltk
from nltk.util import ngrams
text = ['cant', 'citadel', 'police']
bigrams = list(ngrams(text, 2)) 
print(' '.join(bigrams[0]))
print(bigrams)

cant citadel
[('cant', 'citadel'), ('citadel', 'police')]


In [0]:
from collections import Counter
import pandas as pd
import numpy as np
from collections import defaultdict
import nltk
from nltk.util import ngrams

class RelevantTexts():
    def __init__(self):
        self.query_to_docs = pd.read_csv('/content/drive/My Drive/text relevance/sample.technosphere.ir1.textrelevance.submission.txt')
        self.main = pd.read_csv('/content/drive/My Drive/text relevance/main.csv', sep='\t', encoding='latin1')
        
        self.mean_title_len = 0
        self.mean_doc_len = 0
        self.doc_num = 0
        self.idf_doc = defaultdict(float)
        self.idf_title = defaultdict(float)
        self.idf_doc_bi = defaultdict(float)
        self.idf_title_bi = defaultdict(float)
        self.k1 = 1.75
        self.b = 0.75
        self.title_coef = 1.5
        self.doc_coef = 1
        self.bigram_coef = 2
    
    def _relevance(self, query, doc): #bm25f ungram bigram
        title_score = 0
        doc_score = 0
        title_bigram_score = 0
        doc_bigram_score = 0
        with open(doc, 'r') as f:
            title = f.readline().strip()
            if title == '':
                return -1
            text = f.readline().strip()

            title_len = len(title.split())
            doc_len = len(text.split())
            
            for word in query:
                tf_title = title.count(word)
                tf_doc = text.count(word)

                title_score += self.idf_title[word] * (tf_title * (self.k1 + 1)) / (tf_title + self.k1 * (1 - self.b + self.b * title_len / self.mean_title_len))
                doc_score += self.idf_doc[word] * (tf_doc * (self.k1 + 1)) / (tf_doc + self.k1 * (1 - self.b + self.b * doc_len / self.mean_doc_len))
            
            bigrams = list(ngrams(query, 2))
            for bi in bigrams:
                tf_title = title.count(' '.join(bi))
                tf_doc = text.count(' '.join(bi))
                title_bigram_score += self.idf_title_bi[bi] * (tf_title * (self.k1 + 1)) / (tf_title + self.k1 * (1 - self.b + self.b * title_len / self.mean_title_len))
                doc_bigram_score += self.idf_doc_bi[bi] * (tf_doc * (self.k1 + 1)) / (tf_doc + self.k1 * (1 - self.b + self.b * doc_len / self.mean_doc_len))

        #print(title)
        #print('ungram scores for title and doc:', title_score, doc_score)
        #print('bigram scores for title and doc:', title_bigram_score, doc_bigram_score)
        #print()
        score = (self.doc_coef * doc_score + self.title_coef * title_score) + self.bigram_coef * (self.title_coef * title_bigram_score + 0.5*doc_bigram_score)
        return score

    def _doc_stats(self, doc_docs):
        sum_title_len = 0
        sum_doc_len = 0
        for doc in doc_docs:
            with open(doc, 'r') as f:
                title = f.readline().strip().split()
                if title != []:
                    self.doc_num += 1
                    text = f.readline().strip().split()
                    sum_title_len += len(title)
                    sum_doc_len += len(text)

                    words = set()
                    for word in text:
                      if not word in words:
                        self.idf_doc[word] += 1
                        words.add(word)

                    words = set()
                    bigrams = list(ngrams(text, 2))
                    for bi in bigrams:
                      if not bi in words:
                        self.idf_doc_bi[bi] += 1
                        words.add(bi)

                    words = set()
                    for word in title:
                      if not word in words:
                        self.idf_title[word] += 1
                        words.add(word)

                    words = set()
                    bigrams = list(ngrams(title, 2))
                    for bi in bigrams:
                      if not bi in words:
                        self.idf_title_bi[bi] += 1
                        words.add(bi)

        self.mean_title_len = sum_title_len / self.doc_num
        self.mean_doc_len = sum_doc_len / self.doc_num
        self._calculate_idf(self.idf_doc)
        self._calculate_idf(self.idf_title)
        self._calculate_idf(self.idf_doc_bi)
        self._calculate_idf(self.idf_title_bi)
        #print(self.idf_doc)
        #input()
        #print(self.idf_title)
        #input()
        #print(self.idf_doc_bi)
        #input()
        #print(self.idf_title_bi)
        #input()

    def _calculate_idf(self, some_idf):
        for word in some_idf.keys():
            idf = np.log( (self.doc_num - some_idf[word] +0.5) / (some_idf[word]) +0.5)
            if idf > 0:
                some_idf[word] = idf
            else:
                some_idf[word] = 0.000001

    def count_scores(self, query_id, query):
        progress = 0
        doc_ids_with_nans = list(self.query_to_docs.loc[self.query_to_docs.QueryId == query_id].DocumentId)
        doc_docs = []
        doc_ids = []
        for doc_id in doc_ids_with_nans:
            d = self.main.loc[self.main.id == doc_id].iloc[0].lemmatized_text_path
            if isinstance(d, str):
                doc_docs.append(d)
                doc_ids.append(doc_id)
        query = query.split()
        relevancies = {}

        self.mean_title_len = 0
        self.mean_doc_len = 0
        self.doc_num = 0
        self.idf_title = defaultdict(float)
        self.idf_doc = defaultdict(float)
        self.idf_doc_bi = defaultdict(float)
        self.idf_title_bi = defaultdict(float)
        self._doc_stats(doc_docs)
        #print(self.mean_title_len, self.mean_doc_len, self.doc_num)
        #input()
        #print(self.idf_title)
        #input()

        for doc, doc_id in zip(doc_docs, doc_ids):
            progress += 1
            rel = self._relevance(query, doc)
            relevancies[doc_id] = rel
        c = Counter(relevancies)
        #print(c)
        #return c.most_common(10)
        return c

    def see_docs(self, query_id):
        doc_ids_with_nans = list(self.query_to_docs.loc[self.query_to_docs.QueryId == query_id].DocumentId)
        doc_docs = []
        doc_ids = []
        for doc_id in doc_ids_with_nans:
            d = self.main.loc[self.main.id == doc_id].iloc[0].lemmatized_text_path
            if isinstance(d, str):
                doc_docs.append(d)
                doc_ids.append(doc_id)
        for doc in doc_docs:
            with open(doc, 'r') as f:
                title = f.readline().strip()
                print(doc[-41:], title)

In [0]:
q = RelevantTexts()

In [0]:
q.see_docs(159)

In [0]:
a = q.count_scores(159, 'увеличить предмет симс 4')
b = q.count_scores(159, 'увеличить предмет sims 4')
top = (a + b).most_common(10)

print(top)

for i, doc_id in enumerate(top):
    #print(q.main.loc[q.main.id == doc_id[0]].iloc[0].lemmatized_text_path[-41:])
    with open(q.main.loc[q.main.id == doc_id[0]].iloc[0].lemmatized_text_path, 'r') as f:
        title = f.readline().strip()
        print(i, doc_id[0], title)

[(15306, 99.18632706615082), (15335, 90.1278978233102), (15296, 70.54351839240063), (15360, 65.18344177169456), (15303, 64.15169229701168), (15340, 48.96807629401731), (15324, 43.57524117592218), (15382, 41.985036673343004), (15326, 41.2827941937404), (15371, 41.12074239621655)]
0 15306 увеличить предмет sims4
1 15335 ответыдmail.ru увеличить предмет sims4
2 15296 увеличить предмет thesims4 youtube
3 15360 увеличить предмет the sims
4 15303 увеличить предмет игра sims
5 15340 увеличение предмет симс youtube
6 15324 увеличивать предмет симс видео wikibit.me
7 15382 увеличить симс количество персонаж семья
8 15326 увеличивать предмет симс youtube
9 15371 увеличить любой вещь симс работа youtube


In [0]:
with open('/content/drive/My Drive/text relevance/submission.txt', 'w') as sub:
    sub.write('QueryId,DocumentId\n')
    for i in range(1,400):
        alts = []
        with open('/content/drive/My Drive/text relevance/lemmatized_queries.txt', 'r') as f:
            for line in f:
                query_id, query = line.strip().split('\t')
                if int(query_id) == i:
                    alts.append(query)
                    print(int(query_id), i, query)
        a = q.count_scores(i, alts[0])
        for alt in alts[1:]:
          b = q.count_scores(i, alt)
          a = a + b

        top = (a).most_common(10) ### top 10
        for doc_id in top:
            sub.write( str(i) + ',' + str(doc_id[0]) + '\n')