In [1]:
import json
import math

In [2]:
def _load_json(path):
    """Parse the JSON file at ``path``, closing the file handle afterwards."""
    with open(path) as f:
        return json.load(f)


# Term indexes, one per normalisation method. The code below reads each entry's
# 'docs' and 'amount' fields.
porter_data = _load_json('terms/porter.json')
mystem_data = _load_json('terms/mystem.json')

# Term indexes for annotations (judging by the file names).
porter_data_annotation = _load_json('terms/porter_annotation.json')
mystem_data_annotation = _load_json('terms/mystem_annotation.json')

# Term indexes for titles (judging by the file names).
# NOTE(review): the mystem title index used to re-load 'terms/mystem.json';
# 'terms/mystem_title.json' follows the naming of the other files - confirm it exists.
porter_data_title = _load_json('terms/porter_title.json')
mystem_data_title = _load_json('terms/mystem_title.json')

# Article records; later cells read their 'link', 'annotation' and 'title' fields.
articles_data = _load_json('articles_porter_mystem.json')['issue']['articles']

## Поиск

#### Первый параметр - список термов по портеру или mystem.
#### Второй - поисковый запрос из термов

In [3]:
def search(data, query):
    """Boolean search over a term index.

    Parameters:
        data: mapping ``term -> {'docs': [...], 'amount': ...}``.
        query: whitespace-separated terms; a leading ``-`` marks a term whose
            documents must be excluded from the result.

    Returns:
        A set of documents that contain every positive term known to the index
        and none of the negative ones. The set is empty when the query has no
        positive term that the index knows.

    Terms missing from the index are ignored, so the outcome no longer depends
    on where such a term stands in the query and never raises ``KeyError``.
    """
    tokens = query.split()
    excluded = [token[1:] for token in tokens if token.startswith('-')]
    required = [token for token in tokens
                if not token.startswith('-') and token in data]

    if not required:
        return set()

    # Start from the rarest term so the working set stays as small as possible.
    required = sort_terms_by_amount(data, required)
    matched = set(data[required[0]]['docs'])
    for term in required[1:]:
        matched.intersection_update(data[term]['docs'])

    for term in excluded:
        if term in data:
            matched.difference_update(data[term]['docs'])

    return matched

#### Сортировка термов согласно количеству документов, в которых они встречаются

In [4]:
def sort_terms_by_amount(data, terms):
    """Return the terms known to the index, ordered by ascending 'amount'.

    Parameters:
        data: mapping ``term -> {'amount': ..., ...}``.
        terms: iterable of query terms.

    Returns:
        A new list holding only the terms present in ``data``, sorted by
        ``data[term]['amount']``; equal amounts are ordered by the term itself.

    Unknown terms are dropped before pairing, so an amount can never be
    attached to the wrong term.
    """
    known_terms = [term for term in terms if term in data]
    return sorted(known_terms, key=lambda term: (data[term]['amount'], term))

## Скоринг

### TF

In [5]:
def tf_annotation_and_title(term, doc, data_type):
    """Term frequency of ``term`` in an article's annotation and title combined.

    Parameters:
        term: the term to count.
        doc: the article's 'link' value.
        data_type: key selecting the text variant inside 'annotation' and 'title'.

    Returns:
        Occurrences of ``term`` in both fields divided by the total number of
        space-separated tokens in them, or 0 when no article has that link.
    """
    result = 0

    # Every article is visited; if several share the link, the last one decides.
    for entry in articles_data:
        if entry['link'] != doc:
            continue

        annotation_words = entry['annotation'][data_type].split(' ')
        title_words = entry['title'][data_type].split(' ')

        hits = annotation_words.count(term) + title_words.count(term)
        result = hits / (len(annotation_words) + len(title_words))

    return result

In [6]:
def tf_annotation(term, doc, data_type):
    """Term frequency of ``term`` in the annotation of the article linked as ``doc``.

    Parameters:
        term: the term to count.
        doc: the article's 'link' value.
        data_type: key selecting the text variant inside 'annotation'.

    Returns:
        ``tf_query`` applied to the first matching article's annotation, or 0
        when no article has that link.
    """
    for article in articles_data:
        if article['link'] == doc:
            return tf_query(term, article['annotation'][data_type])

    # Unknown document: return 0, as tf_annotation_and_title does, rather than
    # an implicit None that would break the arithmetic in the scoring code.
    return 0

In [7]:
def tf_title(term, doc, data_type):
    """Term frequency of ``term`` in the title of the article linked as ``doc``.

    Parameters:
        term: the term to count.
        doc: the article's 'link' value.
        data_type: key selecting the text variant inside 'title'.

    Returns:
        ``tf_query`` applied to the first matching article's title, or 0 when
        no article has that link.
    """
    for article in articles_data:
        if article['link'] == doc:
            return tf_query(term, article['title'][data_type])

    # Unknown document: return 0, as tf_annotation_and_title does, rather than
    # an implicit None that would break the arithmetic in the scoring code.
    return 0

In [8]:
def tf_query(term, query):
    """Share of the tokens in ``query`` that are equal to ``term``.

    ``query`` is split on single spaces, so consecutive spaces yield empty
    tokens that still count towards the total.
    """
    tokens = query.split(' ')
    return tokens.count(term) / len(tokens)

### IDF

In [9]:
def idf(data, term, total_docs=10):
    """Inverse document frequency of ``term``: ``log2(total_docs / amount)``.

    Parameters:
        data: mapping ``term -> {'amount': ..., ...}``.
        term: the term to look up.
        total_docs: corpus size used as the numerator. Defaults to 10, the
            value that used to be hard-coded here.

    Returns:
        The IDF value, or 0 when the term is missing from ``data`` or its
        recorded amount is not positive (which previously raised
        ``ZeroDivisionError`` for an amount of zero).
    """
    if term not in data:
        return 0

    amount = data[term]['amount']
    if amount <= 0:
        return 0

    return math.log2(total_docs / amount)

### Score

In [10]:
def score(data, data_type, query, articles):
    """Rank articles against a query by summed TF-IDF over annotation plus title.

    Parameters:
        data: term index handed to ``idf``.
        data_type: text-variant key handed to ``tf_annotation_and_title``.
        query: space-separated terms; those starting with ``-`` are not scored.
        articles: iterable of article links to rate.

    Returns:
        ``{"query": query, "articles_score": [{"article": ..., "score": ...}, ...]}``
        with one entry per article, in iteration order.
    """
    def relevance(link):
        # Plain running total, so the additions happen in query order.
        total = 0
        for term in [t for t in query.split(' ') if not t.startswith('-')]:
            total += tf_annotation_and_title(term, link, data_type) * idf(data, term)
        return total

    rated = [{"article": link, "score": relevance(link)} for link in articles]

    return {"query": query, "articles_score": rated}

In [11]:
# Demo queries; each one is a string of space-separated terms.
queries = [
    'на задач',
    'уравнен точек',
    'для два',
]

#### Портер по всему документу

In [12]:
# Rate the Porter-index hits for every demo query, then save the rankings.
res = []
for query in queries:
    articles = search(porter_data, query)
    res.append(score(porter_data, 'porter', query, articles))

with open('search/porter_score.json', 'w') as f:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    f.write(json.dumps(res, ensure_ascii=False, indent=4))

#### mystem по всему документу

In [13]:
# Rate the mystem-index hits for every demo query, then save the rankings.
res = []
for query in queries:
    articles = search(mystem_data, query)
    res.append(score(mystem_data, 'mystem', query, articles))

with open('search/mystem_score.json', 'w') as f:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    f.write(json.dumps(res, ensure_ascii=False, indent=4))

### Score2

In [14]:
def score2(data_title, data_annotation, data_type, query, articles,
           title_weight=0.6, annotation_weight=0.4):
    """Rank articles by a weighted sum of title and annotation TF-IDF.

    Parameters:
        data_title: term index used for the IDF of title matches.
        data_annotation: term index used for the IDF of annotation matches.
        data_type: text-variant key handed to ``tf_title`` / ``tf_annotation``.
        query: space-separated terms; those starting with ``-`` are not scored.
        articles: iterable of article links to rate.
        title_weight: multiplier for the title contribution (default 0.6).
        annotation_weight: multiplier for the annotation contribution (default 0.4).

    Returns:
        ``{"query": query, "articles_score": [{"article": ..., "score": ...}, ...]}``
        with one entry per article, in iteration order.
    """
    articles_score = []
    for article in articles:
        total = 0

        for term in [t for t in query.split(' ') if not t.startswith('-')]:
            # A field helper may return None for a link it cannot find; count
            # that as a zero frequency instead of failing on None * float.
            annotation_tf = tf_annotation(term, article, data_type) or 0
            total += annotation_weight * annotation_tf * idf(data_annotation, term)

            title_tf = tf_title(term, article, data_type) or 0
            total += title_weight * title_tf * idf(data_title, term)

        articles_score.append({
            "article": article,
            "score": total
        })

    return {
        "query": query,
        "articles_score": articles_score
    }

In [15]:
# Rate the mystem-index hits with the weighted title/annotation score and save them.
res = []
for query in queries:
    articles = search(mystem_data, query)
    res.append(score2(mystem_data_title, mystem_data_annotation, 'mystem', query, articles))

with open('search/mystem_score2.json', 'w') as f:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    f.write(json.dumps(res, ensure_ascii=False, indent=4))

## SVD

In [16]:
# Index and text variant used by the cells below (Porter stems).
data_type = 'porter'
terms = porter_data

матрица значений

In [17]:
# TF-IDF weight of every indexed term in every article, stored in two layouts:
#   tf_idf[link][term]    - per article
#   tf_idf_tr[term][link] - the same values, per term
# Each weight is computed once and written to both maps, and the IDF of a term
# is looked up once instead of once per article.
term_idf = {term: idf(terms, term) for term in terms}

tf_idf = {}
tf_idf_tr = {term: {} for term in terms}
for article in articles_data:
    link = article['link']
    tf_idf[link] = {}
    for term in terms:
        weight = tf_annotation_and_title(term, link, data_type) * term_idf[term]
        tf_idf[link][term] = weight
        tf_idf_tr[term][link] = weight

cos_sim function

In [18]:
def cos_sim(link, query):
    """Similarity between an article and a query as a TF-IDF dot product.

    Parameters:
        link: article link; must be a key of the precomputed ``tf_idf`` map.
        query: space-separated query terms.

    Returns:
        The sum over distinct query terms of (query weight * article weight).
        Despite the name, the sum is not divided by the vector lengths.

    Query terms that have no weight stored for the article contribute nothing;
    they used to raise ``KeyError``.
    """
    query_tf_idf = {}
    for term in query.split(' '):
        query_tf_idf[term] = tf_query(term, query) * idf(terms, term)

    doc_weights = tf_idf[link]

    res = 0
    for term, query_weight in query_tf_idf.items():
        res += query_weight * doc_weights.get(term, 0)

    return res

Примеры

In [19]:
# Example: similarity of one article to a two-term query.
cos_sim('http://www.mathnet.ru/rus/uzku1348', 'работ класс')

0.07248620304434843

In [20]:
# Example: similarity of another article to a longer, five-term query.
cos_sim('http://www.mathnet.ru/rus/uzku1349', 'и иском функц облада достаточн')

0.0773753904450919