In [1]:
import json
import math

In [2]:
porter_data = json.load(open('terms_porter.json'))
mystem_data = json.load(open('terms_mystem.json'))

porter_data_annotation = json.load(open('terms_porter_annotation.json'))
mystem_data_annotation = json.load(open('terms_mystem_annotation.json'))

porter_data_title = json.load(open('terms_porter_title.json'))
mystem_data_title = json.load(open('terms_mystem.json'))

articles_data = json.load(open('articles_porter_mystem.json'))['issue']['articles']

## Поиск

#### Первый параметр - список термом по портеру или mystem.
#### Второй - поисковый запрос из термов

In [3]:
def search(data, query):
    
    terms = query.split(' ')
    
    negative_terms = list(filter(lambda t : t.startswith('-'), terms))
    positive_terms = list(filter(lambda t : not t.startswith('-'), terms))
        
    positive_docs = set(data[positive_terms[0]]['docs'])
    positive_terms = sort_terms_by_amount(data, positive_terms)
    
    for term in positive_terms:
        positive_docs &= set(data[term]['docs'])
        
    for term in negative_terms:
        for doc in set(data[term[1:]]['docs']):
            positive_docs.discard(doc)
        
    return positive_docs

#### Сортировка термов согласно количеству документов, в которых они встречаются

In [4]:
def sort_terms_by_amount(data, terms):
    terms_amounts = []
    for term in terms:
        if (term in data):
            terms_amounts.append(data[term]['amount'])
        
    terms_with_amounts_zipped = list(zip(terms_amounts, terms))
    terms_with_amounts_zipped.sort()
    
    terms = [term for amount, term in terms_with_amounts_zipped]
    return terms

## Скоринг

### TF

In [5]:
def tf_annotation_and_title(term, doc, data_type):
    tf = 0
    
    for article in articles_data:
        if article['link'] == doc:
            amount = 0
            
            annotation = article['annotation'][data_type].split(' ')
            for word in annotation:
                if word == term:
                    amount += 1
            
            title = article['title'][data_type].split(' ')
            for word in title:
                if word == term:
                    amount += 1
                    
            tf = amount/(len(annotation) + len(title))
            
    return tf

In [6]:
def tf_annotation(term, doc, data_type):
    tf = 0
    
    for article in articles_data:
        if article['link'] == doc:
            amount = 0
            
            annotation = article['annotation'][data_type].split(' ') 
            for word in annotation:
                if word == term:
                    amount += 1
                    
            tf = amount / len(annotation)
            
    return tf

In [7]:
def tf_title(term, doc, data_type):
    tf = 0
    
    for article in articles_data:
        if article['link'] == doc:
            amount = 0
            
            title = article['title'][data_type].split(' ')
            for word in title:
                if word == term:
                    amount += 1
                    
            tf = amount / len(title)
            
    return tf

### IDF

In [8]:
def idf(data, term):
    if term in data:
        return math.log2(10 / data[term]['amount'])
    else:
        return 0

### Score

In [9]:
def score(data, data_type, query, articles):
    articles_score = []
    for article in articles:
        score = 0
        for term in list(filter(lambda t : not t.startswith('-'), query.split(' '))):
            tf = tf_annotation_and_title(term, article, data_type)
            score += tf * idf(data, term)
        articles_score.append({
                "article": article,
                "score": score
        })

    res = {
        "query": query,
        "articles_score": articles_score
    }
    return res

#### Портер по всему документу

In [10]:
query = 'в идеалами'
articles = search(porter_data, query)

if (len(articles) > 0):
    res = score(porter_data, 'porter', query, articles)
    print(json.dumps(res, ensure_ascii=False, indent=4, sort_keys=True))

{
    "articles_score": [
        {
            "article": "http://www.mathnet.ru/rus/uzku1348",
            "score": 0.04003324301529935
        }
    ],
    "query": "в идеалами"
}


#### mystem по всему документу

In [14]:
# query = 'для два'
query = 'на задач'
articles = search(mystem_data, query)

if (len(articles) > 0): 
    res = score(mystem_data, 'mystem', query, articles)
    print(json.dumps(res, ensure_ascii=False, indent=4, sort_keys=True))

{
    "articles_score": [
        {
            "article": "http://www.mathnet.ru/rus/uzku1352",
            "score": 0.003070769564546466
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1353",
            "score": 0.004750096670157814
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1349",
            "score": 0.00349432398724253
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1357",
            "score": 0.00298045281264804
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1350",
            "score": 0.0016703636642313195
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1355",
            "score": 0.00434294552700143
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1354",
            "score": 0.001535384782273233
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1356",
            "score": 0.00271434095437

### Score2

In [12]:
def score2(data_title, data_annotation, data_type, query, articles):
    articles_score = []
    for article in articles:
        score = 0
        
        for term in list(filter(lambda t : not t.startswith('-'), query.split(' '))):
            tf = tf_annotation(term, article, data_type)
            score += 0.4 * tf * idf(data_annotation, term)
            
            tf = tf_title(term, article, data_type)
            score += 0.6 * tf * idf(data_title, term)
            
        articles_score.append({
                "article": article,
                "score": score
        })

    res = {
        "query": query,
        "articles_score": articles_score
    }
    return res

In [13]:
query = 'на задач'
articles = search(mystem_data, query)

if (len(articles) > 0): 
    res = score2(mystem_data_title, mystem_data_annotation, 'mystem', query, articles)
    print(json.dumps(res, ensure_ascii=False, indent=4, sort_keys=True))

{
    "articles_score": [
        {
            "article": "http://www.mathnet.ru/rus/uzku1352",
            "score": 0.001321766029956957
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1353",
            "score": 0.009721695174346517
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1349",
            "score": 0.0016000325625794743
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1357",
            "score": 0.0014139822646051167
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1350",
            "score": 0.0007414785046100004
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1355",
            "score": 0.008648451868425262
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1354",
            "score": 0.0006681454656925278
        },
        {
            "article": "http://www.mathnet.ru/rus/uzku1356",
            "score": 0.00115811