In [1]:
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec
from gensim import matutils
import re
import os

import numpy as np
import json

import operator



In [2]:
# Word vectors stored in the binary word2vec format. unicode_errors='ignore'
# asks the loader to skip bytes it cannot decode instead of raising.
model_w = KeyedVectors.load_word2vec_format(r"news_0_300_2.bin", binary=True, unicode_errors='ignore')
# Previously saved Doc2Vec model; search_d2v uses it to infer a vector for the query.
# NOTE(review): gensim's load() is presumably pickle-based — only load model
# files that come from a trusted source.
model_d = Doc2Vec.load('Doc2Vec_100s_1000e')

In [3]:
# Every character listed here is replaced by a single space in ``preprocessing``.
_SEPARATOR_TABLE = str.maketrans(
    {ch: ' ' for ch in '.,:;-+)(\\/"!?\n\xa0\u200b\r'}
)


def preprocessing(par):
    """Normalise raw text into lower-case, space-separated words.

    Steps, in order:
      1. lower-case the text;
      2. drop the square-metre unit ``м²``;
      3. turn punctuation, line breaks, non-breaking and zero-width spaces
         into ordinary spaces;
      4. delete ASCII digits;
      5. collapse every run of spaces to one space and trim the ends.

    Args:
        par: the text to clean (``str``).

    Returns:
        The cleaned string. Splitting it on ``' '`` yields no empty tokens
        unless the result itself is empty.
    """
    # Lower-casing first lets the unit match whether it was typed 'м²' or 'М²'.
    par = par.lower()
    par = par.replace('м²', ' ')
    par = par.translate(_SEPARATOR_TABLE)
    par = re.sub('[0-9]', '', par)
    # Two fixed replace() passes left double spaces behind for runs of five or
    # more spaces; a regex collapses runs of any length.
    par = re.sub(' {2,}', ' ', par)
    return par.strip(' ')

def similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Both inputs are converted to NumPy arrays, scaled with
    ``matutils.unitvec`` and combined with a dot product.

    Args:
        v1: first vector (any sequence accepted by ``np.array``).
        v2: second vector, same length as ``v1``.

    Returns:
        The dot product of the two scaled vectors, or ``0`` when that
        product is NaN.
    """
    v1_norm = matutils.unitvec(np.array(v1))
    v2_norm = matutils.unitvec(np.array(v2))
    sim = np.dot(v1_norm, v2_norm)
    # np.dot never returns None, so the former ``is not None`` check could not
    # reach its ``return 0`` branch. The undefined result that can actually
    # occur is NaN (e.g. NaN components in an input), and a NaN score would
    # corrupt any sort over the scores, so that is what falls back to 0.
    if np.ndim(sim) == 0 and np.isnan(sim):
        return 0
    return sim

In [4]:
def beau_data(my_top):
    """Split raw ad texts into their labelled fields.

    Each ad is expected to contain the labels ``Объявление:``,
    ``Размещено:``, ``Цена:``, ``Адрес:`` and ``Текст объявления:`` in that
    order, each starting a new line.

    Args:
        my_top: iterable of ad texts (``str``).

    Returns:
        A list with one ``[name, when, price, address, text]`` list per ad.
        A field that cannot be located is reported as ``'отсутствует.'``.
    """
    missing = 'отсутствует.'
    # One pattern per output column, in output order.
    patterns = (
        'Объявление: (.*?)\nРазмещено:',
        'Размещено: (.*?)\nЦена:',
        'Цена: (.*?)\nАдрес:',
        'Адрес: (.*?)\nТекст объявления:',
        'Текст объявления:\n(.*?)$',
    )

    all_beau = []
    for t in my_top:
        row = []
        for pattern in patterns:
            # DOTALL lets '.' cross line breaks. Without it a multi-line ad
            # text never matched and was reported as missing.
            found = re.search(pattern, t, flags=re.DOTALL)
            # A missing label used to raise AttributeError on None.group();
            # use the same placeholder the text field already had.
            row.append(missing if found is None else found.group(1))
        all_beau.append(row)
    return all_beau

In [12]:
def _rank_documents(query_vec, shard_dir, shard_prefix, top_n=10):
    """Score every stored document vector against ``query_vec``.

    Reads each ``<shard_prefix>*.txt`` JSON file in ``shard_dir`` (a mapping
    of document id -> vector) and returns the ``top_n`` best
    ``(doc_id, similarity)`` pairs, best first.
    """
    scores = {}
    # List the directory instead of rebuilding numbered names: the former
    # ``range(1, len(files))`` could not cover every shard, whichever number
    # the shard names start from.
    for file_name in sorted(os.listdir(shard_dir)):
        if not (file_name.startswith(shard_prefix) and file_name.endswith('.txt')):
            continue
        with open(os.path.join(shard_dir, file_name)) as f:
            shard = json.load(f)
        for doc_id, vector in shard.items():
            scores[doc_id] = similarity(query_vec, np.array(vector))
    # Rank by score across all shards. The former ``max(doc2similar)`` picked
    # the lexicographically largest *key* of each shard, not the best match.
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:top_n]


def _load_ads(ranked):
    """Read the ad file for every ``(doc_id, similarity)`` pair, in order."""
    texts = []
    for doc_id, _score in ranked:
        # os.path.join keeps the path valid outside Windows as well.
        with open(os.path.join('All_files', doc_id + '.txt'), 'r', encoding='UTF-8') as f:
            texts.append(f.read())
    return texts


def search_w2v(query):
    """Find the ads closest to ``query`` using summed word2vec vectors.

    The query is cleaned with ``preprocessing``, the vectors of its known
    words are summed, and the sum is compared with the document vectors
    stored under ``json_w``.

    Args:
        query: free-text search query.

    Returns:
        Up to ten ``[name, when, price, address, text]`` lists, best match
        first (see ``beau_data``).
    """
    words = preprocessing(query).split(' ')
    # Size the accumulator from the model. With a hard-coded 100 any other
    # model dimension made ``+=`` raise, and the blanket ``except Exception``
    # hid that, leaving the query vector all zeros.
    query_vec = np.zeros(model_w.vector_size)
    for word in words:
        try:
            query_vec += model_w.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the query.
            continue

    ranked = _rank_documents(query_vec, 'json_w', 'all_json_w2v_')
    return beau_data(_load_ads(ranked))


def search_d2v(query):
    """Find the ads closest to ``query`` using a Doc2Vec-inferred vector.

    The query is cleaned with ``preprocessing``, turned into a vector with
    ``model_d.infer_vector`` and compared with the document vectors stored
    under ``json_d``.

    Args:
        query: free-text search query.

    Returns:
        Up to ten ``[name, when, price, address, text]`` lists, best match
        first — the same shape ``search_w2v`` returns. (Previously the
        formatted rows were built and then discarded in favour of the raw
        ``(doc_id, similarity)`` pairs.)
    """
    words = preprocessing(query).split(' ')
    query_vec = model_d.infer_vector(words)

    ranked = _rank_documents(query_vec, 'json_d', 'all_json_d2v_')
    return beau_data(_load_ads(ranked))

In [11]:
from flask import Flask
from flask import url_for, render_template, request, redirect

# Flask application object; the '/s' route is registered on it.
app = Flask(__name__)

In [12]:
@app.route('/s', methods=['POST', 'GET'])
def index():
    """Serve the search form and, on submission, the search results.

    GET renders ``search.html``. POST reads the ``query`` form field and runs
    the engine whose field name (``word2vec`` or ``doc2vec``) is present in
    the submitted form, then renders ``result.html`` with the results.
    """
    if request.method != 'POST':
        return render_template("search.html")

    query = request.form['query']
    if 'word2vec' in request.form:
        results = search_w2v(query)
    elif 'doc2vec' in request.form:
        results = search_d2v(query)
    else:
        # Neither engine field was submitted. ``results`` used to be left
        # unbound here, which raised UnboundLocalError (HTTP 500); show the
        # form again instead.
        return render_template("search.html")
    return render_template('result.html', results = results)

In [None]:
if __name__ == '__main__':
    # debug=True also switches on the auto-reloader, which re-executes the
    # current process; that does not work from a notebook kernel, so it is
    # turned off explicitly.
    # NOTE(review): debug mode exposes an interactive debugger in the browser —
    # keep it to local development only.
    app.run(debug=True, use_reloader=False)