In [1]:
import re
import string
import spacy

import nltk
from nltk.corpus import stopwords

In [63]:
class TextPreprocessor:
    def __init__(self, positive='радость', negative = 'грусть'):
        self.positive = f' {positive.strip()} '
        self.negative = f' {negative.strip()} '
        
        
    def load(self):
        russian_stopwords = stopwords.words("russian")
        russian_stopwords.remove('не')
        russian_stopwords.remove('нет')
        
        self.russian_stopwords = set(russian_stopwords)
        self.nlp = spacy.load('ru_core_news_sm')

    def split_hash_tag(self, text):
        match = re.search(r"#([А-Яа-я]+)", text)
        if match and match.group(1):
            replacements = ' '.join(re.findall('[А-Я][^А-Я]*', match.group(1)))
            return text.replace(match.group(0), replacements)

        return text

    def remove_parenthesis_pairs(self, text):
        text = re.sub(r'((\(|\[|\{)(.*)(\)|\]|\}))', '\g<2>', text)
        return text

    def replace_smiles(self, text):    
        text = re.sub(r'((:|;|=|8)?(-|%|5|c|с)?(\)|\]|\}|3)+|😜|😄|😂|💋|♥)', self.positive, text)
        text = re.sub(r'((:|;|=|8)(-|%|5|c|с)?(d|p|\*)+)', self.positive, text)

        text = re.sub(r'((:|;|=|8)?\'?(-|%|5|c|с)?(\(|\[|\{)+)', self.negative, text)
        text = re.sub(r'((:|;|=|8)\'?(-|%|5|c|с)?(g|o)+)', self.negative, text)

        return text

    def replace_obvious_scores(self, text):
        text = re.sub(r'([7-9]|1[0-9]) из 10', self.positive, text)
        text = re.sub(r'[0-4] из 10', self.negative, text)
        return text

    def collapse_same_letters(self, text):
        text = re.sub(r'([а-яё])\1{2,}', '\g<1>', text)
        return text

    def remove_stop_words(self, text):
        words = text.split(' ')
        text = ' '.join([word for word in words if word not in self.russian_stopwords])
        return text
    
    def lemmatize(self, text):
        text = ' '.join([w.lemma_ for w in self.nlp(text)])
        return text

    def preprocess(self, text):
        text = self.split_hash_tag(text)
        text = text.lower()
        text = text.replace('\n', ' ')
        text = text.replace('ё', 'е')

        text = self.remove_parenthesis_pairs(text)
        text = self.replace_smiles(text)
        text = self.replace_obvious_scores(text)
        text = self.collapse_same_letters(text)

        text = self.remove_stop_words(text)

        text = re.sub(r"[^а-я ]", " ", text)
        text = re.sub(r"[а-я]{35,}", "", text)
        text = re.sub(r" {2,}", " ", text)
        text = text.strip()
        
        text = self.lemmatize(text)
        
        return text

In [64]:
text_preprocessor = TextPreprocessor()
text_preprocessor.load()

In [65]:
text_preprocessor.preprocess('Всё отлично.супер костюм)).советую')

'отлично супер костюм радость советую'

In [None]:
import json
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from bpemb import BPEmb
from pathlib import Path
import operator

In [102]:
class UserRequestClassifier:
    def __init__(self, neural_net_config_path, neural_net_weights_path,
                 input_map_path, input_sequence_config_path, output_map_path,
                 embeddings_path):
        self.neural_net_config_path = neural_net_config_path
        self.neural_net_weights_path = neural_net_weights_path
        self.input_map_path = input_map_path
        self.input_sequence_config_path = input_sequence_config_path
        self.output_map_path = output_map_path
        self.embeddings_path = embeddings_path
        
    def load(self):
        with open(self.neural_net_config_path, 'r', encoding='utf-8') as file:
            json_model = file.read()

        model = tf.keras.models.model_from_json(json_model)
        model.load_weights(self.neural_net_weights_path)

        with open(self.output_map_path, 'r', encoding='utf-8') as file:
            output_map_json = file.read()
            
        with open(self.input_sequence_config_path, 'r', encoding='utf-8') as file:
            self.max_sequence_length = int(file.read())

        reversed_output_map = json.loads(output_map_json)
        output_map = {v: k for k, v in reversed_output_map.items()}
        
        self.model = model
        self.output_map = output_map
        
        self.text_preprocessor = TextPreprocessor()
        self.text_preprocessor.load()
        
        embeddings_keys = []
        with open(embeddings_path, 'r', encoding='utf-8') as file:
            for line in file:
                values = line.split()
                word = values[0].lower()
                embeddings_keys.append(word)
        
        self.bpemb = BPEmb(lang='ru', cache_dir=Path('./'), dim=100, vs=100000)
        
        tokenizer = Tokenizer(len(embeddings_keys))
        tokenizer.fit_on_texts(embeddings_keys)
        
        del embeddings_keys[:]        
        del embeddings_keys
        
        self.tokenizer = tokenizer
    
    def get_vector(self, text):
        text = self.text_preprocessor.preprocess(text)
        seq = self.tokenizer.texts_to_sequences([self.bpemb.encode(text)])
        padded = pad_sequences([seq], self.max_sequence_length)[0]

        result = np.array(padded.T)
        return result

    def classify(self, input_text):
        vector = self.model.predict([self.get_vector(input_text)]).T
        vector = np.concatenate(vector, axis=0 )
        zipped = np.array(list(zip(self.output_map.keys(), vector.T)))
        answer = {x[0]:x[1] for x in zipped}
        answer = sorted(answer.items(), key=lambda x: float(operator.itemgetter(1)(x)), reverse=True)[:10]

        return answer

In [103]:
input_map_path = "model/input-map.json"
input_sequence_config_path = "model/input-sequence-length.txt"
output_map_path = "model/output-map.json"
neural_net_config_path = "model/model-config.json"
neural_net_weights_path = "model/model-weights.h5"
embeddings_path = './ru/ru.wiki.bpe.vs100000.d100.w2v.txt'

classifier = UserRequestClassifier(neural_net_config_path, neural_net_weights_path,
                                   input_map_path, input_sequence_config_path,
                                   output_map_path, embeddings_path)
classifier.load()

In [104]:
classifier.classify('Свобода ничего не стоит, если она не включает в себя свободу ошибаться.')

[(0.0, 0.15936242043972015),
 (-1.0, 0.14125685393810272),
 (1.0, 0.045397017151117325)]

In [108]:
try:
    1/0
except Exception as error:
    print(error)

division by zero
