In [58]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

import os
from pymystem3 import Mystem

In [3]:
nltk.download("stopwords")
nltk.download("punkt")
#define stemmer
stemmer = LancasterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nast1415/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nast1415/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def load_dataset(filename, delim):
    df = pd.read_csv(filename, delimiter=delim)
    # print(df.head())
    intent = df.Intent
    unique_intent = list(set(intent))
    sentences = list(df["Sentence"])
  
    return (intent, unique_intent, sentences)


def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ а-я А-Я a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        #stemming
        words.append([i.lower() for i in w])
    
    return words  

def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    token = Tokenizer(filters = filters)
    token.fit_on_texts(words)
    return token

def max_length(words):
    return(len(max(words, key = len)))

def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

def one_hot(encode):
    o = OneHotEncoder(sparse = False)
    return(o.fit_transform(encode))

In [13]:

intent, unique_intent, sentences = load_dataset("hackaton/dataset.csv", ',')



In [14]:
unique_intent

['rent_equipment',
 'get_service',
 'buy_sportswear',
 'order_food',
 'get_train',
 'buy_food',
 'buy_equipment',
 'buy_or_order_goods',
 'buy_sport_food']

In [15]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))

53706


In [16]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_l = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_l))


Vocab Size = 1131 and Maximum length = 8


In [17]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [18]:
encoded_doc

[[1, 9, 686],
 [1, 9, 687],
 [1, 9, 688],
 [1, 9, 689],
 [1, 9, 690, 691],
 [1, 9, 692, 693],
 [1, 9, 694],
 [1, 9, 695],
 [1, 9, 248],
 [1, 9, 696],
 [1, 9, 697],
 [1, 9, 698],
 [1, 9, 699, 700, 701],
 [1, 9, 702],
 [1, 9, 703],
 [1, 9, 704],
 [1, 9, 705],
 [1, 9, 706],
 [1, 9, 707],
 [1, 9, 708],
 [1, 9, 709],
 [1, 9, 710],
 [1, 9, 711],
 [1, 9, 712, 713],
 [1, 9, 714],
 [1, 9, 715],
 [1, 9, 716],
 [1, 9, 717],
 [1, 9, 718],
 [1, 9, 719],
 [1, 9, 720],
 [1, 9, 721],
 [1, 9, 722],
 [1, 9, 723, 724],
 [1, 9, 725],
 [1, 9, 726, 727],
 [1, 9, 728, 729],
 [1, 9, 730],
 [1, 9, 731],
 [1, 9, 732],
 [1, 9, 733],
 [1, 9, 734, 735],
 [1, 9, 736],
 [1, 9, 737],
 [1, 9, 738],
 [1, 9, 739],
 [1, 9, 740, 741],
 [1, 9, 742],
 [1, 9, 743],
 [1, 9, 744],
 [1, 9, 745],
 [1, 9, 746],
 [1, 9, 747],
 [1, 9, 748],
 [1, 9, 749],
 [1, 9, 750],
 [1, 9, 95],
 [1, 9, 48, 751],
 [1, 9, 249, 752],
 [1, 9, 250, 95],
 [1, 9, 96, 753],
 [1, 9, 251],
 [1, 9, 754, 97],
 [1, 9, 755],
 [1, 9, 756],
 [1, 9, 757],
 [1, 9

In [19]:
padded_doc = padding_doc(encoded_doc, max_l)

In [20]:
padded_doc

array([[  1,   9, 686, ...,   0,   0,   0],
       [  1,   9, 687, ...,   0,   0,   0],
       [  1,   9, 688, ...,   0,   0,   0],
       ...,
       [247,   0,   0, ...,   0,   0,   0],
       [684,   0,   0, ...,   0,   0,   0],
       [685,   0,   0, ...,   0,   0,   0]], dtype=int32)

In [21]:
print("Shape of padded docs = ",padded_doc.shape)
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

Shape of padded docs =  (53706, 8)


In [22]:
output_tokenizer.word_index

{'buy_equipment': 7,
 'buy_food': 6,
 'buy_or_order_goods': 8,
 'buy_sport_food': 9,
 'buy_sportswear': 3,
 'get_service': 2,
 'get_train': 5,
 'order_food': 4,
 'rent_equipment': 1}

In [23]:
encoded_output = encoding_doc(output_tokenizer, intent)
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [24]:
encoded_output.shape

(53706, 1)

In [25]:
output_one_hot = one_hot(encoded_output)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [26]:
output_one_hot.shape

(53706, 9)

In [27]:
from sklearn.model_selection import train_test_split

def create_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
    model.add(Bidirectional(LSTM(128)))
    # model.add(LSTM(128))
    model.add(Dense(32, activation = "relu"))
    model.add(Dropout(0.5))
    # TODO: вместо 2 количество классов представленных в выборке
    model.add(Dense(9, activation = "softmax"))
  
    return model

In [28]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (42964, 8) and train_Y = (42964, 9)
Shape of val_X = (10742, 8) and val_Y = (10742, 9)


In [29]:
from keras import metrics

In [30]:
model = create_model(vocab_size, max_l)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = [metrics.categorical_accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 8, 128)            144768    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 297       
Total params: 416,457
Trainable params: 271,689
Non-trainable params: 144,768
_________________________________________________________________


In [31]:
train_Y.shape

(42964, 9)

In [32]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 200,  batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 42964 samples, validate on 10742 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 1.25746, saving model to model.h5
Epoch 2/200

Epoch 00002: val_loss improved from 1.25746 to 1.15252, saving model to model.h5
Epoch 3/200

Epoch 00003: val_loss improved from 1.15252 to 1.06859, saving model to model.h5
Epoch 4/200

Epoch 00004: val_loss improved from 1.06859 to 0.95790, saving model to model.h5
Epoch 5/200

Epoch 00005: val_loss improved from 0.95790 to 0.85224, saving model to model.h5
Epoch 6/200

Epoch 00006: val_loss improved from 0.85224 to 0.73298, saving model to model.h5
Epoch 7/200

Epoch 00007: val_loss improved from 0.73298 to 0.62349, saving model to model.h5
Epoch 8/200

Epoch 00008: val_loss improved from 0.62349 to 0.52457, saving model to model.h5
Epoch 9/200

Epoch 00009: val_loss improved from 0.52457 to 0.47294, saving model to model.h5
Epoch 10/200

Epoch 00010: val_loss improved from 0.47294 to 0.45378, saving model to model.h5
Epoch 11/200



Epoch 00033: val_loss did not improve from 0.13881
Epoch 34/200

Epoch 00034: val_loss improved from 0.13881 to 0.13644, saving model to model.h5
Epoch 35/200

Epoch 00035: val_loss did not improve from 0.13644
Epoch 36/200

Epoch 00036: val_loss improved from 0.13644 to 0.13405, saving model to model.h5
Epoch 37/200

Epoch 00037: val_loss did not improve from 0.13405
Epoch 38/200

Epoch 00038: val_loss did not improve from 0.13405
Epoch 39/200

Epoch 00039: val_loss improved from 0.13405 to 0.13382, saving model to model.h5
Epoch 40/200

Epoch 00040: val_loss did not improve from 0.13382
Epoch 41/200

Epoch 00041: val_loss did not improve from 0.13382
Epoch 42/200

Epoch 00042: val_loss improved from 0.13382 to 0.13214, saving model to model.h5
Epoch 43/200

Epoch 00043: val_loss did not improve from 0.13214
Epoch 44/200

Epoch 00044: val_loss improved from 0.13214 to 0.12870, saving model to model.h5
Epoch 45/200

Epoch 00045: val_loss did not improve from 0.12870
Epoch 46/200

Epoc


Epoch 00105: val_loss did not improve from 0.12870
Epoch 106/200

Epoch 00106: val_loss did not improve from 0.12870
Epoch 107/200

Epoch 00107: val_loss did not improve from 0.12870
Epoch 108/200

Epoch 00108: val_loss did not improve from 0.12870
Epoch 109/200

Epoch 00109: val_loss did not improve from 0.12870
Epoch 110/200

Epoch 00110: val_loss did not improve from 0.12870
Epoch 111/200

Epoch 00111: val_loss did not improve from 0.12870
Epoch 112/200

Epoch 00112: val_loss did not improve from 0.12870
Epoch 113/200

Epoch 00113: val_loss did not improve from 0.12870
Epoch 114/200

Epoch 00114: val_loss did not improve from 0.12870
Epoch 115/200

Epoch 00115: val_loss did not improve from 0.12870
Epoch 116/200

Epoch 00116: val_loss did not improve from 0.12870
Epoch 117/200

Epoch 00117: val_loss did not improve from 0.12870
Epoch 118/200

Epoch 00118: val_loss did not improve from 0.12870
Epoch 119/200

Epoch 00119: val_loss did not improve from 0.12870
Epoch 120/200

Epoch 001


Epoch 00141: val_loss did not improve from 0.12870
Epoch 142/200

Epoch 00142: val_loss did not improve from 0.12870
Epoch 143/200

Epoch 00143: val_loss did not improve from 0.12870
Epoch 144/200

Epoch 00144: val_loss did not improve from 0.12870
Epoch 145/200

Epoch 00145: val_loss did not improve from 0.12870
Epoch 146/200

Epoch 00146: val_loss did not improve from 0.12870
Epoch 147/200

Epoch 00147: val_loss did not improve from 0.12870
Epoch 148/200

Epoch 00148: val_loss did not improve from 0.12870
Epoch 149/200

Epoch 00149: val_loss did not improve from 0.12870
Epoch 150/200

Epoch 00150: val_loss did not improve from 0.12870
Epoch 151/200

Epoch 00151: val_loss did not improve from 0.12870
Epoch 152/200

Epoch 00152: val_loss did not improve from 0.12870
Epoch 153/200

Epoch 00153: val_loss did not improve from 0.12870
Epoch 154/200

Epoch 00154: val_loss did not improve from 0.12870
Epoch 155/200

Epoch 00155: val_loss did not improve from 0.12870
Epoch 156/200

Epoch 001


Epoch 00177: val_loss did not improve from 0.12870
Epoch 178/200

Epoch 00178: val_loss did not improve from 0.12870
Epoch 179/200

Epoch 00179: val_loss did not improve from 0.12870
Epoch 180/200

Epoch 00180: val_loss did not improve from 0.12870
Epoch 181/200

Epoch 00181: val_loss did not improve from 0.12870
Epoch 182/200

Epoch 00182: val_loss did not improve from 0.12870
Epoch 183/200

Epoch 00183: val_loss did not improve from 0.12870
Epoch 184/200

Epoch 00184: val_loss did not improve from 0.12870
Epoch 185/200

Epoch 00185: val_loss did not improve from 0.12870
Epoch 186/200

Epoch 00186: val_loss did not improve from 0.12870
Epoch 187/200

Epoch 00187: val_loss did not improve from 0.12870
Epoch 188/200

Epoch 00188: val_loss did not improve from 0.12870
Epoch 189/200

Epoch 00189: val_loss did not improve from 0.12870
Epoch 190/200

Epoch 00190: val_loss did not improve from 0.12870
Epoch 191/200

Epoch 00191: val_loss did not improve from 0.12870
Epoch 192/200

Epoch 001

In [33]:
model = load_model("model.h5")

In [80]:
def predictions(text):
    clean = re.sub(r'[^ а-я А-Я 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    
    stopwords = nltk.corpus.stopwords.words("russian")
    newStopWords = ['скидки','скидкой','кешбек','кб','кэшбэк','кэшбек', 'кэшбек', 'кэш', 'кб', 
                    'кеш', 'кешбек', 'кэшбэк', 'cashback', 'ceshback', 'cashbak', 'cashbac,', 'ceshbak', 'ceshbac', 
                   '%', 'проц', 'п', 'пр', 'працентов', 'прац', 'працентав', 'percents', 'procents',
                   'тыс', 'тыр', 'к', 'т', 'тысяч', 'тыщ', 'кэсов', 'косарей', 'thousands', ' 000',
                   "процентов", "рассрочка", "рассрочку", "кредит", 'р', 'руб', 'рублей', 'rub', 'р.', 'roubles']
    stopwords.extend(newStopWords)
        
    m = Mystem()
    test_word = [m.lemmatize(w)[0] for w in test_word if not w in stopwords] 
#     print(test_word)
    test_ls = word_tokenizer.texts_to_sequences(test_word)

    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    
    test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
    x = padding_doc(test_ls, max_l)
  
    pred = model.predict_proba(x)
    return pred

# Возвращаем самый вероятный интент
def get_final_output(pred, classes, to_print = False):
    predictions = pred[0]
 
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)
 
    if to_print:
        for i in range(pred.shape[1]):
            print("%s has confidence = %s" % (classes[i], (predictions[i])))
    return(classes[0])

In [97]:
# Проверочка
texts = [
    'Купить яблочки зеленые в рассрочку',
    "Куртка зимняя лыжная",
    "кроссовки найк со скидкой и кэшбеком 5%",
    "морская капуста на дом",
    "доставка суши в Москве до 1000 рублей с кэшбеком"
]
df = pd.DataFrame(columns=["QUERY", "INTENT"])
for text in texts:
    pred = predictions(text)
    df = df.append([{"QUERY": text, "INTENT": get_final_output(pred, unique_intent)}])


In [99]:
df

Unnamed: 0,QUERY,INTENT
0,Купить яблочки зеленые в рассрочку,buy_or_order_goods
0,Куртка зимняя лыжная,buy_sportswear
0,кроссовки найк со скидкой и кэшбеком 5%,buy_sportswear
0,морская капуста на дом,buy_or_order_goods
0,доставка суши в Москве до 1000 рублей с кэшбеком,order_food


In [101]:
# Чтение файла с запросами, у которых интент не определён на предыдущем этапе при помощи поиска по словарям
lst = pd.read_csv('hackaton/bad_intents.csv')
lst = lst.QUERY
df = pd.DataFrame(columns=["QUERY", "INTENT"])
for text in lst:
    pred = predictions(text)
    df = df.append([{"QUERY": text, "INTENT": get_final_output(pred, unique_intent)}])
    


In [103]:
# Сохраняем распознанное
df.to_csv('hackaton/lstm_intents.csv', index=None)

In [102]:
df

Unnamed: 0,QUERY,INTENT
0,кроссофки для бега хочу купить с минимальным к...,buy_sport_food
0,очки для плавания купить с максимальным кэшбэком,get_service
0,купить мороженное по самой низкой цене,buy_or_order_goods
0,со скидкой макдональдс,buy_or_order_goods
0,где приобрести мороженное спецпредложение,buy_or_order_goods
0,хоккейное снаряжение заказать спецпредложение,buy_or_order_goods
0,где приобрести фрукты с доставкой,buy_equipment
0,ананасы заказать скидка,order_food
0,где можно поесть,order_food
0,мраморная говядина,buy_food
