### First attempt at intent classification

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import collections
import json
import re
import numpy as np
import modeling
import tokenization
import tensorflow as tf

import pandas as pd

import tokenization

import sys
# Imports from https://github.com/google-research/bert
from extract_features import InputExample, InputFeatures, input_fn_builder, model_fn_builder

from extract_features import convert_examples_to_features, _truncate_seq_pair, read_examples

tf.logging.set_verbosity(tf.logging.ERROR)



In [None]:
# Location of the pre-trained BERT checkpoint directory.
# NOTE(review): this is a hard-coded absolute path on one machine; adjust it
# before running elsewhere.
BERT_BASE_DIR = '/Users/kolsha/Documents/Projects/Python/BERT/multi_cased_L-12_H-768_A-12'

init_checkpoint = BERT_BASE_DIR + '/bert_model.ckpt'

# Layers whose outputs are requested from the model; with a single entry the
# prediction dict is read back under the key "layer_output_0".
# Presumably -1 selects the last encoder layer -- TODO confirm against
# extract_features.model_fn_builder.
layer_indexes = [-1]

use_one_hot_embeddings = False

# Sequence length passed to both feature conversion and the input function.
max_seq_length = 128

bert_config = modeling.BertConfig.from_json_file(BERT_BASE_DIR +'/bert_config.json')

# do_lower_case=False goes with the "multi_cased" checkpoint named in the path.
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_BASE_DIR+ '/vocab.txt', do_lower_case=False)

# Print the (position, layer index) pairs that will be extracted.
for (j, layer_index) in enumerate(layer_indexes):
    print(j, layer_index)

In [None]:
# Run configuration handed to the TPUEstimator built further down.
# The estimator itself is created with use_tpu=False, so `master` is left as
# None here.
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
  master=None,
  tpu_config=tf.contrib.tpu.TPUConfig(
      per_host_input_for_training=is_per_host)
)

In [None]:
def convert_lines_to_examples(lines):
    """Build one `InputExample` per non-empty entry of `lines`.

    Each entry is converted to unicode and stripped of surrounding
    whitespace. An entry shaped like ``"A ||| B"`` becomes a sentence pair
    (``text_a="A"``, ``text_b="B"``); any other entry becomes a single
    sentence with ``text_b=None``.

    Entries that are empty *before* stripping are skipped, so the returned
    list can be shorter than `lines`. `unique_id` numbers the produced
    examples consecutively starting at 0.
    """
    pair_pattern = r"^(.*) \|\|\| (.*)$"
    examples = []
    for raw in lines:
        text = tokenization.convert_to_unicode(raw)
        if not text:
            continue
        text = text.strip()
        pair = re.match(pair_pattern, text)
        if pair is not None:
            text_a, text_b = pair.group(1), pair.group(2)
        else:
            text_a, text_b = text, None
        # len(examples) is the number of examples emitted so far, i.e. the
        # next consecutive id.
        examples.append(
            InputExample(unique_id=len(examples), text_a=text_a, text_b=text_b))
    return examples

In [None]:
# Build the BERT feature-extraction model function from the checkpoint and
# wrap it in an estimator. use_tpu=False on both calls: nothing here targets
# a TPU even though the TPUEstimator class is used.
model_fn = model_fn_builder(
  bert_config=bert_config,
  init_checkpoint=init_checkpoint,
  layer_indexes=layer_indexes,
  use_tpu=False,
  use_one_hot_embeddings=use_one_hot_embeddings)

# NOTE: the name `estimator` is also used as a parameter of predict_input,
# where it refers to a scikit-learn classifier instead.
estimator = tf.contrib.tpu.TPUEstimator(
  use_tpu=False,
  model_fn=model_fn,
  config=run_config,
  predict_batch_size=32)
    
def get_embeddings(lines):
    """Run `lines` through the BERT estimator and return one vector per example.

    The lines are turned into examples and features, fed to the module-level
    `estimator`, and for every prediction the first row of its
    ``"layer_output_0"`` array is kept, with each value rounded to six
    decimals. (Presumably that first row is the leading [CLS] token --
    TODO confirm.)

    Returns:
        A numpy array holding one vector per prediction, in the order the
        estimator yields them.

    Raises:
        KeyError: if a prediction carries a `unique_id` that does not belong
            to any generated feature.
    """
    examples = convert_lines_to_examples(lines)
    features = convert_examples_to_features(
        examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)

    # Index features by id; the lookup in the loop below acts as a sanity
    # check that every prediction maps back to a known input.
    features_by_id = {feat.unique_id: feat for feat in features}

    input_fn = input_fn_builder(features=features, seq_length=max_seq_length)

    vectors = []
    for prediction in estimator.predict(input_fn, yield_single_examples=True):
        _feature = features_by_id[int(prediction["unique_id"])]
        first_row = prediction["layer_output_0"][0:1]
        vectors.append(
            np.array([round(float(value), 6) for value in first_row.flat]))

    return np.array(vectors)

import random

def predict_input(lines, estimator, lb_enc, threshold=0.65,
                  fallback_answers=None):
    """Classify each line, answering with a fallback phrase when unsure.

    Args:
        lines: texts to classify; they are embedded with `get_embeddings`.
        estimator: fitted classifier exposing `predict_proba`. NOTE: this
            parameter shadows the module-level BERT `estimator` inside this
            function; the BERT model is only reached through
            `get_embeddings`.
        lb_enc: label encoder whose `inverse_transform` maps a class index
            back to the original label.
        threshold: the winning class probability must be strictly greater
            than this value for its label to be returned (default 0.65, the
            previously hard-coded value).
        fallback_answers: phrases to choose from at random when the
            classifier is not confident enough; defaults to the built-in
            Russian "please rephrase" replies.

    Returns:
        A list with one entry per prediction: either the decoded label or a
        randomly chosen fallback phrase.
    """
    if fallback_answers is None:
        fallback_answers = [
            'Переформулируй пожалуйста',
            'Это точно по русски было?',
            'Я Вам не смогу помочь',
        ]

    embeddings = get_embeddings(lines)

    result = []
    for probs in estimator.predict_proba(embeddings):
        best = np.argmax(probs, axis=None)
        if probs[best] > threshold:
            result.append(lb_enc.inverse_transform([best])[0])
        else:
            result.append(random.choice(fallback_answers))

    return result

In [None]:
%%time
# Smoke test: embed two short strings and show the shape of the result.
get_embeddings(['Тест на ', 'Это был тест на ']).shape

# Tests

In [None]:
# Load the labelled intents; later cells read the 'text' and 'label' columns.
data = pd.read_csv('intents.csv')
data.head()

In [None]:
# Optional cleaning step, currently disabled: it would replace every character
# that is not a Latin or Cyrillic letter with a space. With it commented out,
# the embeddings below are computed from the raw text.
# data['text'] = data['text'].str.replace("[^a-zA-Zа-яА-Я]", " ")
data.head()

In [None]:
%%time
# Embed every training text with BERT.
# NOTE(review): convert_lines_to_examples skips empty strings, so an empty
# 'text' value would make `embeddings` shorter than `data` -- verify the
# printed shape against len(data).
embeddings = get_embeddings(data['text'].values)
print(embeddings.shape)

In [None]:
# Cache the embeddings on disk; they are read back later from
# 'embeddings_old.npy'.
np.save('embeddings_old', embeddings)

In [None]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Reload the cached embeddings instead of recomputing them with BERT.
embeddings = np.load('embeddings_old.npy')
# Fit an encoder that maps the string intent labels to integer class ids.
le = preprocessing.LabelEncoder()
le.fit(data['label'].values)
list(le.classes_)

In [None]:
# Integer-encoded targets, aligned row by row with data['label'].
Y = le.transform(data['label'].values)

In [None]:
# Inspect the encoded targets.
print(Y)


In [None]:
# Ten stratified random train/test splits, reused by every model below.
# These are independent shuffle-splits, not K folds, so test sets may overlap
# between iterations. test_size is left at the library default -- TODO confirm
# the value for the installed scikit-learn version.
cv = StratifiedShuffleSplit(n_splits=10, random_state=42) #, shuffle=True


In [None]:
# Random-forest baseline: 140 entropy-criterion trees, fixed seed.
# The commented-out arguments are hyper-parameters that were tried and left
# at their defaults.
forest = RandomForestClassifier(
    criterion='entropy', n_estimators=140,
#     max_features=None,
#     max_depth=17,
#     min_samples_leaf=2,
    random_state=42)

In [None]:
# Score the random forest on every split produced by `cv`, printing macro and
# micro F1 per split, then the best split and the mean / std of macro F1.
f1_mac_best, f1_mic_best = 0, 0
f1_mac_all = []
for train, test in cv.split(embeddings, Y):
    forest.fit(embeddings[train], Y[train])
    y_pred = forest.predict(embeddings[test])

    f1_mac = f1_score(Y[test], y_pred, average='macro')
    f1_mic = f1_score(Y[test], y_pred, average='micro')
    f1_mac_all.append(f1_mac)

    print("F1 Macro: {}".format(f1_mac))
    print("F1 Micro: {}".format(f1_mic))

    # Remember the scores of the split with the highest macro F1.
    if f1_mac_best < f1_mac:
        f1_mac_best, f1_mic_best = f1_mac, f1_mic

print("BEST F1 Macro: {}".format(f1_mac_best))
print("BEST F1 Micro: {}".format(f1_mic_best))

f1_mac_all = np.array(f1_mac_all)
f1_mac_avg = f1_mac_all.mean()
print("AVG  F1 Macro: {}".format(f1_mac_avg))
print(f1_mac_all.std())

BEST F1 Macro: 0.7565208890443331 <br/>
BEST F1 Micro: 0.7840909090909092

# xgboost

In [None]:
import xgboost as xgb

In [None]:
# Cross-validate a natively trained XGBoost multi-class model over the `cv`
# splits, printing macro and micro F1 per split, then the best split and the
# mean / std of macro F1.

# Booster parameters are the same for every split, so build them once.
# NOTE(review): 'n_estimators' is the scikit-learn wrapper's name; with
# xgb.train the number of boosting rounds comes from `num_round` below, so
# this entry is presumably ignored -- confirm for the installed version.
param = {
    'n_estimators': 300,
    'max_depth': 6,
    'objective': 'multi:softmax',
    'learning_rate': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'silent': 1,
    'nthread': 4,
    'num_class': len(le.classes_),
}
num_round = 300

f1_mac_best = 0
f1_mic_best = 0
f1_mac_all = []
for (train, test) in cv.split(embeddings, Y):
    xg_train = xgb.DMatrix(embeddings[train], label=Y[train])
    # Fix: the held-out rows must carry their own targets. The original passed
    # Y[train] here, which belongs to different rows and has a different
    # length than embeddings[test].
    xg_test = xgb.DMatrix(embeddings[test], label=Y[test])

    # Only the training matrix is watched, so early stopping is driven by the
    # training metric, not by held-out data.
    watchlist = [(xg_train, 'train')]
    bst = xgb.train(dict(param), xg_train, num_round, watchlist,
                    early_stopping_rounds=30)

    # With 'multi:softmax' the prediction is the class id for each row.
    y_pred = bst.predict(xg_test)
    f1_mac = f1_score(Y[test], y_pred, average='macro')
    f1_mic = f1_score(Y[test], y_pred, average='micro')

    f1_mac_all.append(f1_mac)
    print("F1 Macro: {}".format(f1_mac))
    print("F1 Micro: {}".format(f1_mic))

    if f1_mac > f1_mac_best:
        f1_mac_best = f1_mac
        f1_mic_best = f1_mic

print("BEST F1 Macro: {}".format(f1_mac_best))
print("BEST F1 Micro: {}".format(f1_mic_best))

f1_mac_all = np.array(f1_mac_all)
f1_mac_avg = f1_mac_all.mean()
print("AVG  F1 Macro: {}".format(f1_mac_avg))
print(f1_mac_all.std())

BEST F1 Macro: 0.819804920434346 <br/>
BEST F1 Micro: 0.8522727272727273


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:

# Hyper-parameter grid for the scikit-learn XGBoost wrapper.
# The grid holds 2*2*2*2*2*3*2 = 192 combinations; each one is fitted once per
# split of `cv`, plus one final refit (refit=True).
params_grid_xgb = {
          'nthread':[4],
          'objective': ['multi:softmax'],
          'reg_alpha': [0, 0.5],
          'reg_lambda': [0, 0.5],
          'gamma': [0, 0.5],
          'subsample': [0.7, 1],
          'colsample_bytree':[0.7, 1],
          'max_depth': [1, 9, 20],
          'learning_rate': [0.05],
          'n_estimators': [100, 500]
}

xgb_model = xgb.XGBClassifier()


# No `scoring` is given, so candidates are ranked by the estimator's own
# default score rather than the F1 used in the manual loops.
# NOTE(review): n_jobs=-1 combined with nthread=4 per model may oversubscribe
# the CPU -- confirm on the target machine.
clf = GridSearchCV(xgb_model, params_grid_xgb, n_jobs=-1, 
                   cv=cv, 
#                    scoring='roc_auc',
                   verbose=True, refit=True)
print(params_grid_xgb)

In [None]:
# Run the grid search and report the best parameter set and its
# cross-validated score.
clf.fit(embeddings, Y)
print("Fit end")
print(clf.best_params_, clf.best_score_)

# LogisticRegression

In [None]:
# Multinomial logistic regression on the BERT embeddings.
# max_iter is raised to 3000; the commented-out C value is a regularisation
# setting that was tried and left at its default.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=42, solver='lbfgs',#'lbfgs', 'newton-cg'
                            multi_class='multinomial',
                            max_iter=3000,
#                             C=0.75
                           )
# `copy` is used by the evaluation loop to snapshot the best fitted model.
import copy

In [None]:
# Cross-validate the logistic regression over the `cv` splits and keep a
# snapshot of the model fitted on the split with the highest macro F1.
# NOTE(review): `best_logreg` is chosen by its score on a test split and was
# fitted on that split's training part only; the "BEST" numbers are therefore
# optimistic.
f1_mac_best = 0
f1_mic_best = 0
best_logreg = None
f1_mac_all = []
for (train, test) in cv.split(embeddings, Y):
    logreg.fit(embeddings[train], Y[train])
    y_pred = logreg.predict(embeddings[test])

    f1_mac = f1_score(Y[test], y_pred, average='macro')
    f1_mic = f1_score(Y[test], y_pred, average='micro')

    f1_mac_all.append(f1_mac)
    print("F1 Macro: {}".format(f1_mac))
    print("F1 Micro: {}".format(f1_mic))

    # `best_logreg is None` guarantees a model is kept even if every split
    # scores 0.
    if best_logreg is None or f1_mac > f1_mac_best:
        # `logreg` is re-fitted in place on the next split. A deep copy keeps
        # the snapshot independent of that; a shallow copy would share the
        # fitted attribute objects with the live estimator.
        best_logreg = copy.deepcopy(logreg)
        f1_mac_best = f1_mac
        f1_mic_best = f1_mic

print("BEST F1 Macro: {}".format(f1_mac_best))
print("BEST F1 Micro: {}".format(f1_mic_best))

f1_mac_all = np.array(f1_mac_all)
f1_mac_avg = f1_mac_all.mean()
print("AVG  F1 Macro: {}".format(f1_mac_avg))
print(f1_mac_all.std())

In [None]:
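# Results recorded from a previous run of the LogisticRegression loop:
# BEST F1 Macro: 0.9699559699559699
# BEST F1 Micro: 0.9772727272727273
# AVG  F1 Macro: 0.9108290028012584
# 0.03344453642061686  (standard deviation of F1 Macro)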
In [None]:
# Hand-written user messages used to spot-check the best logistic regression.
# predict_input returns either a decoded label or a random fallback phrase
# for each of them.
samples = [
    'Здравствуйте, меня зовут Павел, я не понимаю, как можно заблокировать симкарту?',
    'СОС, ХЕЛП, нужно срочно блокнуть симку, СРОЧНО',
    'Алоха, как восстановить симкарту?', # bad sample
    
    'Это снова Павел, где мои деньги, негодяи?',
    'Ребят, случайно отправил деньги не туда',
    
    'А ты точно человек, а скажи чтонибудь по человечьи?'
]
print(predict_input(samples, best_logreg, le))

In [None]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package (which scikit-learn itself
# depends on) and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Persist the selected logistic regression to disk with maximum compression.
joblib.dump(best_logreg, "best_logreg_3_2_19", compress=9)

In [None]:
# Raw class probabilities for the sample messages, for manual inspection.
emb = get_embeddings(samples)
pred = best_logreg.predict_proba(emb)

In [None]:
# # np.set_printoptions(precision=1)
# for p in pred:
#     a_max = np.argmax(p, axis=None)
    
# #     ind = np.unravel_index(np.argmax(a, axis=None), a.shape)

In [None]:
# VK community-bot settings used by the long-poll loop.
# The access token is intentionally blank here and must be filled in before
# the bot cell is run.
VK_API_ACCESS_TOKEN = ''  
GROUP_ID = 177447412          
VK_API_VERSION = '5.74'

In [None]:
import vk
from requests import *
import os

import re

session = vk.Session(access_token = VK_API_ACCESS_TOKEN)
api = vk.API(session, v = VK_API_VERSION)


def _fetch_long_poll_server():
    """Ask VK for a fresh long-poll endpoint and return (server, key, ts)."""
    info = api.groups.getLongPollServer(group_id = GROUP_ID)
    return info['server'], info['key'], info['ts']


# Bot main loop: poll the community long-poll server forever and answer every
# new message with the intent predicted by `best_logreg`.
server, key, ts = _fetch_long_poll_server()
while True:

    longPoll = post(server, data = {'act': 'a_check',
                                    'key': key,
                                    'ts': ts,
                                    'wait': 1,
                                    'version': 2}).json()

    # Error responses carry a 'failed' code instead of 'updates' (and, for
    # codes other than 1, no 'ts'); reading those keys blindly would raise
    # KeyError and stop the bot.
    failed = longPoll.get('failed')
    if failed == 1:
        # Event history is outdated: resume from the ts the server sent back.
        ts = longPoll['ts']
        continue
    if failed:
        # Key expired or state lost: request a new server / key / ts.
        server, key, ts = _fetch_long_poll_server()
        continue

    ts = longPoll['ts']
    # Handle every update of the batch. `ts` has already moved past it, so
    # stopping after the first message (as the original `break` did) would
    # silently drop the remaining ones.
    for update in longPoll.get('updates') or []:
        if update['type'] != 'message_new':
            continue
        print('message_new')
        peer_id = update['object']['user_id']
        # Show "typing" and mark the message from this user as read.
        api.messages.setActivity(peer_id = peer_id, type='typing', group_id = GROUP_ID)
        api.messages.markAsRead(peer_id = peer_id)
        # The text is passed to the classifier unchanged. The original called
        # str.replace with a regex-looking pattern, which str.replace treats
        # as a literal string, so it never altered the text. It is dropped
        # rather than turned into re.sub because the training texts in this
        # notebook are not cleaned either (that step is commented out).
        text = update['object']['body']
        res = predict_input([text], best_logreg, le)
        api.messages.send(peer_id = peer_id,
                          message = '{}'.format(res[0]))


    


# SVC

In [None]:
from sklearn.svm import SVC
import copy

In [None]:
# Linear-kernel SVM. probability=True enables predict_proba, which
# predict_input relies on.
# NOTE(review): gamma presumably has no effect with kernel='linear' --
# confirm against the scikit-learn SVC documentation.
svc_clf = SVC(probability=True,
              random_state=42,
              gamma='auto',
              kernel='linear'
              
             )

In [None]:
# Cross-validate the SVM over the `cv` splits and keep a snapshot of the
# model fitted on the split with the highest macro F1.
f1_mac_best = 0
f1_mic_best = 0
best_svc_clf = None
f1_mac_all = []
for (train, test) in cv.split(embeddings, Y):
    svc_clf.fit(embeddings[train], Y[train])
    y_pred = svc_clf.predict(embeddings[test])

    f1_mac = f1_score(Y[test], y_pred, average='macro')
    f1_mic = f1_score(Y[test], y_pred, average='micro')

    f1_mac_all.append(f1_mac)
    print("F1 Macro: {}".format(f1_mac))
    print("F1 Micro: {}".format(f1_mic))

    # `best_svc_clf is None` guarantees a model is kept even if every split
    # scores 0.
    if best_svc_clf is None or f1_mac > f1_mac_best:
        # `svc_clf` is re-fitted in place on the next split. A deep copy keeps
        # the snapshot independent of that; a shallow copy would share the
        # fitted attribute objects with the live estimator.
        best_svc_clf = copy.deepcopy(svc_clf)
        f1_mac_best = f1_mac
        f1_mic_best = f1_mic

print("BEST F1 Macro: {}".format(f1_mac_best))
print("BEST F1 Micro: {}".format(f1_mic_best))

f1_mac_all = np.array(f1_mac_all)
f1_mac_avg = f1_mac_all.mean()
print("AVG  F1 Macro: {}".format(f1_mac_avg))
print(f1_mac_all.std())