In [1]:
import re
import os
import pickle
import copy
import string
from collections import Counter
import xmltodict
import rnnmorph

import pandas as pd
import numpy as np
import gensim

import nltk
from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack, vstack, csr_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import class_weight

from keras.models import Model, load_model
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.layers import LSTM, Bidirectional, Dropout, Dense, Input, Embedding, BatchNormalization, \
    TimeDistributed, GRU, Conv1D, MaxPooling1D, Flatten, Reshape, Conv2D
from keras.layers.merge import concatenate

from vocabulary import Vocabulary
from tokenizer import Tokenizer, Token
from vectorizer import GrammemeVectorizer

# Shared pymorphy2 analyzer used by stem_sentence and get_sentence_tags.
morph_ru = MorphAnalyzer()
# Tag converter from the 'opencorpora-int' tagset to the 'ud14' tagset.
to_ud = converters.converter('opencorpora-int', 'ud14')
# Make sure the NLTK stopword lists are available locally (may hit the network).
nltk.download('stopwords')
# Seed used for the train/validation permutation in get_train_val_test_sets.
RANDOM_SEED = 42

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/yallen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def tokenize(x):
    """Return the text of every WORD and PUNCTUATION token found in ``x``."""
    kept_types = (Token.TokenType.WORD, Token.TokenType.PUNCTUATION)
    return [token.text for token in Tokenizer.tokenize(x) if token.token_type in kept_types]


def tokenize_lower(x):
    """Same as :func:`tokenize`, with every token lowercased."""
    return [token.lower() for token in tokenize(x)]

In [3]:
def text_to_wordlist(sentence):
    """
    Split a sentence into lowercase words made of Russian or Latin letters.

    Every other character (digits, punctuation, whitespace, ...) acts as a separator.

    :param sentence: raw text.
    :return: list of lowercase words, in order of appearance.
    """
    # "ё" and "Ё" lie outside the а-я / А-Я code-point ranges, so both must be listed
    # explicitly; otherwise an uppercase "Ё" would be turned into a separator.
    regexp = "[^а-яА-ЯёЁa-zA-Z]"
    letters_only = re.sub(regexp, " ", sentence)
    return letters_only.lower().split()


def stem_sentence(sentence, language):
    """
    Tokenize a sentence and join the tokens back with single spaces.

    For ``language == 'ru'`` every token is replaced by the normal form of its
    first pymorphy2 parse; for any other language the tokens are left as they are.

    :param sentence: raw text.
    :param language: language code; only 'ru' triggers lemmatization.
    :return: space-joined tokens.
    """
    words = tokenize(sentence)
    if language == 'ru':
        words = [morph_ru.parse(word)[0].normal_form for word in words]
    return " ".join(words)


def get_sentence_tags(sentence):
    """
    Collect the part-of-speech tag of every token in a sentence.

    The tag comes from the first pymorphy2 parse of each token; tokens whose
    POS is ``None`` are skipped.

    :param sentence: raw text.
    :return: list of POS tags, in token order.
    """
    parts_of_speech = (morph_ru.parse(word)[0].tag.POS for word in tokenize(sentence))
    return [pos for pos in parts_of_speech if pos is not None]


def bow(train_texts, test_texts, language='ru', stem=False, tokenizer=tokenize_lower, preprocessor=None,
        use_tfidf=False, max_features=None, bow_ngrams=(1,1), analyzer='word'):
    """
    Build bag-of-words (or TF-IDF) matrices for the train and test texts.

    The vectorizer is fitted on the concatenation of train and test texts, so
    both matrices share one feature space.

    :param train_texts: list of training texts.
    :param test_texts: list of test texts.
    :param language: language code forwarded to stem_sentence.
    :param stem: whether to pass every text through stem_sentence first.
    :param tokenizer: tokenizer callable handed to the vectorizer.
    :param preprocessor: preprocessor callable handed to the vectorizer.
    :param use_tfidf: use TfidfVectorizer instead of CountVectorizer.
    :param max_features: vocabulary size limit handed to the vectorizer.
    :param bow_ngrams: n-gram range handed to the vectorizer.
    :param analyzer: analyzer mode handed to the vectorizer.
    :return: (train matrix, test matrix).
    """
    # Work on copies so the callers' lists are never modified.
    train_docs = copy.deepcopy(train_texts)
    test_docs = copy.deepcopy(test_texts)
    if stem:
        for docs in (train_docs, test_docs):
            for position, text in enumerate(docs):
                docs[position] = stem_sentence(text, language)

    vectorizer_cls = TfidfVectorizer if use_tfidf else CountVectorizer
    vectorizer = vectorizer_cls(analyzer=analyzer, ngram_range=bow_ngrams, tokenizer=tokenizer,
                                preprocessor=preprocessor, max_features=max_features)

    features = vectorizer.fit_transform(train_docs + test_docs)
    boundary = len(train_docs)
    return features[:boundary], features[boundary:]

def convert_from_opencorpora_tag(to_ud, tag: str, text: str):
    """
    Convert a tag from the OpenCorpora format to Universal Dependencies.

    :param to_ud: converter callable taking (tag string, token text).
    :param tag: tag in OpenCorpora format.
    :param text: the token itself.
    :return: (POS, grammemes) - the first two whitespace-separated fields of the converted tag.
    """
    fields = to_ud(str(tag), text).split()
    return fields[0], fields[1]

In [4]:
# Column '@name' -> column index. Filled as a side effect of the get_sample_answers_*
# helpers and read back by get_submission to locate the column to overwrite.
companies = {}
def get_sample_text(sample):
    """Return the '#text' of column 3 of a sample, asserting that the column is named 'text'."""
    text_column = sample['column'][3]
    assert text_column['@name'] == 'text'
    return text_column['#text']


def get_sample_answers_bank(sample):
    """
    Read the labels stored in columns 4..11 of a sample.

    Also records each column's index in the module-level ``companies`` dict.

    :param sample: parsed sample with a 'column' list.
    :return: dict column name -> int label, or None where the cell text is 'NULL'.
    """
    answers = {}
    for index in range(4, 12):
        column = sample['column'][index]
        name = column['@name']
        companies[name] = index
        raw_value = column['#text']
        answers[name] = None if raw_value == 'NULL' else int(raw_value)
    return answers

def get_sample_answers_tkk(sample):
    """
    Read the labels stored in columns 4..10 of a sample.

    Also records each column's index in the module-level ``companies`` dict.

    :param sample: parsed sample with a 'column' list.
    :return: dict column name -> int label, or None where the cell text is 'NULL'.
    """
    answers = {}
    for index in range(4, 11):
        column = sample['column'][index]
        name = column['@name']
        companies[name] = index
        raw_value = column['#text']
        answers[name] = None if raw_value == 'NULL' else int(raw_value)
    return answers

def get_sample_id(sample):
    """Return the integer id held in column 0 of a sample, asserting that the column is named 'id'."""
    id_column = sample['column'][0]
    assert id_column['@name'] == 'id'
    return int(id_column['#text'])


def get_data(filename):
    """
    Load an XML export into a flat DataFrame.

    Each sample yields one row per column whose label is not None, using the
    columns read by get_sample_answers_bank.

    :param filename: path to the XML file.
    :return: DataFrame with columns text, answer, company, sample_id.
    """
    with open(filename, "r", encoding='utf-8') as f:
        d = xmltodict.parse(f.read(), process_namespaces=True)

    column_order = ['text', 'answer', 'company', 'sample_id']
    collected = {name: [] for name in column_order}
    for sample in d['pma_xml_export']['database']['table']:
        sample_id = get_sample_id(sample)
        text = get_sample_text(sample)
        for company, answer in get_sample_answers_bank(sample).items():
            if answer is None:
                continue
            collected['text'].append(text)
            collected['answer'].append(answer)
            collected['company'].append(company)
            collected['sample_id'].append(sample_id)
    # Explicit `columns` keeps the column order independent of dict ordering.
    return pd.DataFrame(collected, columns=column_order)

In [5]:
# NOTE: absolute, machine-specific paths; adjust them before running elsewhere.
train_filename = "/media/yallen/My Passport/Datasets/SentiRuEval-2016/banks_train.xml"
test_filename = "/media/yallen/My Passport/Datasets/SentiRuEval-2016/banks_test.xml"

train = get_data(train_filename)
test = get_data(test_filename)


def url_replacement(x):
    """Replace every whitespace-delimited chunk starting at 'http' with the literal 'url'."""
    return re.sub(r'(?:http[^\s]+)($|\s)', r'url\1', x)


def user_replacement(x):
    """Replace every whitespace-delimited chunk starting at '@' with the literal 'user'."""
    return re.sub(r'(?:@[^\s]+)($|\s)', r'user\1', x)


# Mask URLs first, then user mentions, in both frames.
for frame in (train, test):
    frame['text'] = frame['text'].apply(url_replacement).apply(user_replacement)

In [6]:
# Quick look at the first rows of both frames after URL / user-mention masking.
print(train.head())
print(test.head())

                                                text  answer     company  \
0                 url Взять кредит тюмень альфа банк       0    alfabank   
1                Мнение о кредитной карте втб 24 url       0         vtb   
2  «Райффайзенбанк»: Снижение ключевой ставки ЦБ ...       0  raiffeisen   
3  Современное состояние кредитного поведения в р...       0    sberbank   
4         user user Главное чтоб банки СБЕР и ВТБ!!!       1    sberbank   

   sample_id  
0          1  
1          2  
2          3  
3          4  
4          5  
                                                text  answer company  \
0              #Автокредит в россельхозбанк 2012 url       0    rshb   
1           #Автокредит в россельхозбанк в череповце       0    rshb   
2  RT user url #Кредитный калькулятор россельхозб...       0    rshb   
3        RT user #Кредитные карты россельхозбанк url       0    rshb   
4      RT user #Кредиты в россельхозбанке ижевск url       0    rshb   

   sample_id  
0    

In [7]:
# Baseline features: lemmatized word uni/bi-gram counts plus character uni/bi-gram counts.
train_data, test_data = bow(train['text'].tolist(), test['text'].tolist(), stem=True, bow_ngrams=(1,2))
boc_train_data, boc_test_data = bow(train['text'].tolist(), test['text'].tolist(), analyzer='char', tokenizer=None, bow_ngrams=(1,2))
# Stack the word-level and character-level matrices column-wise.
train_data = hstack([train_data, boc_train_data])
test_data = hstack([test_data, boc_test_data])

train_answer = train['answer'].tolist()

# 5-fold grid search over the solver tolerance only, scored by accuracy.
clf = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'),
                   param_grid={"tol": [1e-4, 1e-3],},
                   scoring="accuracy", cv=5)
clf.fit(train_data, train_answer)
print(clf)
print(clf.best_score_)
print(clf.best_estimator_)

# NOTE: this overwrites the 'answer' column read from the test XML with the model's predictions.
test['answer'] = clf.predict(test_data)
print(test.head())

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tol': [0.0001, 0.001]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring='accuracy', verbose=0)
0.709306113585
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
                                                text  answer  company  \
0  RT vzglyad: По делу о работе МТС в Узбекистане...       1      mts   
1                RT user на мтс 960 url #на драйвер

In [17]:
def get_submission(filename, test_df, output_filename):
    """
    Write predictions into a copy of the test XML.

    The original XML is parsed again, and every non-NULL label cell is replaced,
    in document order, by the next value of ``test_df['answer']``. Rows of
    ``test_df`` must therefore be in the order produced by get_data for the
    same file; this is checked through the sample ids.

    :param filename: path to the XML file the test frame was built from.
    :param test_df: DataFrame with 'answer' and 'sample_id' columns.
    :param output_filename: path of the XML file to write.
    :raises AssertionError: if sample ids do not line up, or if not every
        prediction in ``test_df`` was written.
    """
    # Read and parse first, so the input file is closed before the output is opened.
    with open(filename, "r", encoding='utf-8') as f:
        d = xmltodict.parse(f.read(), process_namespaces=True)

    test_answers = test_df['answer'].tolist()
    test_sample_ids = test_df['sample_id'].tolist()

    export = d['pma_xml_export']
    # The schema element is dropped from the output; a file without it is fine as well.
    export.pop('http://www.phpmyadmin.net/some_doc_url/:structure_schemas', None)

    j = 0
    for sample in export['database']['table']:
        sample_id = get_sample_id(sample)
        # Also refreshes `companies`, which maps a column name to its index.
        answers = get_sample_answers_bank(sample)
        for company, answer in answers.items():
            if answer is not None:
                assert sample_id == test_sample_ids[j]
                sample['column'][companies[company]]['#text'] = str(test_answers[j])
                j += 1
    # Guard against silently writing only part of the predictions.
    assert j == len(test_answers), \
        "only {} of {} predictions were written".format(j, len(test_answers))

    with open(output_filename, "w", encoding='utf-8') as w:
        xmltodict.unparse(d, w, pretty=True)

In [11]:
# Write the current contents of test['answer'] into submission.xml.
get_submission(test_filename, test, "submission.xml")

In [15]:
# Score submission.xml with the bundled calc.js evaluator.
# NOTE(review): this runs the `ttk` mode against ../tkk_etalon.xml, while the cells above
# load the banks files - confirm that the output below belongs to the current data.
!cd twit-calc-win64/; nodejs calc.js ttk ../submission.xml ../tkk_etalon.xml

Counts    -  positive { tp: 67, tn: 2086, fp: 143, fn: 164 } negative { tp: 745, tn: 999, fp: 294, fn: 422 }
Precision -  { positive: 0.319047619047619, negative: 0.7170356111645814 }
Recall    -  { positive: 0.29004329004329005, negative: 0.6383890317052271 }
F         -  { positive: 0.3038548752834467, negative: 0.6754306436990027 }
F_R       -  0.4896427594912247


In [8]:
# Path handed to Vocabulary(...) by prepare_vocabulary.
VOCAB_PATH = "pickles/banks_vocab.pickle"
def prepare_vocabulary(vocab_path, train, test, shrink_border=None):
    """
    Return a Vocabulary covering the train and test texts.

    If the Vocabulary created from ``vocab_path`` holds at most one entry, it
    is filled with the lowercased tokens of both frames and saved.

    :param vocab_path: path passed to the Vocabulary constructor.
    :param train: DataFrame with a 'text' column.
    :param test: DataFrame with a 'text' column.
    :param shrink_border: if not None, passed to ``vocabulary.shrink``.
    :return: the vocabulary.
    """
    vocabulary = Vocabulary(vocab_path)
    if vocabulary.size() <= 1:
        stages = ((train, "Train vocabulary size: {}"),
                  (test, "Train+test vocabulary size: {}"))
        for frame, report in stages:
            for sentence in frame['text'].tolist():
                for word in tokenize_lower(sentence):
                    vocabulary.add_word(word)
            print(report.format(vocabulary.size()))
        vocabulary.save()

    print("Vocabulary size: {}".format(vocabulary.size()))
    if shrink_border is None:
        return vocabulary
    vocabulary.shrink(shrink_border)
    print("Vocabulary size after shrink: {}".format(vocabulary.size()))
    return vocabulary

# Vocabulary over train and test texts; filled from the data only when
# Vocabulary(VOCAB_PATH) holds at most one entry. No shrinking is applied.
vocabulary = prepare_vocabulary(VOCAB_PATH, train, test)

Vocabulary size: 21864


In [9]:
# Word vectors in word2vec text format.
# NOTE(review): the embedding matrix built from them assumes 500-dimensional vectors - confirm.
model = gensim.models.KeyedVectors.load_word2vec_format("pickles/banks_w2v.txt", binary=False)

In [10]:
# Embedding matrix with vocabulary.size() + 1 rows of 500 values each.
# Rows start as small uniform noise and are overwritten below for every
# vocabulary word found in the word2vec model.
# NOTE(review): no seed is set before this draw, so the rows of words missing
# from the model differ between runs.
weights = np.random.uniform(-0.01, 0.01, size=(vocabulary.size() + 1, 500))
unknown_words_count = 0
for i, word in enumerate(vocabulary.index_to_word):
# Disabled alternative (kept for reference): look words up by "<lemma>_<POS>" keys
# instead of their surface form.
#     parse = morph_ru.parse(word)[0]
#     pos = parse.tag.POS
#     if pos is None:
#         continue
#     if pos == 'ADJF' or pos == 'ADJS' or pos == 'COMP':
#         pos ='ADJ'
#     if pos == 'INFN':
#         pos = 'VERB'
#     if pos == 'ADVB':
#         pos = 'ADV'
#     lemma = parse.normal_form + '_' + pos
#     if lemma in model.wv:
#         weights[i] = model.wv[lemma]
#     else:
#         unknown_words_count += 1
    if word in model.wv:
        weights[i] = model.wv[word]
    else:
        unknown_words_count += 1
# Row 0 is forced to zeros; 0 is also the fill value of the matrices built by get_samples.
weights[0] = np.zeros((500,))
print("Unknown words: ", unknown_words_count)

Unknown words:  2406


In [11]:
from rnnmorph.predictor import RNNMorphPredictor
grammeme_vectorizer = GrammemeVectorizer("pickles/banks_vectorizer.json")
predictor = RNNMorphPredictor()

# Collect grammemes from the data only when the vectorizer is still empty:
# tag every sentence of train, then of test, and register each (pos, tag) pair.
if grammeme_vectorizer.size() < 1:
    for frame in (train, test):
        for i, sentence in enumerate(frame['text'].tolist()):
            if i % 1000 == 0:
                print(i)
            for form in predictor.predict_sentence_tags(tokenize(sentence)):
                grammeme_vectorizer.add_grammemes(form.pos, form.tag)
    grammeme_vectorizer.init_possible_vectors()
    grammeme_vectorizer.save()
print(grammeme_vectorizer.size())

226


In [12]:
# Characters with their own index in the char matrix (the index is the position in
# this string); get_samples maps every other character to len(CHAR_SET).
# Index 0 is the space character, which is also the fill value of the char matrix.
CHAR_SET = " абвгдеёжзийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-'\""

def get_samples(sentences, vocabulary, word_max_count, max_word_len):
    """
    Encode sentences as right-aligned word, character and grammeme matrices.

    Each sentence is tokenized once. Only its first ``word_max_count`` tokens
    are kept, and they are written to the last positions of the row, so the
    unused leading positions stay zero. Sentences without tokens produce
    all-zero rows.

    Relies on the module-level ``predictor``, ``grammeme_vectorizer`` and ``CHAR_SET``.

    :param sentences: list of raw texts.
    :param vocabulary: object providing ``get_word_index(word)``.
    :param word_max_count: number of token slots per sentence.
    :param max_word_len: number of character slots per token; longer tokens
        keep their first ``max_word_len`` characters.
    :return: (word_matrix, char_matrix, grammemes_matrix) with shapes
        (n, word_max_count), (n, word_max_count, max_word_len) and
        (n, word_max_count, grammeme_vectorizer.grammemes_count()).
    """
    n = len(sentences)
    unknown_char = len(CHAR_SET)
    # Built-in int/float replace the np.int/np.float aliases removed from NumPy.
    word_matrix = np.zeros((n, word_max_count), dtype=int)
    char_matrix = np.zeros((n, word_max_count, max_word_len), dtype=int)
    grammemes_matrix = np.zeros((n, word_max_count, grammeme_vectorizer.grammemes_count()), dtype=float)
    for i, sentence in enumerate(sentences):
        if i % 1000 == 0:
            print(i)
        all_tokens = tokenize(sentence)
        tokens = all_tokens[:word_max_count]
        count = len(tokens)
        if count == 0:
            # A "-0:" slice would address the whole row, so leave the zero row untouched.
            continue

        word_matrix[i, -count:] = [vocabulary.get_word_index(token.lower()) for token in tokens]

        first_slot = word_max_count - count
        for offset, token in enumerate(tokens):
            codes = [CHAR_SET.index(ch) if ch in CHAR_SET else unknown_char
                     for ch in token[:max_word_len]]
            if codes:
                char_matrix[i, first_slot + offset, -len(codes):] = codes

        # The tagger sees the whole sentence; its output is cut to the kept tokens afterwards.
        forms = predictor.predict_sentence_tags(all_tokens)
        tags = [form.pos + "#" + form.tag for form in forms][:word_max_count]
        grammemes_matrix[i, -count:] = [grammeme_vectorizer.get_vector(tag) for tag in tags]
    return word_matrix, char_matrix, grammemes_matrix

def get_train_val_test_sets(x, y, x_test, vocabulary, word_max_count=50, max_word_len=30, val_part=0.1):
    """
    Encode the data and split the labelled part into train and validation sets.

    The split is a permutation of the rows drawn after seeding NumPy with
    RANDOM_SEED; the last ``val_part`` share of the permutation becomes the
    validation set.

    :param x: labelled DataFrame with a 'text' column.
    :param y: labels, one per row of ``x``.
    :param x_test: unlabelled DataFrame with a 'text' column.
    :param vocabulary: vocabulary passed to get_samples.
    :param word_max_count: token slots per sentence, passed to get_samples.
    :param max_word_len: character slots per token, passed to get_samples.
    :param val_part: share of rows placed into the validation set.
    :return: (words, chars, grammemes, y) for train, the same for validation,
        and (words, chars, grammemes) for test.
    """
    labelled_matrices = get_samples(x["text"].tolist(), vocabulary, word_max_count, max_word_len)

    n = x.shape[0]
    np.random.seed(RANDOM_SEED)
    perm = np.random.permutation(n)
    split_at = int(n*(1-val_part))
    labels = np.array(y, dtype='int32')

    def select(indices):
        # Apply the same row selection to all three matrices and to the labels.
        return tuple(matrix[indices] for matrix in labelled_matrices) + (labels[indices],)

    train_set = select(perm[:split_at])
    val_set = select(perm[split_at:])

    test_set = tuple(get_samples(x_test["text"].tolist(), vocabulary, word_max_count, max_word_len))

    return train_set, val_set, test_set

# Encoded datasets are cached as pickles; only the presence of the train pickle is
# checked, so delete the pickles to force a rebuild after changing the encoding.
_cache_files = ("pickles/banks_data_train.pickle",
                "pickles/banks_data_val.pickle",
                "pickles/banks_data_test.pickle")
if os.path.exists(_cache_files[0]):
    _cached = []
    for _cache_file in _cache_files:
        with open(_cache_file, "rb") as _handle:
            _cached.append(pickle.load(_handle))
    data_train, data_val, data_test = _cached
else:
    # Labels are shifted by +1 before training.
    _shifted_answers = [a+1 for a in train['answer'].tolist()]
    data_train, data_val, data_test = get_train_val_test_sets(train, _shifted_answers, test, vocabulary)
    for _payload, _cache_file in zip((data_train, data_val, data_test), _cache_files):
        with open(_cache_file, "wb") as _handle:
            pickle.dump(_payload, _handle)

In [13]:
class SentiRNN:
    """
    Three-class sentence classifier over word, character and grammeme inputs.

    Word embeddings (frozen), a dense projection of grammeme vectors and a small
    per-word character CNN are concatenated and fed to two bidirectional LSTM
    layers followed by a dense softmax head.

    Relies on the module-level ``grammeme_vectorizer`` and ``CHAR_SET`` when
    building, and on ``test``, ``test_filename`` and ``get_submission`` when predicting.
    """

    def __init__(self, rnn_units=128, dense_units=64, dropout=0.5, batch_size=32, embeddings_dimensions=500, 
                 char_embeddings_dimension=5, max_word_len=30):
        """
        Store the hyper-parameters; the Keras model is created by build() or load().

        :param rnn_units: total units of each bidirectional LSTM layer (half per direction).
        :param dense_units: units of the dense layer before the softmax.
        :param dropout: dropout rate used throughout the network.
        :param batch_size: batch size for training and prediction.
        :param embeddings_dimensions: width of the word embeddings.
        :param char_embeddings_dimension: width of the character embeddings.
        :param max_word_len: number of character slots per word.
        """
        self.batch_size = batch_size
        self.dropout = dropout
        self.rnn_units = rnn_units
        self.dense_units = dense_units
        self.embeddings_dimensions = embeddings_dimensions
        self.char_embeddings_dimension = char_embeddings_dimension
        self.max_word_len = max_word_len

        self.model = None

    @staticmethod
    def _class_weight_dict(classes, weights):
        """Pair class labels with their weights as a {class index: weight} dict."""
        return {int(label): float(weight) for label, weight in zip(classes, weights)}

    def build(self, weights, vocabulary_size):
        """
        Create and compile the Keras model.

        :param weights: initial word embedding matrix with ``vocabulary_size + 1`` rows.
        :param vocabulary_size: number of words in the vocabulary.
        """
        word_index_input = Input(shape=(None,), dtype="int32", name="word_index_input")
        word_embeddings = Embedding(vocabulary_size+1, self.embeddings_dimensions, weights=[weights,],
                                    name="word_embeddings", trainable=False)(word_index_input)
        
        grammemes_input = Input(shape=(None, grammeme_vectorizer.grammemes_count()), name='grammemes')
        grammemes_layer = Dense(30, activation='relu')(grammemes_input)
        
        # One extra embedding row for characters outside CHAR_SET.
        chars_input = Input(shape=(None, self.max_word_len), name='chars_input')
        chars_layer = Embedding(len(CHAR_SET) + 1, self.char_embeddings_dimension, name='char_embeddings')(chars_input)
        
        # Character CNN, applied to every word independently.
        chars_layer = TimeDistributed(Conv1D(5, 4, activation='relu'))(chars_layer)
        chars_layer = TimeDistributed(Dropout(self.dropout))(chars_layer)
        chars_layer = TimeDistributed(MaxPooling1D())(chars_layer)
        
        chars_layer = TimeDistributed(Conv1D(3, 3, activation='relu'))(chars_layer)
        chars_layer = TimeDistributed(Dropout(self.dropout))(chars_layer)
        chars_layer = TimeDistributed(MaxPooling1D())(chars_layer)
        
        chars_layer = TimeDistributed(Flatten())(chars_layer)
        
        layer = concatenate([word_embeddings, grammemes_layer, 
                             chars_layer
                            ], name="LSTM_input")
        layer = Bidirectional(LSTM(self.rnn_units // 2, dropout=self.dropout, recurrent_dropout=self.dropout, return_sequences=True))(layer)
        layer = Bidirectional(LSTM(self.rnn_units // 2, dropout=self.dropout, recurrent_dropout=self.dropout))(layer)
        layer = Dense(self.dense_units, activation='relu')(layer)
        layer = Dropout(self.dropout)(layer)
        
        predictions = Dense(3, activation='softmax')(layer)
        model = Model(inputs=[word_index_input, 
                              chars_input, 
                              grammemes_input], outputs=predictions)
        
        model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

        # summary() prints by itself; wrapping it in print() only added a stray "None".
        model.summary()
        self.model = model

    def train(self, data_train, data_val, model_filename, enable_checkpoints=True):
        """
        Fit the model with early stopping and optional best-model checkpointing.

        :param data_train: (word matrix, char matrix, grammeme matrix, labels) for training.
        :param data_val: the same four arrays for validation.
        :param model_filename: path for the best-model checkpoint.
        :param enable_checkpoints: whether to save the best model (by val_loss).
        """
        word_matrix_train, char_matrix_train, grammemes_matrix_train, y_train = data_train
        word_matrix_val, char_matrix_val, grammemes_matrix_val, y_val = data_val
        
        print("Train example:")
        print(word_matrix_train[0])
        print(char_matrix_train[0])
        print(grammemes_matrix_train[0])
        print(y_train[0])
        
        # Callback to prevent overfitting.
        callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

        # Callback to save best only model.
        if enable_checkpoints:
            callbacks.append(ModelCheckpoint(model_filename, monitor='val_loss', save_best_only=True))

        # Keras documents `class_weight` as a dict mapping class indices to weights, so the
        # array returned by scikit-learn is converted before being handed to fit().
        # Keyword arguments keep the call valid across scikit-learn versions.
        classes = np.unique(y_train)
        cw = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
        self.model.fit([word_matrix_train, 
                        char_matrix_train, 
                        grammemes_matrix_train], y_train, 
                       validation_data=([word_matrix_val, 
                                         char_matrix_val, 
                                         grammemes_matrix_val], y_val),
                       epochs=50,
                       batch_size=self.batch_size,
                       shuffle=True, 
                       callbacks=callbacks,
                       class_weight=self._class_weight_dict(classes, cw),
                       verbose=1)

    def load(self, filename: str) -> None:
        """Load a previously saved Keras model from ``filename``."""
        self.model = load_model(filename)
        self.model.summary()

    def predict(self, data_test):
        """
        Predict labels for the test matrices and write the submission file.

        Side effects: overwrites the 'answer' column of the module-level ``test``
        frame and writes "submission.xml" through get_submission.

        :param data_test: (word matrix, char matrix, grammeme matrix).
        :return: list of predicted labels (arg-max class index minus one).
        """
        word_matrix, char_matrix, grammemes_matrix = data_test
        
        print("Test example: ")
        print(word_matrix[0])
        print(char_matrix[0])
        print(grammemes_matrix[0])
        preds = self.model.predict([word_matrix, 
                                    char_matrix, 
                                    grammemes_matrix], 
                                   batch_size=self.batch_size, verbose=1)
        # Undo the +1 shift applied to the labels before training.
        test_answers = [np.argmax(pred)-1 for pred in preds]
        test['answer'] = test_answers
        get_submission(test_filename, test, "submission.xml")
        return test_answers

In [14]:
# Model file: passed to SentiRNN.train as the checkpoint path and read back by SentiRNN.load.
MODEL_FILENAME = "pickles/banks_model.h5"

In [15]:
# Build the network on top of the pre-computed embedding matrix and train it;
# the best model (by val_loss) is checkpointed to MODEL_FILENAME.
rnn = SentiRNN()
rnn.build(weights, vocabulary.size())
rnn.train(data_train, data_val, MODEL_FILENAME)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
chars_input (InputLayer)         (None, None, 30)      0                                            
____________________________________________________________________________________________________
char_embeddings (Embedding)      (None, None, 30, 5)   625         chars_input[0][0]                
____________________________________________________________________________________________________
time_distributed_1 (TimeDistribu (None, None, 27, 5)   105         char_embeddings[0][0]            
____________________________________________________________________________________________________
time_distributed_2 (TimeDistribu (None, None, 27, 5)   0           time_distributed_1[0][0]         
___________________________________________________________________________________________

In [18]:
# Reload the checkpointed model and predict the test set.
# predict() also overwrites test['answer'] and writes submission.xml.
rnn = SentiRNN()
rnn.load(MODEL_FILENAME)
rnn.predict(data_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
chars_input (InputLayer)         (None, None, 30)      0                                            
____________________________________________________________________________________________________
char_embeddings (Embedding)      (None, None, 30, 5)   625         chars_input[0][0]                
____________________________________________________________________________________________________
time_distributed_1 (TimeDistribu (None, None, 27, 5)   105         char_embeddings[0][0]            
____________________________________________________________________________________________________
time_distributed_2 (TimeDistribu (None, None, 27, 5)   0           time_distributed_1[0][0]         
___________________________________________________________________________________________

In [20]:
# Score submission.xml against ../banks_etalon.xml with calc.js in `bank` mode.
!cd twit-calc-win64/; node calc.js bank ../submission.xml ../banks_etalon.xml

Counts    -  positive { tp: 191, tn: 2842, fp: 258, fn: 127 } negative { tp: 555, tn: 2264, fp: 370, fn: 229 }
Precision -  { positive: 0.42538975501113585, negative: 0.6 }
Recall    -  { positive: 0.60062893081761, negative: 0.7079081632653061 }
F         -  { positive: 0.4980443285528031, negative: 0.649502633118783 }
F_R       -  0.573773480835793


In [146]:
def run_bow_nb(train_sentences, train_answers, test_sentences):
    """
    Fit a multinomial Naive Bayes on word counts and return class probabilities.

    The smoothing parameter is chosen by 5-fold grid search on negative log loss.

    :param train_sentences: list of training texts.
    :param train_answers: training labels.
    :param test_sentences: list of test texts.
    :return: (train probabilities, test probabilities).
    """
    train_counts, test_counts = bow(train_sentences, test_sentences)
    alpha_grid = {"alpha": [0.1, 0.3, 0.6, 0.9, 1.0]}
    search = GridSearchCV(estimator=MultinomialNB(), param_grid=alpha_grid,
                          scoring="neg_log_loss", cv=5)
    search.fit(train_counts, train_answers)
    print("CV: {}".format(search.best_score_))
    train_proba = search.predict_proba(train_counts)
    test_proba = search.predict_proba(test_counts)
    return train_proba, test_proba

def run_boc_nb(train_sentences, train_answers, test_sentences):
    """
    Fit a multinomial Naive Bayes on character TF-IDF features and return class probabilities.

    The smoothing parameter is chosen by 5-fold grid search on negative log loss.

    :param train_sentences: list of training texts.
    :param train_answers: training labels.
    :param test_sentences: list of test texts.
    :return: (train probabilities, test probabilities).
    """
    train_tfidf, test_tfidf = bow(train_sentences, test_sentences, tokenizer=None, use_tfidf=True, analyzer='char')
    alpha_grid = {"alpha": [0.1, 0.3, 0.6, 0.9, 1.0]}
    search = GridSearchCV(estimator=MultinomialNB(), param_grid=alpha_grid,
                          scoring="neg_log_loss", cv=5)
    search.fit(train_tfidf, train_answers)
    print("CV: {}".format(search.best_score_))
    train_proba = search.predict_proba(train_tfidf)
    test_proba = search.predict_proba(test_tfidf)
    return train_proba, test_proba

def collect_additional_features(train, test):
    """
    Compute simple text statistics for both frames and scale them to [0, 1].

    Features, in column order: num_words, num_unique_words, num_chars,
    num_stopwords, num_punctuations, num_words_upper, num_words_title,
    mean_word_len. Words are the whitespace-separated chunks of the text.
    The scaler is fitted on the train features only.

    :param train: DataFrame with text, answer, company and sample_id columns.
    :param test: DataFrame with the same columns.
    :return: (train features, test features) as CSR matrices.
    """
    rus_stopwords = set(nltk.corpus.stopwords.words("russian"))

    def count_matching(predicate):
        # Build a function counting the items of a sequence that satisfy `predicate`.
        return lambda items: len([item for item in items if predicate(item)])

    def featurize(frame):
        # Work on a copy so the caller's frame keeps its columns.
        out = frame.copy()
        out["words"] = out["text"].apply(lambda text: text.split())
        out["num_words"] = out["words"].apply(len)
        out["num_unique_words"] = out["words"].apply(lambda words: len(set(words)))
        out["num_chars"] = out["text"].apply(len)
        out["num_stopwords"] = out["words"].apply(count_matching(lambda w: w in rus_stopwords))
        out["num_punctuations"] = out["text"].apply(count_matching(lambda c: c in string.punctuation))
        out["num_words_upper"] = out["words"].apply(count_matching(str.isupper))
        out["num_words_title"] = out["words"].apply(count_matching(str.istitle))
        out["mean_word_len"] = out["words"].apply(lambda words: np.mean([len(w) for w in words]))
        # A disabled experiment previously added MultinomialNB class probabilities from
        # run_bow_nb / run_boc_nb as extra columns at this point.
        return out.drop(["text", "answer", "company", "sample_id", "words"], axis=1)

    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(featurize(train))
    test_scaled = scaler.transform(featurize(test))
    return csr_matrix(train_scaled), csr_matrix(test_scaled)

In [106]:
# Word and character n-gram counts as in the first baseline, extended with the
# scaled text statistics from collect_additional_features.
train_additional_features, test_additional_features = collect_additional_features(train, test)
train_data, test_data = bow(train['text'].tolist(), test['text'].tolist(), stem=True, bow_ngrams=(1,2))
boc_train_data, boc_test_data = bow(train['text'].tolist(), test['text'].tolist(), analyzer='char', tokenizer=None, bow_ngrams=(1,2))
train_data = hstack([train_data, boc_train_data, train_additional_features])
test_data = hstack([test_data, boc_test_data, test_additional_features])
train_answer = train['answer'].tolist()

# LogisticRegression with default settings (no class weighting, no grid search).
clf = LogisticRegression()
clf.fit(train_data, train_answer)

# Overwrites test['answer'] with the predictions and rewrites submission.xml.
test['answer'] = clf.predict(test_data)
get_submission(test_filename, test, "submission.xml")
# Score submission.xml with calc.js in `bank` mode.
# NOTE(review): the reference here is ../etalon.xml, while the RNN run was scored against
# ../banks_etalon.xml - confirm both point at the same gold labels.
!cd twit-calc-win64/; node calc.js bank ../submission.xml ../etalon.xml

Counts    -  positive { tp: 124, tn: 2969, fp: 131, fn: 194 } negative { tp: 392, tn: 2307, fp: 327, fn: 392 }
Precision -  { positive: 0.48627450980392156, negative: 0.545201668984701 }
Recall    -  { positive: 0.389937106918239, negative: 0.5 }
F         -  { positive: 0.4328097731239093, negative: 0.5216234198270127 }
F_R       -  0.477216596475461
