<a href="https://colab.research.google.com/github/IvanSedykh/nikita_task/blob/master/nikita_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install gensim --upgrade

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/1a/b3/8358842ee8e430f7eb8f996bdd06c146a71712b9848ed32f949ad44b5adf/gensim-3.8.2-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 129kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.2


In [0]:
!wget http://vectors.nlpl.eu/repository/20/187.zip
!unzip 187.zip

--2020-04-12 12:27:59--  http://vectors.nlpl.eu/repository/20/187.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2692389554 (2.5G) [application/zip]
Saving to: ‘187.zip’


2020-04-12 12:30:38 (16.3 MB/s) - ‘187.zip’ saved [2692389554/2692389554]

Archive:  187.zip
  inflating: meta.json               
  inflating: model.model             
  inflating: model.model.vectors_ngrams.npy  
  inflating: model.model.vectors.npy  
  inflating: model.model.vectors_vocab.npy  
  inflating: README                  


In [0]:
import gensim
# Path of the model file extracted from 187.zip by the download cell.
model_file = 'model.model'
# NOTE(review): the archive also ships `vectors_ngrams` / `vectors_vocab` arrays, which looks
# like a fastText-style model rather than plain word2vec — confirm. The name `word2vec` is
# what later cells reference.
# NOTE(review): gensim's `load` presumably unpickles the file, and the archive was fetched
# over plain HTTP — only run this on a download you trust.
word2vec = gensim.models.KeyedVectors.load(model_file)


In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the labelled dataset. Later cells read the 'id' and 'docs' columns and treat the
# remaining columns (including 'Другое') as labels.
data = pd.read_excel('data_NN_2019.xlsx')

In [0]:
def _binarize_other(cell):
    """Keep a cell that equals 0 unchanged; map every other value to 1."""
    if cell == 0:
        return cell
    return 1


# Every missing cell in the frame becomes 0 (the frame is modified in place).
data.fillna(value=0, inplace=True)
# Collapse the 'Другое' column to a 0/1 flag.
data['Другое'] = data['Другое'].apply(_binarize_other)

In [0]:
import re
import string
# The punctuation set is escaped before being placed in the character class; unescaped,
# the sequence `\]` inside string.punctuation is read as an escaped `]`, which leaves the
# backslash itself out of the class.
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']')


def remove_punct(text):
    """Return `text` lowercased with every ASCII punctuation character removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without any character from ``string.punctuation``
        (backslash included), converted to lower case.
    """
    return _PUNCT_RE.sub('', text).lower()
# Clean each document once, store it, then split the cleaned text on whitespace.
cleaned_docs = data['docs'].apply(remove_punct)
data['docs_Clean'] = cleaned_docs
data['tokens'] = cleaned_docs.apply(str.split)

In [0]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
data_train, data_test = train_test_split(data, **split_options)

In [0]:
# Single pass over the training documents: collect every token and each document's length.
all_training_words = []
training_sentence_lengths = []
for tokens in data_train["tokens"]:
    all_training_words.extend(tokens)
    training_sentence_lengths.append(len(tokens))

# Distinct training tokens in sorted order.
TRAINING_VOCAB = sorted(set(all_training_words))

word_total, vocab_size = len(all_training_words), len(TRAINING_VOCAB)
print("%s words total, with a vocabulary size of %s" % (word_total, vocab_size))
print("Max sentence length is %s" % max(training_sentence_lengths))

51145 words total, with a vocabulary size of 10742
Max sentence length is 96


In [0]:
# Single pass over the test documents: collect every token and each document's length.
all_test_words = []
test_sentence_lengths = []
for tokens in data_test["tokens"]:
    all_test_words.extend(tokens)
    test_sentence_lengths.append(len(tokens))

# Distinct test tokens in sorted order.
TEST_VOCAB = sorted(set(all_test_words))

word_total, vocab_size = len(all_test_words), len(TEST_VOCAB)
print("%s words total, with a vocabulary size of %s" % (word_total, vocab_size))
print("Max sentence length is %s" % max(test_sentence_lengths))

12471 words total, with a vocabulary size of 4077
Max sentence length is 68


In [0]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the vectors of the tokens in one document.

    Args:
        tokens_list: Sequence of tokens for a single document.
        vector: Lookup supporting ``token in vector`` and ``vector[token]``.
        generate_missing: When True, a token absent from ``vector`` contributes
            ``np.random.rand(k)``; otherwise it contributes a zero vector.
        k: Length of the placeholder (and empty-document) vector.

    Returns:
        The element-wise mean of the per-token vectors, or ``np.zeros(k)`` when
        ``tokens_list`` is empty.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Decide once how tokens without a vector are represented.
    make_placeholder = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(make_placeholder(k))

    return np.sum(word_vectors, axis=0) / len(word_vectors)

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per row of ``clean_comments``.

    Args:
        vectors: Word-vector lookup handed to ``get_average_word2vec``.
        clean_comments: Frame whose ``'tokens'`` column holds a token list per row.
        generate_missing: Forwarded to ``get_average_word2vec``.

    Returns:
        A list with one averaged vector per row, in row order.
    """
    return [
        get_average_word2vec(row_tokens, vectors, generate_missing=generate_missing)
        for row_tokens in clean_comments['tokens']
    ]

In [0]:
# One averaged embedding per training document; tokens without a vector get a random placeholder.
# NOTE(review): `training_embeddings` is not read by any later cell visible here — confirm it is
# still needed.
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)


In [0]:
# Length every token-id sequence is padded or truncated to before it enters the network.
# NOTE(review): the longest training document has 96 tokens per the printed stats, so longer
# documents lose tokens at this setting.
MAX_SEQUENCE_LENGTH = 50
# Width of one word vector; presumably matches the loaded embedding model — TODO confirm.
EMBEDDING_DIM = 300

In [0]:
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string


Using TensorFlow backend.


In [0]:
# Materialise the cleaned training texts once; both tokenizer calls below use them.
train_texts = data_train['docs_Clean'].tolist()

# Word-level tokenizer capped at the size of the training vocabulary.
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

# Mapping from token to integer id learned from the training texts.
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 10721 unique tokens.


In [0]:
# Bring every training sequence to exactly MAX_SEQUENCE_LENGTH ids.
# NOTE(review): with Keras' defaults both padding and truncation presumably happen at the start
# of the sequence — confirm that is intended.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [0]:
# One row per token id plus one extra row; rows that are never assigned stay all-zero.
vocab_rows = len(train_word_index) + 1
train_embedding_weights = np.zeros((vocab_rows, EMBEDDING_DIM))

for word, index in train_word_index.items():
    if word in word2vec:
        row = word2vec[word]
    else:
        # Tokens without a pretrained vector get a random row.
        row = np.random.rand(EMBEDDING_DIM)
    train_embedding_weights[index, :] = row

print(train_embedding_weights.shape)


(10722, 300)


In [0]:
# Encode the test texts with the tokenizer fitted on the training texts, then pad them
# to the same fixed length as the training sequences.
test_texts = data_test['docs_Clean'].tolist()
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarise a multi-branch 1-D convolutional classifier.

    Args:
        embeddings: Initial weight matrix for the embedding layer.
        max_sequence_length: Number of token ids in each input sequence.
        num_words: Number of rows of the embedding layer.
        embedding_dim: Width of each embedding vector.
        labels_index: Number of units in the sigmoid output layer.

    Returns:
        The compiled Keras ``Model`` (its summary is printed as a side effect).
    """
    # Frozen lookup table initialised from the supplied matrix.
    embedding_layer = Embedding(
        num_words,
        embedding_dim,
        weights=[embeddings],
        input_length=max_sequence_length,
        trainable=False,
    )

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    def conv_branch(kernel_size):
        # One convolution over the embedded sequence followed by max-over-time pooling.
        features = Conv1D(filters=200, kernel_size=kernel_size, activation='relu')(embedded_sequences)
        return GlobalMaxPooling1D()(features)

    # Parallel branches with kernel widths 2..6, joined along the feature axis.
    pooled_branches = [conv_branch(size) for size in (2, 3, 4, 5, 6)]
    merged = concatenate(pooled_branches, axis=1)

    hidden = Dropout(0.1)(merged)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    # Independent sigmoid per output unit, paired with binary cross-entropy below.
    preds = Dense(labels_index, activation='sigmoid')(hidden)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model

In [0]:
# Everything that is not an identifier or text column is treated as a label column.
feature_columns = ['id', 'docs', 'docs_Clean', 'tokens']
y = data_train.drop(feature_columns, axis=1)
label_names, y_train = y.columns, y.values

In [0]:
# Aliases used by the training cell: padded id sequences as inputs, label matrix as targets.
x_train = train_cnn_data
y_tr = y_train

In [0]:
# The embedding layer needs one row more than the number of known tokens, matching the
# shape of `train_embedding_weights`; the output layer gets one unit per label column.
vocab_size = len(train_word_index) + 1
n_labels = len(label_names)
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, vocab_size, EMBEDDING_DIM, n_labels)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 300)      3216600     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 49, 200)      120200      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 48, 200)      180200      embedding_2[0][0]                
____________________________________________________________________________________________

In [0]:
# Training hyper-parameters passed to `model.fit`.
num_epochs = 3
batch_size = 34

In [0]:
# Train the network; 10% of the training rows are held out for validation (the log shows
# 5366 training and 597 validation samples). `hist` keeps the returned training history.
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Train on 5366 samples, validate on 597 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [0]:
# Per-label sigmoid scores for the held-out test documents
# (presumably shaped (n_test_docs, n_labels) — TODO confirm).
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [0]:
# NOTE: `array` is another name for `predictions`, not a copy — the assignments below overwrite
# `predictions` in place, so later cells that read `predictions` see 0/1 values.
array = predictions
# Order matters: scores >= 0.5 become 1 first, so they are no longer candidates for the
# `< 0.5` pass that follows.
array[array >= 0.5] = 1
array[array < 0.5] = 0
array

array([[0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [0]:
# Peek at the label values of the first test row: the slice drops the first two and the last
# two columns (presumably id/docs and docs_Clean/tokens — verify against the column order).
data_test.values[0][2:-2]

array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 1.0,
       0.0, 0.0], dtype=object)

In [0]:
# Label matrix of the test rows: same positional slice as before, dropping the two leading
# and the two trailing non-label columns (presumably id/docs and docs_Clean/tokens).
y_test = data_test.values[:, 2:-2]

# Threshold here as well, so the score is correct whether or not `predictions` was already
# binarised in place by an earlier cell (values that are already 0/1 are unchanged).
pred_labels = (predictions >= 0.5).astype(predictions.dtype)

# Fail loudly on a shape mismatch instead of silently scoring a truncated subset.
assert y_test.shape == pred_labels.shape, (
    "label matrix %s and predictions %s differ in shape" % (y_test.shape, pred_labels.shape)
)

# Element-wise accuracy over every (document, label) cell; the denominator is derived from
# the data rather than hard-coded.
n_correct = (y_test == pred_labels).sum()
accuracy = n_correct / y_test.size
print(accuracy)

0.9501453163424994
