In [1]:
import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.metrics import categorical_crossentropy
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('otziv.csv', sep=';')
df = df[['Rating', 'Content']]
df

Unnamed: 0,Rating,Content
0,5,It just works!
1,4,В целом удобноное приложение...из минусов хотя...
2,5,Отлично все
3,5,Стал зависать на 1% работы антивируса. Дальше ...
4,5,"Очень удобно, работает быстро."
...,...,...
20654,1,"Ну и шляпа,с роот правами бесполезная прога,ра..."
20655,5,Ок
20656,4,Доволен
20657,1,"Песопаснасть, рут ни нужын"


In [3]:
df_train = df[:15000]
df_train

Unnamed: 0,Rating,Content
0,5,It just works!
1,4,В целом удобноное приложение...из минусов хотя...
2,5,Отлично все
3,5,Стал зависать на 1% работы антивируса. Дальше ...
4,5,"Очень удобно, работает быстро."
...,...,...
14995,5,Люблю сбербанк
14996,3,Хорошее приложение
14997,5,Всё огонь!
14998,1,"Плохо пишет,что в телефоне есть рут,а его срод..."


In [4]:
df_val = df[15000:]
df_val

Unnamed: 0,Rating,Content
15000,5,Топчик.
15001,5,Супер!!!
15002,5,Прекрасно. Мне нравиться
15003,5,Удобно управлять счетами
15004,4,Очень медленно работает.
...,...,...
20654,1,"Ну и шляпа,с роот правами бесполезная прога,ра..."
20655,5,Ок
20656,4,Доволен
20657,1,"Песопаснасть, рут ни нужын"


In [5]:
max_words = 10000
max_len = 40
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [6]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['Content'] = df_train['Content'].apply(preprocess_text)
df_val['Content'] = df_val['Content'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Content'] = df_train['Content'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['Content'] = df_val['Content'].apply(preprocess_text)


In [7]:
train_corpus = " ".join(df_train['Content'])
train_corpus = train_corpus.lower()

In [8]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [10]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [11]:
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'работать',
 'удобный',
 'отлично',
 'нравиться',
 'хороший',
 'отличный',
 'супер',
 'телефон']

In [12]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [13]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [14]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train['Content']], dtype=np.int32)
x_val = np.asarray([text_to_sequence(text, max_len) for text in df_val['Content']], dtype=np.int32)

In [15]:
x_train.shape

(15000, 40)

In [16]:
max_len

40

In [17]:
x_train[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,  101, 3525, 3526,  118,  168,  116,
         88, 1187,  477, 3527,   15,  537,  458])

In [18]:
df_train['Rating'] = df_train['Rating']-1
df_val['Rating'] = df_val['Rating']-1
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Rating'] = df_train['Rating']-1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['Rating'] = df_val['Rating']-1


Unnamed: 0,Rating,Content
0,4,it just works
1,3,целое удобноной приложениеиз минус хотеть боль...
2,4,отлично
3,4,зависать 1 работа антивирус ранее пользоваться...
4,4,удобно работать быстро
...,...,...
14995,4,любить сбербанк
14996,2,хороший приложение
14997,4,огонь
14998,0,плохо пишетчто телефон рута сроду сдесьнебыть


In [19]:
df_val

Unnamed: 0,Rating,Content
15000,4,топчик
15001,4,супер
15002,4,нравиться
15003,4,удобно управлять счёт
15004,3,медленно работать
...,...,...
20654,0,шляпас роот право бесполезный прогаразрабыв ох...
20655,4,около
20656,3,довольный
20657,0,песопаснастя рута нужын


In [20]:
df_train

Unnamed: 0,Rating,Content
0,4,it just works
1,3,целое удобноной приложениеиз минус хотеть боль...
2,4,отлично
3,4,зависать 1 работа антивирус ранее пользоваться...
4,4,удобно работать быстро
...,...,...
14995,4,любить сбербанк
14996,2,хороший приложение
14997,4,огонь
14998,0,плохо пишетчто телефон рута сроду сдесьнебыть


In [21]:
num_classes = 5
y_train = keras.utils.to_categorical(df_train['Rating'], num_classes)
y_val = keras.utils.to_categorical(df_val['Rating'], num_classes)

In [22]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(100))
model.add(Dense(50))
model.add(Dense(10))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 128)           1280000   
                                                                 
 conv1d (Conv1D)             (None, 38, 128)           49280     
                                                                 
 activation (Activation)     (None, 38, 128)           0         
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 100)               12900     
                                                                 
 dense_1 (Dense)             (None, 50)                5050      
                                                        

In [24]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [25]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [26]:
score = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.7621209025382996
Test accuracy: 0.7432408332824707


### Задание 2, с загруженными эмбеддингами обучить

In [27]:
import zipfile
import wget
import gensim
model_url = 'http://vectors.nlpl.eu/repository/11/180.zip'
m = wget.download(model_url)
model_file = model_url.split('/')[-1]
with zipfile.ZipFile(model_file, 'r') as archive:
    stream = archive.open('model.bin')
    word2vec = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

In [28]:
import razdel
from pymystem3 import Mystem
word2vec_mystem = Mystem(entire_input=False)


def tokenize_with_mystem_pos(text):
    result = []
    for item in word2vec_mystem.analyze(text):
        if item['analysis']:
            lemma = item['analysis'][0]['lex']
            pos = re.split('[=,]', item['analysis'][0]['gr'])[0]
            token = f'{lemma}_{pos}'
        else:
            token = f'{item["text"]}_UNKN'

        result.append(token)

    return result

def tokenize_with_razdel(text):
    return [token.text for token in razdel.tokenize(text)]

def my_tokenizer(text):

    tokens = ['']
    #tokens= []

    text=text.lower()

    text = re.sub('[^а-яa-zё]+', ' ', text)
    text = re.sub('ё', 'е', text)
    text = re.sub('\s+', ' ', text)

    text = tokenize_with_mystem_pos(text)

    tokens = ' '.join(text)
    return tokens

In [29]:
df_train['ready_text'] = df_train.Content.apply(my_tokenizer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['ready_text'] = df_train.Content.apply(my_tokenizer)


In [30]:
df_val['ready_text'] = df_val.Content.apply(my_tokenizer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['ready_text'] = df_val.Content.apply(my_tokenizer)


In [32]:
tokenizer = Tokenizer(num_words=max_words, oov_token='', filters='', lower=False)
tokenizer.fit_on_texts(df_train.ready_text.tolist())
word_index = tokenizer.word_index
word_index[''] = 0

In [35]:
embedding_size = 300
def load_rusvec_embeddings(word2vec):
    embedding_matrix = np.random.uniform(-1, 1, size=(max_words, embedding_size))
    num_loaded = 0
    for w, i in word_index.items():
        try:
            v = word2vec[w]
            print(w)
            print(v)
        except KeyError:
            v = None
            print(w)
            print(v)
        if v is not None and i < max_words:
            embedding_matrix[i] = v
            num_loaded += 1

        #if v is None:
        #    print(w)
    print('Successfully loaded pretrained embeddings for '
          f'{num_loaded}/{max_words} words.')
    embedding_matrix = embedding_matrix.astype(np.float32)
    return embedding_matrix

In [36]:
embedding_matrix = load_rusvec_embeddings(word2vec)


None
приложение_S
None
удобно_ADV
[-0.62151915  0.04893227 -1.9934713  -0.4722305   1.7025783  -0.5086585
 -0.5579546  -0.83951277  0.3853578  -1.3650877   1.358746    1.8705509
  3.0795426  -1.015068   -0.7512511  -1.128863    1.5590192   1.1735799
 -0.4177923   0.77855504 -0.6393769  -0.04846664 -0.03545188  0.06417493
  1.0704358   0.69811356  0.76452506 -0.44425392 -1.2140785  -0.36362588
 -0.41581798 -0.49783468 -0.4509532   0.37100118 -1.2678673   0.70975304
 -0.00378405  1.1042509  -1.221901   -0.09300517  0.40559405  0.08905818
  0.23150606 -2.2871182  -2.067834   -1.3714038   1.2311436   0.86476415
 -1.3364534   2.0246778  -1.4612374  -0.283727    0.37013867  0.7730288
 -2.0762694  -0.22139698 -2.5252886  -1.3957298   0.26774588  1.1909195
 -0.8288046  -1.8730054   1.8390712   0.541051   -1.5269736  -1.4058574
  0.51611644  0.90383273 -0.04886026 -3.5356798   0.7882436  -0.07267711
 -0.7124117  -1.0933892  -0.25837564 -2.8175726   1.2963794   2.3621378
 -1.3008729  -0.912426 

In [39]:
model_1 = Sequential()
model_1.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len, weights=[embedding_matrix], trainable=False))
model_1.add(Conv1D(300, 3))
model_1.add(Activation("relu"))
model_1.add(GlobalMaxPool1D())
model_1.add(Dense(100))
model_1.add(Dense(50))
model_1.add(Dense(10))
model_1.add(Dense(num_classes))
model_1.add(Activation('softmax'))

In [40]:
model_1.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 40, 300)           3000000   
                                                                 
 conv1d_1 (Conv1D)           (None, 38, 300)           270300    
                                                                 
 activation_2 (Activation)   (None, 38, 300)           0         
                                                                 
 global_max_pooling1d_1 (Glo  (None, 300)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 100)               30100     
                                                                 
 dense_5 (Dense)             (None, 50)                5050      
                                                      

In [41]:
model_1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [42]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')


history = model_1.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [43]:
score_1 = model_1.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score_1[0])
print('Test accuracy:', score_1[1])



Test score: 0.8269685506820679
Test accuracy: 0.710726261138916


Пока скор без выгруженных эмбеддингов ниже чем с ними. Но надо делать более сильную обработку, т.к слова "слиплись" в словаре токенайзера из-за этого некорректно пока отрабатывает. Но если даже с нерабочими весами отрабатывает не сильно хуже, то с нормальными эмбеддингами в качестве скорее всего будет прирост