In [74]:
import pandas as pd
import numpy as np
import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import os, re, csv, math, codecs
from tqdm import tqdm
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt 


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
MAX_NB_WORDS = 100000
max_seq_len = 50
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('russian'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

In [76]:
df = pd.read_csv('/content/data.csv', sep='\t')

In [77]:
df.head()

Unnamed: 0,review,sentiment
0,качество плохое пошив ужасный (горловина напер...,negative
1,"Товар отдали другому человеку, я не получила п...",negative
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative
3,"товар не пришел, продавец продлил защиту без м...",negative
4,"Кофточка голая синтетика, носить не возможно.",negative


In [78]:
df.sentiment.value_counts()

negative    30000
neautral    30000
positive    30000
Name: sentiment, dtype: int64

In [79]:
lables = df['sentiment']
train = df['review']
lables = pd.get_dummies(lables)
lables.head()

Unnamed: 0,neautral,negative,positive
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [80]:
train_x, test_x, train_y, test_y = train_test_split(train, lables, test_size=0.2)

#load embeddings

In [13]:
ft = codecs.open('/content/drive/My Drive/cc.ru.300.vec', encoding='utf-8')

In [14]:
embeddings_index = {}
for line in tqdm(ft):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
ft.close()
print('found %s word vectors' % len(embeddings_index))

2000001it [04:34, 7275.26it/s]

found 2000000 word vectors





In [81]:
processed_train = []
for doc in tqdm(train_x):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_train.append(" ".join(filtered))


processed_test = []
for doc in tqdm(test_x):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_test.append(" ".join(filtered))

print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS+1, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_train + processed_test)
word_seq_train = tokenizer.texts_to_sequences(processed_train)
word_seq_test = tokenizer.texts_to_sequences(processed_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

100%|██████████| 72000/72000 [00:00<00:00, 91996.18it/s]
100%|██████████| 18000/18000 [00:00<00:00, 90375.01it/s]


tokenizing input data...
dictionary size:  53963


#embedding matrix


In [115]:
#training params
batch_size = 250 
num_epochs = 10

#model parameters
num_filters = 40
embed_dim = 300 
weight_decay = 1e-4

In [116]:
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words+1, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

number of null word embeddings: 10726


#CNN architecture

In [117]:
model = Sequential()
model.add(Embedding(nb_words+1, embed_dim,
          weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(3, activation='softmax'))

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 300)           16189200  
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 50, 40)            84040     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 25, 40)            0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 25, 40)            11240     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 40)                0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)               

In [118]:
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [119]:
#model training
hist = model.fit(word_seq_train, train_y, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

Epoch 1/10
260/260 - 5s - loss: 0.8289 - accuracy: 0.6175 - val_loss: 0.7212 - val_accuracy: 0.6808
Epoch 2/10
260/260 - 4s - loss: 0.6829 - accuracy: 0.7039 - val_loss: 0.6560 - val_accuracy: 0.7151
Epoch 3/10
260/260 - 4s - loss: 0.6478 - accuracy: 0.7177 - val_loss: 0.6510 - val_accuracy: 0.7190
Epoch 4/10
260/260 - 4s - loss: 0.6225 - accuracy: 0.7298 - val_loss: 0.6500 - val_accuracy: 0.7168
Epoch 5/10
260/260 - 4s - loss: 0.6005 - accuracy: 0.7403 - val_loss: 0.6292 - val_accuracy: 0.7258
Epoch 6/10
260/260 - 4s - loss: 0.5791 - accuracy: 0.7501 - val_loss: 0.6411 - val_accuracy: 0.7215
Epoch 7/10
260/260 - 4s - loss: 0.5631 - accuracy: 0.7583 - val_loss: 0.6419 - val_accuracy: 0.7143
Epoch 8/10
260/260 - 4s - loss: 0.5440 - accuracy: 0.7683 - val_loss: 0.6389 - val_accuracy: 0.7235
Epoch 9/10
260/260 - 4s - loss: 0.5295 - accuracy: 0.7743 - val_loss: 0.6499 - val_accuracy: 0.7186
Epoch 00009: early stopping


In [120]:
pred = model.predict(word_seq_test)
pred

array([[8.6628661e-02, 6.3616121e-03, 9.0700972e-01],
       [1.9169942e-01, 8.0569988e-01, 2.6006286e-03],
       [1.9810115e-01, 3.9553426e-02, 7.6234543e-01],
       ...,
       [4.4333735e-03, 4.4148066e-04, 9.9512511e-01],
       [2.5895584e-01, 7.2648174e-01, 1.4562482e-02],
       [6.6498423e-01, 3.0386439e-01, 3.1151427e-02]], dtype=float32)

In [121]:
pred = np.argmax(pred, axis = 1)
pred

array([2, 1, 2, ..., 2, 1, 0])

In [122]:
true = test_y.values
true = np.argmax(true, axis = 1)
true

array([2, 0, 2, ..., 2, 1, 0])

In [123]:
from sklearn.metrics import f1_score
print('F1-Score:', f1_score(true, pred, average='macro'))

F1-Score: 0.7223400470856381
