In [57]:
import pandas as pd
import numpy as np
import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import os, re, csv, math, codecs
from tqdm import tqdm
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt 


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
MAX_NB_WORDS = 100000
max_seq_len = 50
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('russian'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

In [59]:
df = pd.read_csv('/content/12.csv', sep='\t')

In [60]:
df.head()

Unnamed: 0,review,sentiment
0,качество плохое пошив ужасный (горловина напер...,negative
1,"Товар отдали другому человеку, я не получила п...",negative
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative
3,"товар не пришел, продавец продлил защиту без м...",negative
4,"Кофточка голая синтетика, носить не возможно.",negative


In [61]:
df.sentiment.value_counts()

negative    30000
neautral    30000
positive    30000
Name: sentiment, dtype: int64

In [None]:
totalNumWords = [len(line) for line in word_seq_train]
plt.hist(totalNumWords,bins = np.arange(0,410,10))#[0,50,100,150,200,250,300,350,400])#,450,500,550,600,650,700,750,800,850,900])
plt.show()

In [62]:
lables = df['sentiment']
train = df['review']
lables = pd.get_dummies(lables)
lables.head()

Unnamed: 0,neautral,negative,positive
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [63]:
train_x, test_x, train_y, test_y = train_test_split(train, lables, test_size=0.2)

#load embeddings

In [13]:
ft = codecs.open('/content/drive/My Drive/cc.ru.300.vec', encoding='utf-8')

In [14]:
embeddings_index = {}
for line in tqdm(ft):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
ft.close()
print('found %s word vectors' % len(embeddings_index))

2000001it [04:34, 7275.26it/s]

found 2000000 word vectors





In [64]:
processed_train = []
for doc in tqdm(train_x):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_train.append(" ".join(filtered))


processed_test = []
for doc in tqdm(test_x):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_test.append(" ".join(filtered))

print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS+1, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_train + processed_test)
word_seq_train = tokenizer.texts_to_sequences(processed_train)
word_seq_test = tokenizer.texts_to_sequences(processed_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

100%|██████████| 72000/72000 [00:00<00:00, 96154.67it/s]
100%|██████████| 18000/18000 [00:00<00:00, 93810.85it/s]


tokenizing input data...
dictionary size:  53963


#embedding matrix


In [16]:
#training params
batch_size = 500 
num_epochs = 10

#model parameters
num_filters = 64 
embed_dim = 300 
weight_decay = 1e-4

In [17]:
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words+1, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

number of null word embeddings: 10726


#CNN architecture

In [18]:
model = Sequential()
model.add(Embedding(nb_words+1, embed_dim,
          weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(3, activation='softmax'))

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 300)          16189200  
_________________________________________________________________
conv1d (Conv1D)              (None, 100, 64)           134464    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 50, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 50, 64)            28736     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                2

In [19]:
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [20]:
#model training
hist = model.fit(word_seq_train, train_y, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

Epoch 1/8
130/130 - 7s - loss: 0.8414 - accuracy: 0.6095 - val_loss: 0.6838 - val_accuracy: 0.7053
Epoch 2/8
130/130 - 6s - loss: 0.6815 - accuracy: 0.7029 - val_loss: 0.6430 - val_accuracy: 0.7219
Epoch 3/8
130/130 - 6s - loss: 0.6446 - accuracy: 0.7194 - val_loss: 0.6239 - val_accuracy: 0.7319
Epoch 4/8
130/130 - 6s - loss: 0.6111 - accuracy: 0.7360 - val_loss: 0.6219 - val_accuracy: 0.7290
Epoch 5/8
130/130 - 6s - loss: 0.5910 - accuracy: 0.7462 - val_loss: 0.6164 - val_accuracy: 0.7350
Epoch 6/8
130/130 - 6s - loss: 0.5665 - accuracy: 0.7583 - val_loss: 0.6185 - val_accuracy: 0.7287
Epoch 7/8
130/130 - 6s - loss: 0.5379 - accuracy: 0.7725 - val_loss: 0.6353 - val_accuracy: 0.7278
Epoch 00007: early stopping


In [44]:
pred = model.predict(word_seq_test)
pred

array([[7.5001454e-01, 2.0126955e-01, 4.8715983e-02],
       [8.5546679e-05, 1.2468744e-06, 9.9991322e-01],
       [5.0836784e-01, 2.6763341e-01, 2.2399874e-01],
       ...,
       [5.2264196e-01, 3.6199021e-01, 1.1536783e-01],
       [3.4641147e-01, 6.0632288e-01, 4.7265615e-02],
       [5.2597290e-01, 4.4055313e-01, 3.3473931e-02]], dtype=float32)

In [45]:
pred = np.argmax(pred, axis = 1)
pred

array([0, 2, 0, ..., 0, 1, 0])

In [46]:
true = test_y.values
true = np.argmax(true, axis = 1)
true

array([0, 2, 2, ..., 0, 1, 0])

In [51]:
from sklearn.metrics import f1_score
print('F1-Score:', f1_score(true, pred, average='macro'))

F1-Score: 0.7281990261498278
