In [1]:
import numpy
import pandas as pd
import nltk
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import TfidfVectorizer


Using TensorFlow backend.


In [2]:
#(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=5000)
#X_train

In [3]:
#X_train[1]

In [4]:
# Read the tab-separated review file (it has no header row) and name its two columns.
data = pd.read_csv('reviews.txt', sep="\t", header=None)
data.columns = ['review', 'degree']

# The rating label is taken from the first character of every review string.
reviews = data['review'].copy()
degree = reviews.apply(lambda raw: raw[0])

data['review'] = reviews.copy()
data['degree'] = degree.copy()

# Tokenise with NLTK, re-join the tokens with single spaces, then slice off the
# leading label character.
data['review'] = data['review'].apply(
    lambda raw: ' '.join(nltk.word_tokenize(raw))[1:]
)

# `reviews` and `degree` are deliberately left defined (not deleted).
data.head(5)
# Rating scale (the label is stored in `degree` as a one-character string):
# 0 	negative
# 1 	somewhat negative
# 2 	neutral
# 3 	somewhat positive
# 4 	positive

# One-hot encode the rating into five float indicator columns.
# `degree` holds strings, so every comparison must be against a string: comparing
# with the integer 0 never matches and would leave the 'negative' column all zeros.
data['negative'] = (data.degree == '0').astype(float)
data['somewhat negative'] = (data.degree == '1').astype(float)
data['neutral'] = (data.degree == '2').astype(float)
data['somewhat positive'] = (data.degree == '3').astype(float)
data['positive'] = (data.degree == '4').astype(float)

In [5]:
# Fit a bag-of-words vectoriser on the tokenised reviews. Only its learned
# `vocabulary_` mapping (token -> integer index) is used by the later cells.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv.fit(data.review)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
# Number of distinct tokens in the fitted vocabulary; used further down as the
# first argument (input dimension) of the Embedding layer.
top_words = len(cv.vocabulary_)
top_words

15164

In [7]:
# Encode every review as the list of vocabulary indices of its tokens.
# Tokens that are not in the vocabulary (e.g. punctuation) are skipped.
sent_to_vec = []
for s in data['review']:
    new_sent = []
    for w in nltk.word_tokenize(s):
        # The vectoriser was fitted with lowercase=True, so its vocabulary keys are
        # lower-case; look tokens up in lower case so capitalised words are kept.
        # dict.get replaces the former bare `except: pass`, which hid every error,
        # not just a missing key.
        idx = cv.vocabulary_.get(w.lower())
        if idx is not None:
            new_sent.append(idx)
    sent_to_vec.append(new_sent)

print(sent_to_vec[:10])

[[11783, 9184, 4554, 3473, 13440, 287, 13438, 14798, 7182, 5793, 5298, 13440, 5809, 7182, 526, 5793, 5298, 13440, 5568, 12367, 9184, 14815, 9161, 598, 1868, 9042, 9184, 14815, 589, 13616, 8765, 9184, 12799], [10535, 7110, 620, 4477, 6809, 7182, 15021, 11696], [4918, 9184, 14993, 13129, 15025, 6182, 6128, 13579, 12103, 13533, 13491, 9228], [10073, 13526, 2559, 9184, 4598, 620, 503, 13440, 7100, 1340, 3338, 620, 8789, 9184, 13732, 9282, 7364, 12315, 9252], [620, 8174, 14840], [9184, 8918, 4511, 10355, 11294, 6754, 12075, 9712, 1883, 13440, 13609, 2165, 14073, 8469, 3096], [7182, 9947, 8427], [9184, 12313, 13475, 14934, 14932, 7194, 9923, 7827, 10685, 5462], [7194, 3901, 7714, 15109, 14934, 8765], [2965, 6168, 7194, 5298, 13440, 11441, 10709]]


In [8]:
# Replace the review text with the integer index lists held in `sent_to_vec`.
# NOTE(review): this overwrites the column in place, so the cell that builds
# `sent_to_vec` from `data['review']` presumably cannot be re-run afterwards
# without reloading `data` — confirm.
data['review'] = sent_to_vec
data.review.head(5)

0    [11783, 9184, 4554, 3473, 13440, 287, 13438, 1...
1    [10535, 7110, 620, 4477, 6809, 7182, 15021, 11...
2    [4918, 9184, 14993, 13129, 15025, 6182, 6128, ...
3    [10073, 13526, 2559, 9184, 4598, 620, 503, 134...
4                                   [620, 8174, 14840]
Name: review, dtype: object

In [9]:
# Length of the longest encoded review; used below as the padding length and as
# the Embedding layer's input_length.
max_review_length = data.review.str.len().max()
max_review_length

43

In [10]:
# Features: the encoded reviews. Targets: every column left after dropping
# 'review' and 'degree', i.e. the one-hot rating indicator columns.
X = data.review
y = data.drop(['review', 'degree'],axis = 1)
y.head(5)




Unnamed: 0,negative,somewhat negative,neutral,somewhat positive,positive
0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0


In [26]:
# Split into train and test sets, stratified on the label columns. No test_size is
# passed, so scikit-learn's default split proportion applies; random_state fixes
# the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=36)



In [27]:
# Seed NumPy's global random generator.
numpy.random.seed(7)

# Pad both splits to the same fixed length (train first, then test) so they form
# rectangular int32 arrays.
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_review_length, dtype='int32')
    for split in (X_train, X_test)
)


In [28]:
# Show the shapes of the padded train and test arrays, one per line.
print(X_train.shape, X_test.shape, sep='\n')


(6396, 43)
(2133, 43)


In [65]:
from keras.layers import Dropout
from keras.regularizers import l2

# Network: embedding -> dropout -> L2-regularised LSTM -> dropout -> softmax over
# the five rating classes.
num_classes = 5
embedding_vecor_length = 32  # (sic) name kept unchanged in case other cells refer to it
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Dropout(0.1))
# Keras 2 keyword names: kernel_regularizer / recurrent_regularizer replace the
# deprecated Keras 1 spellings W_regularizer / U_regularizer.
model.add(LSTM(20, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.001)))
model.add(Dropout(0.6))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped in
# print() (that printed a stray "None").
model.summary()
# `epochs` is the Keras 2 name for the deprecated `nb_epoch`.
model.fit(X_train, y_train, epochs=55, batch_size=64)
# Final evaluation of the model on the held-out split; scores[1] is the accuracy
# metric requested in compile().
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

  if __name__ == '__main__':


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_48 (Embedding)     (None, 43, 32)            485248    
_________________________________________________________________
dropout_98 (Dropout)         (None, 43, 32)            0         
_________________________________________________________________
lstm_63 (LSTM)               (None, 20)                4240      
_________________________________________________________________
dropout_99 (Dropout)         (None, 20)                0         
_________________________________________________________________
dense_48 (Dense)             (None, 5)                 105       
Total params: 489,593
Trainable params: 489,593
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/55
Epoch 2/55
Epoch 3/55
Epoch 4/55
Epoch 5/55
Epoch 6/55
Epoch 7/55
Epoch 8/55
Epoch 9/55
Epoch 10/55
Epoch 11/55
Epoch 12

In [None]:
# Print the installed Keras version.
import keras
print(keras.__version__)