In [1]:
from os import listdir
from os.path import isfile, isdir, join

import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.models import Sequential, load_model
from keras.layers import  CuDNNLSTM,Bidirectional, GlobalMaxPool1D,Input,Conv1D,MaxPooling1D,Embedding,Reshape,TimeDistributed,Dense, Activation, LSTM, SimpleRNN, Dropout  
from keras.utils import to_categorical
from keras.utils import multi_gpu_model
from keras.optimizers import SGD,Adam,RMSprop,Adadelta
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Load the full dataset; later cells read its 'comm' (text) and 'label' columns.
review = pd.read_csv('all_data.csv')

In [3]:
# Show how many rows fall under each label value to check the class balance.
review['label'].value_counts()

1    504291
0    431145
Name: label, dtype: int64

In [4]:
# Parse the pre-trained GloVe text file into a {word: vector} lookup.
# Each line is split on whitespace: the first token is the word and the
# remaining tokens are parsed as float32 coefficients.
embeddings_index = {}
# `with` closes the handle even if a malformed line raises part-way through.
with open('glove.6B.200d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [5]:
# Sanity check: report how many word vectors were parsed from the GloVe file.
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [6]:
# Hyper-parameters shared by the tokenization, padding and embedding cells.
# Sequence length passed to pad_sequences(maxlen=...) and Embedding(input_length=...).
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap passed to Tokenizer(num_words=...) and used when filling the embedding matrix.
MAX_NB_WORDS = 20000
# Width of each embedding row; matches the 200-dimensional GloVe file (glove.6B.200d.txt).
EMBEDDING_DIM = 200

In [7]:
# Lower-cased comment strings (non-string cells are stringified first) and
# their integer labels, kept as two parallel Python lists.
texts = [str(comment).lower() for comment in review['comm']]
label = [int(value) for value in review['label']]

In [8]:
# Build the vocabulary from the lower-cased texts (capped via num_words=MAX_NB_WORDS),
# then convert every text into a list of integer word indices.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [10]:
# import pickle
# pickle.dump(tokenizer, open( "tokenizer.p", "wb" ))

In [9]:
# word_index is the tokenizer's word -> integer index mapping. Its size can exceed
# MAX_NB_WORDS (the recorded output reports 76753 unique tokens).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 76753 unique tokens.


In [10]:
# Bring every index sequence to MAX_SEQUENCE_LENGTH so the model gets fixed-width input.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Number of embedding rows: the capped vocabulary size plus one.
# NOTE(review): the +1 presumably accounts for index 0 being the padding value — confirm against the Keras Tokenizer docs.
num_words = min(MAX_NB_WORDS, len(word_index)) +1
# Starts as all zeros with shape (num_words, EMBEDDING_DIM); rows are filled from GloVe in a following cell.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

In [None]:
# embeddings_index.get('the')

In [11]:
# Copy the GloVe vector of every in-vocabulary word into its row of the matrix.
# Words whose index is at or beyond MAX_NB_WORDS are skipped, and words with no
# GloVe entry keep their all-zero row.
for token, row in word_index.items():
    if row < MAX_NB_WORDS:
        glove_vector = embeddings_index.get(token)
        if glove_vector is not None:
            embedding_matrix[row] = glove_vector

In [20]:
# Architecture: frozen pre-trained embedding -> bidirectional CuDNN LSTM ->
# global max-pool over the sequence -> dense head with a sigmoid output.
# The single-device network keeps its own name because the Keras docs advise
# saving through the template model rather than the multi-GPU wrapper.
template_model = Sequential()
template_model.add(Embedding(num_words,
                             EMBEDDING_DIM,
                             weights=[embedding_matrix],
                             input_length=MAX_SEQUENCE_LENGTH,
                             trainable=False))  # keep the GloVe vectors fixed during training
template_model.add(Bidirectional(CuDNNLSTM(512, return_sequences=True)))
template_model.add(GlobalMaxPool1D())
template_model.add(Dense(256, activation='relu'))
template_model.add(Dropout(0.3))
template_model.add(Dense(1, activation='sigmoid'))

# Replicate the template across 4 GPUs; `model` stays the name later cells train and save.
model = multi_gpu_model(template_model, gpus=4)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# emitted a stray "None" after the table.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_4_input (InputLayer)  (None, 1000)         0                                            
__________________________________________________________________________________________________
lambda_5 (Lambda)               (None, 1000)         0           embedding_4_input[0][0]          
__________________________________________________________________________________________________
lambda_6 (Lambda)               (None, 1000)         0           embedding_4_input[0][0]          
__________________________________________________________________________________________________
lambda_7 (Lambda)               (None, 1000)         0           embedding_4_input[0][0]          
__________________________________________________________________________________________________
lambda_8 (

In [17]:
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard

In [15]:
from keras.utils import multi_gpu_model

In [18]:
# Checkpoint file-name template; the epoch number and validation accuracy are filled in
# (the recorded training log shows names such as weights-01-0.91.hdf5).
filepath="weights-{epoch:02d}-{val_acc:.2f}.hdf5"
# Write a checkpoint every epoch (period=1). With save_best_only=False every epoch is
# saved, so monitor/mode do not filter which checkpoints are kept.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=False, mode='max',  period=1)

In [21]:
# Train for 15 epochs with 1% of the data held out for validation. TensorBoard
# logs go to 'log_dir' and the checkpoint callback runs alongside it.
fit_callbacks = [TensorBoard(log_dir='log_dir'), checkpoint]
model.fit(
    data,
    label,
    batch_size=2048,
    epochs=15,
    callbacks=fit_callbacks,
    validation_split=0.01,
)

Train on 926081 samples, validate on 9355 samples
Epoch 1/15

Epoch 00001: saving model to weights-01-0.91.hdf5
Epoch 2/15

Epoch 00002: saving model to weights-02-0.92.hdf5
Epoch 3/15

Epoch 00003: saving model to weights-03-0.92.hdf5
Epoch 4/15

Epoch 00004: saving model to weights-04-0.92.hdf5
Epoch 5/15

Epoch 00005: saving model to weights-05-0.92.hdf5
Epoch 6/15

Epoch 00006: saving model to weights-06-0.92.hdf5
Epoch 7/15

Epoch 00007: saving model to weights-07-0.92.hdf5
Epoch 8/15

Epoch 00008: saving model to weights-08-0.91.hdf5
Epoch 9/15

Epoch 00009: saving model to weights-09-0.92.hdf5
Epoch 10/15

Epoch 00010: saving model to weights-10-0.92.hdf5
Epoch 11/15

Epoch 00011: saving model to weights-11-0.91.hdf5
Epoch 12/15

Epoch 00012: saving model to weights-12-0.89.hdf5
Epoch 13/15

Epoch 00013: saving model to weights-13-0.91.hdf5
Epoch 14/15

Epoch 00014: saving model to weights-14-0.90.hdf5
Epoch 15/15

Epoch 00015: saving model to weights-15-0.90.hdf5


<keras.callbacks.History at 0x7f0bca9c23c8>

In [None]:
# model.fit(data, label, batch_size=128, epochs=15, callbacks=[TensorBoard(log_dir='log_dir'),checkpoint],validation_split=0.01)

In [20]:
# model.fit(data, label, batch_size=128, epochs=5,validation_split=0.2)

Train on 748348 samples, validate on 187088 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8eed43ec18>

In [27]:
# Persist the trained model to an HDF5 file.
# NOTE(review): `model` is the object returned by multi_gpu_model; the Keras docs
# recommend saving the template model instead — confirm this file reloads as intended.
model.save('sentimental4.h5')

In [None]:
# Reload a previously saved model from disk (load_model is also imported in the first cell).
from keras.models import load_model
# NOTE(review): this reads 'sentimental.h5' while the save cell writes 'sentimental4.h5' —
# confirm which file is intended.
model =  load_model('sentimental.h5')

In [None]:
# import re
# def remove_emoji(text):
#     emoji_pattern = re.compile(
#         "["
#         "\U0001F600-\U0001F64F"  # emoticons
#         "\U0001F300-\U0001F5FF"  # symbols & pictographs
#         "\U0001F680-\U0001F6FF"  # transport & map symbols
#         "\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            "]+"
#        , flags=re.UNICODE)
#     return emoji_pattern.sub(r'', text)

In [None]:
# path = 'data'
# file = listdir(path)
# raw_data= {'label':[],'comm':[]}
# for i in tqdm(file):
#         fullpath = join(path,i)
#         fullpath = fullpath.replace('\\', '/')  # normalize Windows path separators
#         review = pd.read_csv(fullpath)
#         for label,comm in zip(review['label'],review['comm'] ) :
#             label = int(label)
#             comm = remove_emoji(str(comm))
#             raw_data['label'].append(label)
#             raw_data['comm'].append(comm)

In [None]:
# df = pd.DataFrame(raw_data, columns = ['label','comm'])
# df.to_csv('all_data.csv',index = False)