In [10]:
import numpy as np 
import pandas as pd 

from math import sqrt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, GRU, Bidirectional, Embedding, Dropout, Activation, GlobalMaxPool1D, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.utils.np_utils import to_categorical

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [28]:
train_data = pd.read_csv('train_after_nltk_with_negative.csv', index_col=0)
test_data = pd.read_csv('test_after_nltk_with_negative.csv', index_col=0)
train_data['text'] = train_data['text'].astype(str)
test_data['text'] = test_data['text'].astype(str)
train_data_text = train_data['text'].values
test_data_text =  test_data['text'].values
train_label = train_data['stars']
test_Idx = test_data[['review_id']]

In [None]:
As_regression_problem = 0 # Set 1 if see this problem as regression problem; otherwise, set 0 as classification problem

In [29]:
if As_regression_problem == 1:
    train_label = train_data['stars'].astype(float)
    train_label=np.array(train_label).reshape(-1,1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_label = scaler.fit_transform(train_label)

In [30]:
if As_regression_problem == 0:
# Encoding for label
    label = list(np.unique(train_label))
    le = LabelEncoder()
    le.fit(label)
    num_labels = len(label)
    train_label = to_categorical(train_label.map(lambda x: le.transform([x])[0]), num_labels)

In [35]:
# Tokenize the text
MAX_NUM_WORDS=1000 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH=100 # max number of words in a review to use

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_data_text)

train_sequences = tokenizer.texts_to_sequences(train_data_text)
test_sequences = tokenizer.texts_to_sequences(test_data_text)

train_data_text_vector = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data_text_vector = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print('Shape of train data tensor:', train_data_text_vector.shape)
print('Shape of train label tensor:', train_label.shape)
print('Shape of test data tensor:', test_data_text_vector.shape)

Found 20593 unique tokens.
Shape of train data tensor: (7997, 100)
Shape of train label tensor: (7997, 5)
Shape of test data tensor: (2003, 100)


In [36]:
# Random suffle and split the data into training/validation
VALIDATION_SPLIT=0.2

indices = np.arange(train_data_text_vector.shape[0])
np.random.shuffle(indices)
train_data_text_vector = train_data_text_vector[indices]
train_label = train_label[indices]
nb_validation_samples = int(VALIDATION_SPLIT * train_data_text_vector.shape[0])

x_train = train_data_text_vector[:-nb_validation_samples]
y_train = train_label[:-nb_validation_samples]
x_val = train_data_text_vector[-nb_validation_samples:]
y_val = train_label[-nb_validation_samples:]

In [38]:
# Build the LSTM Model
embedding_vecor_length = 128
lstm_out = 128
batch_size = 256

model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, embedding_vecor_length, input_length = MAX_SEQUENCE_LENGTH, dropout = 0.2))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(GRU(lstm_out, dropout=0.2, recurrent_dropout=0.2))
if As_regression_problem == 0:
    model.add(Dense(num_labels,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
else:
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam',metrics = ['accuracy'])
print(model.summary())



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 128)          128000    
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 100, 128)          49280     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 50, 128)           0         
_________________________________________________________________
gru_10 (GRU)                 (None, 128)               98688     
_________________________________________________________________
dense_10 (Dense)             (None, 5)                 645       
Total params: 276,613
Trainable params: 276,613
Non-trainable params: 0
_________________________________________________________________
None


In [39]:
# Train the model. Though the overfitting happens when validation loss keeps increasing but training loss keeps decreasing
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size = batch_size, epochs = 100, verbose = 2)

Train on 6398 samples, validate on 1599 samples
Epoch 1/100
 - 11s - loss: 1.4747 - acc: 0.3460 - val_loss: 1.4475 - val_acc: 0.3421
Epoch 2/100
 - 9s - loss: 1.4185 - acc: 0.3654 - val_loss: 1.4401 - val_acc: 0.3346
Epoch 3/100
 - 9s - loss: 1.3948 - acc: 0.3865 - val_loss: 1.4517 - val_acc: 0.3290
Epoch 4/100
 - 9s - loss: 1.3583 - acc: 0.4276 - val_loss: 1.4821 - val_acc: 0.3358
Epoch 5/100
 - 9s - loss: 1.3014 - acc: 0.4537 - val_loss: 1.5489 - val_acc: 0.3377
Epoch 6/100


KeyboardInterrupt: 

In [9]:
score,acc = model.evaluate(x_val, y_val, verbose = 2, batch_size = batch_size)
print("Score: %.2f" % (score))
print("Validation Accuracy: %.2f%%" % (acc * 100.0))

Score: 0.08
Validation Accuracy: 36.02%


In [10]:
# Evaluate the model
if As_regression_problem == 0:
    y_val_pred = model.predict_classes(x_val)
    rmse = sqrt(mean_squared_error(y_val.argmax(axis=1)+1, y_val_pred+1))
    accuracy = accuracy_score(y_val.argmax(axis=1)+1, y_val_pred+1)
else:
    y_val_pred = model.predict(x_val)
    y_val = scaler.inverse_transform(y_val)
    y_val_pred = scaler.inverse_transform(y_val_pred)
    rmse = sqrt(mean_squared_error(y_val, y_val_pred))
    accuracy = accuracy_score(np.rint(y_val).astype(int), np.rint(y_val_pred).astype(int))
print("RMSE: %.2f" % rmse)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

RMSE: 1.58
Accuracy: 36.02%


In [24]:
# Predict on the test data
if As_regression_problem == 0:
    y_test_pred = model.predict_classes(test_data_text_vector)
else:
    y_test_pred = model.predict(test_data_text_vector)
    y_test_pred = scaler.inverse_transform(y_test_pred)
    y_test_pred = y_test_pred.ravel()

In [26]:
predictions = y_test_pred_real
submission = pd.DataFrame({"stars": predictions})
submission = test_Idx.join(submission)

In [27]:
submission.head(10)

Unnamed: 0,review_id,stars
0,2713,5.0
1,4734,5.0
2,5598,5.0
3,9545,5.0
4,1471,5.0
5,4533,5.0
6,7723,5.0
7,2433,5.0
8,9182,5.0
9,2238,5.0


In [15]:
submission.to_csv("result.csv", index=False,header=False)