In [18]:
import tensorflow as tf
import pandas as pd
import numpy as np
import keras
import math
import os
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [19]:
train_data = pd.read_csv('train_data.csv')
valid_data = pd.read_csv('valid_data.csv')
test_data = pd.read_csv('test_data.csv')

In [20]:
train_data['Comment'][0]

'story man unnatural feeling pig start opening scene terrific example absurd comedy formal orchestra audience turned insane violent mob crazy chanting singer unfortunately stay absurd whole time general narrative eventually making putting even era turned cryptic dialogue would make shakespeare seem easy third grader technical level better might think good cinematography future great vilmos zsigmond future star sally kirkland frederic forrest seen briefly '

In [21]:
def data_stats(dataframe):
    
    s = 0.0
    pos = 0
    
    for i in dataframe['Comment']:
        word_list = i.split()
        s = s + len(word_list)
    print("Total revievs: ", dataframe.shape[0])
    print("Average length of each review : ",s/train_data.shape[0])
    
    try:
        for i in range(dataframe.shape[0]):
            if dataframe.iloc[i]['Sentiment'] == 1:
                pos = pos + 1
        neg = dataframe.shape[0] - pos
        print("Percentage of reviews with positive sentiment: "+str(pos/dataframe.shape[0]*100)+"%")
        print("Percentage of reviews with negative sentiment: "+str(neg/dataframe.shape[0]*100)+"%")
        
    except:
        print("No sentiment in test data")

In [22]:
data_stats(train_data)

Total revievs:  24994
Average length of each review :  122.99143794510682
Percentage of reviews with positive sentiment: 50.0%
Percentage of reviews with negative sentiment: 50.0%


In [23]:
data_stats(valid_data)

Total revievs:  24993
Average length of each review :  120.19284628310794
Percentage of reviews with positive sentiment: 50.00600168047053%
Percentage of reviews with negative sentiment: 49.99399831952947%


In [24]:
data_stats(test_data)

Total revievs:  49995
Average length of each review :  247.2188125150036
No sentiment in test data


In [25]:
X_train = train_data['Comment']
X_valid = valid_data['Comment']
X_test = test_data['Comment']

# y_train = train_data['Sentiment']
# y_valid = valid_data['Sentiment']

y_train = train_data['Rating']
y_valid = valid_data['Rating']

num_classes = 11 #amount of numbers from 0 to 9

y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_valid = tf.keras.utils.to_categorical(y_valid, num_classes)
# y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [26]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [27]:
vocab_size = 1000 # choose based on statistics
oov_tok = ''
embedding_dim = 128
max_length = 300 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'
# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
valid_sequences = tokenizer.texts_to_sequences(X_valid)
valid_padded = pad_sequences(valid_sequences, padding='post', maxlen=max_length)

In [28]:
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    keras.layers.LSTM(64),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(11, activation='sigmoid')
])
# compile model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(loss = tf.keras.losses.CategoricalCrossentropy(),
              optimizer=optimizer,
              metrics=['accuracy'])
# model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 128)          128000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 11)                363       
                                                                 
Total params: 179,851
Trainable params: 179,851
Non-trainable params: 0
_________________________________________________________________


In [29]:
tb_callback = tf.keras.callbacks.TensorBoard(log_dir="logs/", histogram_freq=1)

checkpoint_path = "training/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_path, save_weights_only=True, verbose=1)

In [30]:
num_epochs = 5
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=1, callbacks = [tb_callback, cp_callback],
                   validation_data = (valid_padded, y_valid))

Epoch 1/5
Epoch 1: saving model to training\cp-0001.ckpt
Epoch 2/5
Epoch 2: saving model to training\cp-0002.ckpt
Epoch 3/5
Epoch 3: saving model to training\cp-0003.ckpt
Epoch 4/5
Epoch 4: saving model to training\cp-0004.ckpt
Epoch 5/5
Epoch 5: saving model to training\cp-0005.ckpt


In [31]:
prediction = model.predict(valid_padded)
# print(prediction)
# # Get labels based on probability 1 if p>= 0.5 else 0
# pred_labels = []
for i in range(len(prediction)):
    prediction[i] = np.where(prediction[i] < max(prediction[i]), 0, 1)
print("Accuracy of validation on test set : ", accuracy_score(y_valid,prediction))

Accuracy of validation on test set :  0.20441723682631135


In [32]:
prediction

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [33]:
y_valid

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)