In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from Preprocessing.to_embedding import WordEmbedding
from Preprocessing.data_format import formatting
from Preprocessing.helper_functions import import_embedding, embedding_matrix_word2vec
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
data = formatting("phase1_movie_reviews-train.csv")

y = pd.get_dummies(data['polarity'])
X_train, X_dev, y_train, y_dev = train_test_split(data['reviewText'], y, test_size = 0.10, random_state=42)

embedding_size = 300 #number of feature weights in embeddings
max_len = 400

In [4]:
data.head()

Unnamed: 0,polarity,summary,reviewText,year
0,negative,"[bruce, lee, the, legend, baaaaaad]","[this, was, a, horrible, movie, thats, all, i,...",2000
1,positive,"[stylish, yet, uneven, film, at, an, affordabl...","[as, a, lover, of, certian, genres, such, as, ...",2001
2,positive,"[masterful, and, commanding]","[master, and, commander, the, far, side, of, t...",2003
3,positive,"[great, special, effects, disappointed, with, ...","[what's, the, 411, on, this, movie, i'm, an, a...",2009
4,positive,"[bevare, bevare, dracula, is, suspect, here]","[when, i, bought, my, set, i, went, to, my, lo...",2004


In [3]:
embedding = WordEmbedding(num_features = embedding_size)

WordEmbedding.fit(embedding, X_train)
WordEmbedding.size(embedding)

Total number of words in the vocabulary:  (52260, 300)


In [5]:
#Save word embedding to dataframe
#train_embeddings = WordEmbedding.to_pd(embedding, X_train)

#Save Save embeddings to file
WordEmbedding.to_file(embedding)

In [6]:
embeddings_index = import_embedding('trained_embedding_word2vec.txt')

## 2. Vectorize text data

In [7]:
#Basic Vectorization of data
#Review data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

def vectorize(data, tokenizer ,max_len):
    sequences = tokenizer.texts_to_sequences(data)
    padding = pad_sequences(sequences, maxlen = max_len)
    
    return padding

X_train = vectorize(X_train, tokenizer , max_len)
X_dev = vectorize(X_dev, tokenizer, max_len)

print('Found %s unique tokens.' % len(word_index))
print('Shape of train tensor', X_train.shape)
print('Shape of dev tensor', X_dev.shape)

Found 179993 unique tokens.
Shape of train tensor (81000, 400)
Shape of dev tensor (9000, 400)


## 3. Create word vectors with the loaded word2vec model

In [8]:
embedding_matrix, num_words = embedding_matrix_word2vec(word_index, embedding_size, embeddings_index)

### Check train/dev sets

In [9]:
print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_test:', X_dev.shape)
print('Shape of y_test:', y_dev.shape)

Shape of X_train: (81000, 400)
Shape of y_train: (81000, 2)
Shape of X_test: (9000, 400)
Shape of y_test: (9000, 2)


## 5. Define model

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, CuDNNLSTM, GRU, Bidirectional, GlobalMaxPool1D, Dropout
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

# Define Model
model = Sequential()
model.add(Embedding(num_words, 
                    embedding_size,
                    input_length = max_len,
                     dropout=0.2))
model.add(Bidirectional(CuDNNLSTM(128, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(2, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # This is added back by InteractiveShellApp.init_path()


In [13]:
history = model.fit(X_train, y_train, batch_size = 256, epochs = 4, validation_data = (X_dev, y_dev), verbose = 1)

Train on 81000 samples, validate on 9000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 3, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 3, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()

In [25]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Development Accuracy:  {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_dev, y_dev, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Development Accuracy:  0.9918
Testing Accuracy:  0.8826


NameError: name 'plot_history' is not defined

In [134]:
movie_df = pd.read_csv("phase1_movie_reviews-test-hidden.csv")
movie_labels = pd.read_csv("true_labels/true_movie_labels.txt", header=None)
movies_test = pd.concat([movie_df, movie_labels], axis=1).drop('polarity', axis=1).rename(columns={0: "polarity"})
#movies_test.to_csv("movies_test.csv", index = False)

game_df = pd.read_csv("phase1_video_games-test-hidden.csv")
game_labels = pd.read_csv("true_labels/true_game_labels.txt", header=None)
games_test = pd.concat([game_df, game_labels], axis=1).drop('polarity', axis=1).rename(columns={0: "polarity"})
#games_test.to_csv("movies_test.csv", index = False)

In [135]:
from keras.preprocessing.text import text_to_word_sequence

movies_test['reviewText'] = movies_test['reviewText'].astype(str)
movies_test['reviewText'] = movies_test['reviewText'].apply(text_to_word_sequence)

games_test['reviewText'] = games_test['reviewText'].astype(str)
games_test['reviewText'] = games_test['reviewText'].apply(text_to_word_sequence)

In [136]:
movies_X_test = movies_test['reviewText']
movies_y_test = movies_test[['polarity']]

games_X_test = games_test['reviewText']
games_y_test = games_test[['polarity']]

In [137]:
movies_X_test = vectorize(movies_X_test, tokenizer , max_len)
games_X_test = vectorize(games_X_test, tokenizer , max_len)

In [138]:
movies_pred = model.predict_classes(movies_X_test)

games_pred = model.predict_classes(games_X_test)

In [139]:
movies_y_test["polarity"] = movies_y_test["polarity"].str.replace('positive', '1')
movies_y_test["polarity"] = movies_y_test["polarity"].str.replace('negative', '0')
movies_y_test["polarity"] = movies_y_test["polarity"].astype('int64')

games_y_test["polarity"] = games_y_test["polarity"].str.replace('positive', '1')
games_y_test["polarity"] = games_y_test["polarity"].str.replace('negative', '0')
games_y_test["polarity"] = games_y_test["polarity"].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [140]:
from sklearn.metrics import accuracy_score

print(accuracy_score(movies_y_test, movies_pred))
print(accuracy_score(games_y_test, games_pred))

0.881
0.851527764639107
