# Import Required Packages

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from keras import layers
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.wrappers.scikit_learn import KerasClassifier


Using TensorFlow backend.


# Importing Data from Files

In [2]:
!unzip data.zip  #Run this if you are using Google colabratory

Archive:  data.zip
replace data/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data/.DS_Store          
replace data/amazon_cells_labelled.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data/amazon_cells_labelled.txt  
replace data/imdb_labelled.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data/imdb_labelled.txt  
replace data/readme.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data/readme.txt         
replace data/yelp_labelled.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data/yelp_labelled.txt  


In [0]:


# Map each review source to its tab-separated file (one "sentence<TAB>label" row per line).
filepath_dict = {'yelp':   'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb':   'data/imdb_labelled.txt'}

# Read every file into its own frame and tag each row with the source it came from.
df_list = []
for source, filepath in filepath_dict.items():
    source_df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    source_df['source'] = source  # Add another column filled with the source name
    df_list.append(source_df)

# Each per-file frame carries its own 0..n-1 index, so a plain concat would
# leave duplicate index labels; ignore_index=True renumbers the rows uniquely.
df = pd.concat(df_list, ignore_index=True)

# Splitting Data for Test and Train:

In [0]:
# The raw review text is the model input; the label column is the target.
sentences, y = df['sentence'], df['label']

# Train-test split: hold out 25% of the rows for testing. The fixed seed
# makes the split identical on every run.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

# Tokenization and Data Preparation

In [0]:
# Tokenize words
# (Tokenizer is imported in the import cell at the top of the notebook.)

# The vocabulary is fitted on the training sentences only, so the test
# sentences are encoded with whatever words the training set happened to contain.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

# Turn every sentence into a list of integer word indices.
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

In [0]:
# One slot more than the number of fitted words: Keras word indices start at 1,
# and index 0 is left free for the zero padding added to the sequences.
vocab_size = 1 + len(tokenizer.word_index)

In [0]:
# Pad sequences with zeros
# Zeros are appended at the end ('post') so that every row has `maxlen` entries.
maxlen = 200

X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

# Build Keras Model and Fit

In [0]:
# Main settings
embedding_dim = 50
epochs = 10

# Results of the search are appended to this file by the save cell.
output_file = 'data/output.txt'

# Candidate values for the hyper-parameter search. The keys must match the
# argument names of create_model; every entry is a list of values to try.
param_grid = {
    'num_filters': [64],
    'kernel_size': [7],
    'vocab_size': [vocab_size],
    'embedding_dim': [embedding_dim],
    'maxlen': [maxlen],
}

In [0]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    """Build and compile the 1-D convolutional binary classifier.

    Args:
        num_filters: Number of filters of the Conv1D layer.
        kernel_size: Kernel width of the Conv1D layer.
        vocab_size: First argument of the Embedding layer (size of the index space).
        embedding_dim: Second argument of the Embedding layer (vector length per index).
        maxlen: Length of each input sequence, passed as ``input_length``.

    Returns:
        A compiled ``Sequential`` model ending in a single sigmoid unit,
        trained with Adam on binary cross-entropy and reporting accuracy.
    """
    stack = [
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
        layers.Conv1D(num_filters, kernel_size, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(10, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]

    # Layers are added in list order, exactly as repeated .add() calls would do.
    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [0]:
# Wrap the model factory in a scikit-learn style estimator so that it can be
# handed to the hyper-parameter search; training runs silently (verbose=False).
model = KerasClassifier(
    build_fn=create_model,
    epochs=epochs,
    batch_size=10,
    verbose=False,
)

In [0]:
# Randomized search with 4-fold cross-validation. n_iter asks for up to 5
# sampled settings, but param_grid currently holds a single combination, so
# only one candidate is fitted (see "1 candidates" in the fit log).
grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=5,
    cv=4,
    verbose=1,
)

In [12]:
# Run the search on the training data (took around 1.6 min in Google Colab).
grid_result = grid.fit(X=X_train, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
W0711 15:20:44.767977 140502108546944 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0711 15:20:44.784341 140502108546944 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0711 15:20:44.788620 140502108546944 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0711 15:20:44.853814 140502108546944 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.t

Fitting 4 folds for each of 1 candidates, totalling 4 fits


W0711 15:20:45.113069 140502108546944 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.5min finished


# Evaluate model

In [0]:
# Evaluate testing set: score the fitted search object on the held-out
# padded test sequences and their labels.
test_accuracy = grid.score(X=X_test, y=y_test)


In [14]:
# Saving the best-fit parameters: after a yes-like answer, the cross-validated
# best score, the best parameters and the test accuracy are printed and
# appended to `output_file`.
prompt = input("Enter[y/n]")
if prompt.lower() in ('y', 'true', 'yes'):
    with open(output_file, 'a') as f:
        s = 'data set\nBest Accuracy : {:.4f}\n{}\nTest Accuracy : {:.4f}\n\n'
        output_string = s.format(grid_result.best_score_,
                                 grid_result.best_params_,
                                 test_accuracy)
        print(output_string)
        f.write(output_string)

Enter[y/n]y
data set
Best Accuracy : 0.8151
{'vocab_size': 4603, 'num_filters': 64, 'maxlen': 200, 'kernel_size': 7, 'embedding_dim': 50}
Test Accuracy : 0.8355




# Prediction

In [15]:

# Human-readable name for each predicted class; the predicted class is used as the list index.
labels = ["Negative", "Positive"]

# Sentences to classify (add more strings to score several at once).
a = ["The hotel was under renovation and the smeel was unbeareable"]
a_series = pd.Series(a)

# Encode with the tokenizer fitted on the training data.
# NOTE: despite its name, a_tokenizer holds the integer sequences, not a tokenizer.
a_tokenizer = tokenizer.texts_to_sequences(a_series)

# Pad sequences with zeros
a_pad = pad_sequences(a_tokenizer, padding='post', maxlen=maxlen)

# Predict the whole batch in one call instead of once per sentence.
# ravel() yields one class per sentence whether predict returns shape (n, 1) or (n,).
predicted_classes = np.ravel(grid.predict(a_pad))

for sentence, predicted_class in zip(a, predicted_classes):
    print('Sentence:' + str(sentence))
    print('Prediction: ' + str(labels[int(predicted_class)]))
    print()

Sentence:The hotel was under renovation and the smeel was unbeareable
Prediction:Negative

