In [1]:
import numpy as np
import pandas as pd
from embeddings import w2v_embedding
import re

In [2]:
def clean_text(df, col):
    """Normalize a text column to lower-case ASCII alphanumeric tokens.

    Every value of ``df[col]`` is converted with ``str()`` and lower-cased,
    each run of characters outside ``[A-Za-z0-9]`` is replaced by a single
    space, and leading/trailing spaces are stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean.
    col : str
        Name of the column to clean.

    Returns
    -------
    pandas.Series
        The cleaned strings, indexed like ``df``.
    """
    # Raw string keeps the pattern free of invalid-escape warnings.
    non_alnum = re.compile(r'[^A-Za-z0-9]+')

    def _normalize(value):
        # Whitespace is itself non-alphanumeric, so this single substitution
        # already collapses every whitespace run; only the edges need trimming.
        return non_alnum.sub(' ', str(value).lower()).strip()

    return df[col].apply(_normalize)

In [3]:
# Load the review dataset from its path relative to this notebook.
df = pd.read_csv(filepath_or_buffer='../dataset/small_yelp.csv')

##### Use the following cells to train and predict with Conv1D and fully-connected layers

In [4]:
# Split the dataframe by row position into three consecutive slices:
# rows [0, 25000) for training, rows [25000, 35000) for testing and
# everything from row 35000 onwards for validation.

train = df.iloc[:25000]
test = df.iloc[25000:35000]
validate = df.iloc[35000:]

In [5]:
# Column name is 'text' for Yelp!, 'review content' for Zappos.
# Clean the review text of each split (train, validate, test in that order).
x_train, x_validate, x_test = (
    clean_text(part, 'text') for part in (train, validate, test)
)

# Shift the star ratings down by one so the labels start at 0.
y_train, y_validate, y_test = (
    np.asarray(part['stars'].values - 1) for part in (train, validate, test)
)

The inputs to `convo_fc` are (embedding dimension, model path, training text, validation text, test text, training labels, validation labels); the output is the predicted labels for the given test set.

In [6]:
from NeuralNetClassifier import convo_fc

# Arguments, in order: embedding dimension, word2vec model path, then the
# train / validate / test texts followed by the train / validate labels.
embedding_dim = 300
w2v_model_path = '../shoes_w2v_model.bin'
y_pred = convo_fc(
    embedding_dim,
    w2v_model_path,
    x_train,
    x_validate,
    x_test,
    y_train,
    y_validate,
)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          16369200  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 193, 32)           76832     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 96, 32)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3072)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               393344    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
________________________________________________

###### print results

In [9]:
from NeuralNetClassifier import print_results

# Compare the test labels with the predictions returned by convo_fc.
# NOTE(review): y_test must still be the 0-based labels built in the Conv1D
# section; the SVM section further down rebinds y_test to the raw star values.
print_results(y_test, y_pred)

F1 score: 0.3355
[[  46  714  382   94  238]
 [  18  439  201   65  109]
 [  20  472  352  103  201]
 [  27  675  525  190  731]
 [  36  884  809  341 2328]]


##### Optional: plot the ROC-AUC curve for a given prediction

In [None]:
from NeuralNetClassifier import roc_auc_plot
import keras

# One-hot encode the test labels into 5 columns before plotting.
# NOTE(review): this needs the 0-based y_test from the Conv1D section; the SVM
# section rebinds y_test to raw star values, so run this cell before it.
y_true = keras.utils.to_categorical(y_test, num_classes=5)
# NOTE(review): the result is bound to the name `plt`; presumably it is a
# matplotlib-style handle exposing savefig() — confirm against roc_auc_plot.
plt = roc_auc_plot(y_true, y_pred, title='word2vec-cnn')

# Uncomment to write the figure to disk.
# plt.savefig('./w2v-cnn.png', dpi=300)

##### Use the following cells to train and predict with an SVM

The inputs to `w2v_embedding` are (model path, text to embed); it is applied to both the training and the test text.

In [10]:
from embeddings import w2v_embedding

model_path = '../shoes_w2v_model.bin'
# Embed freshly cleaned text taken from the source frames instead of feeding
# x_train / x_test back into themselves, so re-running this cell never embeds
# data that has already been embedded.
x_train = w2v_embedding(model_path, clean_text(train, 'text'))
x_test = w2v_embedding(model_path, clean_text(test, 'text'))

# Column name is 'stars' for Yelp! and 'rating' for Zappos.
# These are the raw star values (not shifted to start at 0).
y_train = train['stars'].values
y_test = test['stars'].values

In [12]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score

# Fit a linear SVM (dual=False, at most 5000 iterations) on the embedded
# training reviews, then predict the star rating of each embedded test review.
svm = LinearSVC(dual=False, max_iter=5000).fit(x_train, y_train)
y_pred = svm.predict(x_test)

In [13]:
# Micro-averaged F1 for the SVM predictions; the confusion matrix is left as
# the last expression so the notebook displays it as the cell output.
micro_f1 = f1_score(y_test, y_pred, average='micro')
print("F1 score: {}".format(micro_f1))
confusion_matrix(y_test, y_pred)

F1 score: 0.2874


array([[ 482,  226,  194,  378,  194],
       [ 194,  149,  129,  219,  141],
       [ 208,  168,  220,  358,  194],
       [ 336,  240,  376,  761,  435],
       [ 828,  503,  521, 1284, 1262]])