In [1]:
import numpy as np 
from sklearn import svm 


## Load the dataset

In [2]:
f = np.load('data_and_embedding.npz')

In [3]:
num_words = f['num_words']
embedding_dim = f['embedding_dim']
max_sequence_length = f['max_sequence_length']

data = f['data']
labels = f['labels']

embedding_matrix = f['embedding_matrix']

In [4]:
validation_split = 0.2 
epoch = 10

In [5]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(validation_split * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

## Bag of words

In [36]:
def construct_feature_vec(text, embedding_matrix):
    text_vec = [0] * embedding_matrix.shape[0]
    zero_flag = 1
    for word in text:
        if zero_flag and word < 1:
            continue 
        else:
            zero_flag = 0
            text_vec[word] += 1
    return text_vec 

def convert_doc_feature_vec(doc, embedding_matrix):
    return [construct_feature_vec(text, embedding_matrix) for text in doc]

In [None]:
x_train_bow = convert_doc_feature_vec(x_train, embedding_matrix)

In [None]:
x_val_bow = convert_doc_feature_vec(x_val, embedding_matrix)

## Word embedding

In [21]:
def embed_text(text, embedding_matrix):
    count = 0
    text_embedding = np.zeros(embedding_matrix[0].shape)
    for word in text:
        if word != 0:
            count += 1
            text_embedding +=  embedding_matrix[word]
    return text_embedding  /count 

def embed_doc(doc, embedding_matrix):
    return [embed_text(text, embedding_matrix) for text in doc]

def convert_labels(one_hot_labels):
    return [list(label).index(1.0) for label in one_hot_labels]

In [32]:
x_train_embedded = embed_doc(x_train, embedding_matrix)

In [28]:
x_val_embedded = embed_doc(x_val, embedding_matrix)

## Convert labels

In [22]:
y_train_embedded = convert_labels(y_train)

# print(y_train[0])
# print(y_train_embedded[0])

y_val_embedded = convert_labels(y_val)

## Save data 

In [None]:
data_bow = convert_doc_feature_vec(data, embedding_matrix)
data_embedded = embed_doc(data, embedding_matrix)
labels_embedded = convert_labels(labels)

In [None]:
np.savez('bagofwords_and_svmembedding', data_bow=data_bow, data_embedded=data_embedded, labels_embedded=labels_embedded)

## SVM models (sklearn)

### Training (bag of words)

In [None]:
clf_bow = svm.LinearSVC()

In [None]:
clf_bow.fit(x_train_bow, y_train_embedded)

### Evaluation

In [None]:
preds_bow = clf_bow.predict(x_val_bow)

In [None]:
acc = np.mean(1*np.equal(np.array(y_val_embedded), preds))
print("accuracy: %g %" % (acc*10))

### Training (word embedding)

In [9]:
clf = svm.LinearSVC()

In [33]:
clf.fit(x_train_embedded, y_train_embedded)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

### Evaluation

In [None]:
preds = clf.predict(x_val_embedded)

In [None]:
acc = np.mean(1*np.equal(np.array(y_val_embedded), preds))
print("accuracy: %g %" % (acc*10))