In [1]:
from __future__ import print_function

import os
import sys
import time

import numpy as np
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding

import keras.backend as K

Using TensorFlow backend.


## Load the dataset

In [3]:
# Open the .npz archive; the following cell reads the dataset arrays,
# size metadata and embedding matrix out of it by key.
f = np.load('data_and_embedding100.npz')

In [4]:
# Size metadata: coerce each archive entry to a plain Python int.
num_words, embedding_dim, max_sequence_length = (
    int(f[key]) for key in ('num_words', 'embedding_dim', 'max_sequence_length')
)

# Samples and their targets (presumably padded token-id sequences and
# per-sample class labels -- confirm against the script that wrote the archive).
data, labels = f['data'], f['labels']

# Pre-trained weights later handed to the Embedding layer.
embedding_matrix = f['embedding_matrix']

In [5]:
validation_split = 0.2  # fraction of the samples held out as the validation set
epoch = 10  # presumably the number of training epochs -- TODO confirm where it is consumed

In [6]:
# Shuffle samples and labels with one shared permutation so that every
# sample stays paired with its own label.
# NOTE(review): no RNG seed is set in the visible cells, so the split differs
# from run to run; seed NumPy beforehand if a reproducible split is needed.
indices = np.random.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

num_validation_samples = int(validation_split * data.shape[0])

# Slice by an explicit boundary instead of a negative offset: when the
# validation count is 0, `a[:-0]` is empty and `a[-0:]` is the whole array,
# which would silently swap the training and validation sets.
num_train_samples = data.shape[0] - num_validation_samples

x_train = data[:num_train_samples]
y_train = labels[:num_train_samples]
x_val = data[num_train_samples:]
y_val = labels[num_train_samples:]

## Baselines

In [15]:
# NOTE(review): `y_list` was never defined in any visible cell, so this cell
# raised NameError on a fresh kernel. It is assumed to hold one class id per
# sample of `labels`: one-hot rows are collapsed with argmax, while an already
# 1-D label array is used as is. Confirm this matches the original intent.
y_list = labels if labels.ndim == 1 else np.argmax(labels, axis=1)

# labels_tally maps class id -> number of samples; count is the total tallied.
labels_tally = {}
count = 0

for label in y_list:
    label = int(label)  # plain int keys instead of NumPy scalars
    count += 1
    labels_tally[label] = labels_tally.get(label, 0) + 1

In [20]:
# Print each class's share of the tallied samples, in ascending label order.
# The 100.0 factor forces float division, so the result is also correct on
# Python 2, where int / int would truncate to 0.
for label in sorted(labels_tally):
    percentage = 100.0 * labels_tally[label] / count
    print("label: %d, percentage: %.2f%%" % (label, percentage))

label: 1, percentage: 6.26%
label: 2, percentage: 4.44%
label: 3, percentage: 7.22%
label: 4, percentage: 16.28%
label: 5, percentage: 65.79%


## Classification model (Keras)

### Build the model 

In [None]:
import numpy as np 
import tensorflow as tf 


class BaselineClassifier(Classifier):
    """Baseline classifiers on top of a frozen, pre-trained embedding.

    Recognised ``model_type`` values are ``'majority'``, ``'logistic'`` and
    ``'svm'``. The per-model build/train/predict hooks are still placeholders
    that do nothing and return ``None``.

    NOTE(review): the ``Classifier`` base class is not defined in the visible
    part of this notebook; it is assumed to be provided elsewhere.
    """

    def __init__(self, data_path='data_and_embedding100.npz',
                 model_type='logistic', batch_size=128, epochs=10):
        """Load the data, create the embedding layer and build the model.

        Args:
            data_path: path of the ``.npz`` archive passed to ``load_data``.
                NOTE(review): ``load_data`` reads ``x_train``/``y_train``/
                ``x_test``/``y_test`` keys -- confirm the default archive
                actually contains them.
            model_type: which baseline to build; used as the default by
                ``build``, ``train``, ``predict`` and ``save_model``.
            batch_size: stored on the instance for the training hooks.
            epochs: stored on the instance for the training hooks.
        """
        self.batch_size = batch_size
        self.epochs = epochs

        # load_data also sets num_words, embedding_dim, max_sequence_length
        # and embedding_matrix, which the embedding layer below depends on.
        self.data_path = data_path
        self.load_data(data_path)

        self.embedding_layer = self._construct_embedding_layer()

        self.model_type = model_type
        self.model = self.build(self.model_type)

    def load_data(self, embedded_data_path, validation_split=0.2):
        """Read the archive, shuffle the training data and split off validation.

        Sets ``num_words``, ``embedding_dim``, ``max_sequence_length``,
        ``embedding_matrix``, ``x``/``y`` (shuffled), ``x_test``/``y_test``,
        and the ``x_train``/``y_train``/``x_val``/``y_val`` split.

        Args:
            embedded_data_path: path of the ``.npz`` archive to load.
            validation_split: fraction of the shuffled training samples that
                is held out as the validation set.
        """
        f = np.load(embedded_data_path)

        self.num_words = int(f['num_words'])
        self.embedding_dim = int(f['embedding_dim'])
        self.max_sequence_length = int(f['max_sequence_length'])

        self.x = f['x_train']
        self.y = f['y_train']
        self.x_test = f['x_test']
        self.y_test = f['y_test']

        self.embedding_matrix = f['embedding_matrix']

        # One shared permutation keeps each sample aligned with its label.
        indices = np.random.permutation(self.x.shape[0])
        self.x = self.x[indices]
        self.y = self.y[indices]

        num_validation_samples = int(validation_split * self.x.shape[0])
        # Explicit boundary: with 0 validation samples, `a[:-0]` would be
        # empty and `a[-0:]` the whole array, swapping the two sets.
        num_train_samples = self.x.shape[0] - num_validation_samples

        self.x_train = self.x[:num_train_samples]
        self.y_train = self.y[:num_train_samples]
        self.x_val = self.x[num_train_samples:]
        self.y_val = self.y[num_train_samples:]

    def _construct_embedding_layer(self):
        """Return a non-trainable Embedding layer initialised from the loaded matrix.

        Uses the sizes and weights stored on the instance by ``load_data``
        rather than same-named notebook globals, so the layer always matches
        the archive this instance was built from.
        """
        return Embedding(self.num_words,
                         self.embedding_dim,
                         weights=[self.embedding_matrix],
                         input_length=self.max_sequence_length,
                         trainable=False)

    def _resolve_model_type(self, model_type):
        """Return ``model_type``, falling back to ``self.model_type`` when it is None."""
        # A default of `self.model_type` in a signature is evaluated when the
        # class body runs, where `self` does not exist; resolve at call time.
        return self.model_type if model_type is None else model_type

    def build(self, model_type=None):
        """Build and return the model for ``model_type``.

        Args:
            model_type: ``'majority'``, ``'logistic'`` or ``'svm'``; ``None``
                means ``self.model_type``.

        Returns:
            Whatever the matching ``build_*`` hook returns, or ``None`` for an
            unrecognised type.
        """
        model_type = self._resolve_model_type(model_type)
        if model_type == 'majority':
            return self.build_majority_predictor()
        elif model_type == 'logistic':
            return self.build_logistic_regression()
        elif model_type == 'svm':
            return self.build_SVM()

    def build_majority_predictor(self):
        """Placeholder: building the majority-class predictor is not implemented."""
        pass

    def build_logistic_regression(self):
        """Placeholder: building the logistic-regression model is not implemented."""
        pass

    def build_SVM(self):
        """Placeholder: building the SVM model is not implemented."""
        pass

    def train(self, model_type=None):
        """Train the model for ``model_type``.

        Only ``'logistic'`` and ``'svm'`` dispatch to a training hook; every
        other type is a no-op.

        Args:
            model_type: model to train; ``None`` means ``self.model_type``.
        """
        model_type = self._resolve_model_type(model_type)
        if model_type == 'logistic':
            self.train_logistic_regression()
        elif model_type == 'svm':
            self.train_SVM()
        else:
            pass

    def train_logistic_regression(self):
        """Placeholder: training the logistic-regression model is not implemented."""
        pass

    def train_SVM(self):
        """Placeholder: training the SVM model is not implemented."""
        pass

    def predict(self, model_type=None):
        """Run prediction with the model for ``model_type``.

        Args:
            model_type: ``'majority'``, ``'logistic'`` or ``'svm'``; ``None``
                means ``self.model_type``.

        Returns:
            Whatever the matching ``predict_*`` hook returns, or ``None`` for
            an unrecognised type.
        """
        model_type = self._resolve_model_type(model_type)
        if model_type == 'majority':
            return self.predict_majority_predictor()
        elif model_type == 'logistic':
            return self.predict_logistic_regression()
        elif model_type == 'svm':
            return self.predict_SVM()

    def predict_majority_predictor(self):
        """Placeholder: majority-class prediction is not implemented."""
        pass

    def predict_logistic_regression(self):
        """Placeholder: logistic-regression prediction is not implemented."""
        pass

    def predict_SVM(self):
        """Placeholder: SVM prediction is not implemented."""
        pass

    def save_model(self, model_type=None):
        """Placeholder: saving is not implemented.

        Args:
            model_type: model to save; ``None`` means ``self.model_type``.
        """
        model_type = self._resolve_model_type(model_type)
        pass

    @staticmethod
    def embedding_mean(x):
        """Average ``x`` over axis 1 (used by the logistic-regression model).

        Declared static because it takes no ``self``; it can be called on the
        class or on an instance.
        """
        return tf.reduce_mean(x, axis=1)
    
        

In [6]:
# Lookup layer initialised from the pre-trained matrix; trainable=False keeps
# those weights fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False,
)

In [7]:
def embedding_mean(x):
    """Return the mean of tensor ``x`` taken along axis 1."""
    reduction_axis = 1
    return tf.reduce_mean(x, axis=reduction_axis)

In [8]:
# Pipeline: int32 token ids -> embedding lookup -> mean over axis 1 ->
# 6-unit softmax layer.
sequence_input = Input(dtype='int32', shape=(max_sequence_length,))
embedded_sequences = embedding_layer(sequence_input)
x = Lambda(embedding_mean)(embedded_sequences)
preds = Dense(units=6, activation='softmax')(x)

model = Model(inputs=sequence_input, outputs=preds)

In [9]:
# Print the layer-by-layer table of output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, array(1000))       0         
_________________________________________________________________
embedding_1 (Embedding)      (None, array(1000), array 2000000   
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 606       
Total params: 2,000,606.0
Trainable params: 606.0
Non-trainable params: 2,000,000.0
_________________________________________________________________


### Train the model

In [10]:
# Training configuration: Adam optimiser, categorical cross-entropy loss,
# accuracy reported under the name 'acc'.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
# Wall-clock timing of the whole training run, in seconds.
# `time` is imported in the notebook's import cell.
start_time = time.time()

# Take the epoch count from the `epoch` setting in the config cell instead of
# a second hard-coded 10, so changing the setting actually affects training.
model.fit(x_train, y_train,
          batch_size=128,
          epochs=epoch,
          validation_data=(x_val, y_val))

print("Training time: ", time.time() - start_time)

Train on 311902 samples, validate on 77975 samples
Epoch 1/10
 45824/311902 [===>..........................] - ETA: 75s - loss: 1.5094 - acc: 0.6404 

### Save the model

In [None]:
# Writing the HDF5 file fails if the target directory is missing, so create
# it first. The isdir check (rather than exist_ok=True) also works on Python 2.
if not os.path.isdir('models'):
    os.makedirs('models')
model.save('models/logistic_regression.h5')