In [1]:
import pickle
import numpy as np


def get_idx_from_sent(sent, word_idx_map, max_l=56, k=300, filter_h=5):
    """
    Transforms sentence into a fixed-length list of indices. Pad with zeroes.

    The result always has exactly ``max_l + 2*(filter_h - 1)`` entries:
    ``filter_h - 1`` leading zeroes, then the index of every whitespace-separated
    word found in ``word_idx_map`` (unknown words are dropped), then trailing
    zeroes. If the sentence has more known words than fit, the row is cut off
    at the target length so that all rows can be stacked into one matrix.

    Args:
        sent: sentence as a single string; split on whitespace.
        word_idx_map: dict mapping word -> integer index.
        max_l: maximum sentence length (in words) the row is sized for.
        k: unused; kept so existing callers keep working.
        filter_h: largest convolution filter height; determines the padding.

    Returns:
        list of ints of length ``max_l + 2*(filter_h - 1)``.
    """
    pad = filter_h - 1
    target_len = max_l + 2 * pad

    x = [0] * pad
    x.extend(word_idx_map[word] for word in sent.split() if word in word_idx_map)

    if len(x) > target_len:
        # Over-long sentence: truncate instead of returning a ragged row.
        del x[target_len:]
    else:
        x.extend([0] * (target_len - len(x)))
    return x

def make_idx_data(data, word_idx_map, max_l=56, k=300, filter_h=5):
    """
    Turn a collection of labelled sentences into index and label arrays.

    Each element of ``data`` is read through its ``'text'`` and ``'y'`` keys.
    The text is converted with ``get_idx_from_sent``; the label is taken as is.

    Returns:
        (X, y): integer arrays built from the index rows and from the labels.
    """
    # Single pass over `data`, keeping each row paired with its label.
    encoded = [
        (get_idx_from_sent(example['text'], word_idx_map, max_l, k, filter_h), example['y'])
        for example in data
    ]
    index_rows = [indices for indices, _ in encoded]
    labels = [label for _, label in encoded]
    return np.array(index_rows, dtype="int"), np.array(labels, dtype="int")

# train = [ { y: 'class', text: 'sentence'} ]
# dev = [ { y: 'class', text: 'sentence'} ]
# test = [ { y: 'class', text: 'sentence'} ]

# W = google pretrained word2vec matrix, each row represents a word
# W2 = randomly generated word2vec matrix, each row represents a word

# word_idx_map: word => row number in word2vec matrix
# vocab = set of all words in dataset


# alternative data sources: data/MR.p, data/SST1.p, data/SST2.p
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
# The `with` block closes the file handle as soon as the data is loaded.
with open("data/combined.p", "rb") as data_file:
    data = pickle.load(data_file)


In [5]:
# Show which datasets and embedding matrices the pickle provides.
print(data.keys())

# Embedding matrices; the model-definition cell asserts all three share one shape.
google_W = data['google_W']
glove_W = data['glove_W']
random_W = data['random_W']

# word -> index lookup, and its inverse for turning indices back into words.
word_idx_map = data['word_idx_map']
idx_word_map = { v:k for (k,v) in word_idx_map.items() }

dict_keys(['sst2_train', 'sst1_train', 'sst2_test', 'vocab', 'google_W', 'sst1_test', 'sst1_dev', 'sst2_dev', 'random_W', 'mr_train', 'glove_W', 'word_idx_map'])


In [None]:
from sklearn import preprocessing
# L2-normalise both embedding matrices (using normalize's default axis) so that
# dot products between normalised vectors can be compared directly.
norm_google_W, norm_glove_W = (
    preprocessing.normalize(embedding, norm='l2')
    for embedding in (google_W, glove_W)
)

In [None]:
import os

# Set the CUDA environment before Keras (and its backend) is imported, so the
# settings are already in place when the backend first looks for devices.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]=""          # empty list: hide every GPU, run on CPU

import keras

In [None]:
from keras.layers import Input, Dense, Flatten, Embedding
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D, Input, Add, Dropout, Activation
from keras.layers.merge import Concatenate

# Seed NumPy's global random number generator.
np.random.seed(2017)

# Every candidate embedding matrix must have the same (vocab_size, vec_size) shape.
assert random_W.shape == google_W.shape == glove_W.shape
vocab_size, vec_size = google_W.shape
sent_length = 64 # train_X.shape[1]; with make_idx_data's defaults this is max_l + 2*(filter_h - 1) = 56 + 2*4

num_filters = 100  # number of filters per convolution width
kernel_sizes = [3,4,5]  # convolution widths; one Conv1D layer is built per entry
batch_size = 50  # NOTE(review): the fit() calls visible in this notebook pass a literal 50 instead of this name
dropout_rate = 0.5  # NOTE(review): not referenced by any model builder visible here
l2_constraint = 3  # NOTE(review): not referenced by any model builder visible here

# TODO: check all layer initializers

def conv(x):
    """
    Apply one convolution + global max-pooling branch per entry of
    ``kernel_sizes`` to ``x`` and concatenate the pooled outputs.

    Each Conv1D layer has ``num_filters`` filters and is named
    ``'filter_<kernel_size>'``.
    """
    def pooled_branch(width):
        feature_map = Conv1D(
            filters=num_filters,
            kernel_size=width,
            padding="valid",  # i.e. no additional padding
            activation="relu",
            strides=1,
            name='filter_{}'.format(width),
        )(x)
        return GlobalMaxPooling1D()(feature_map)

    return Concatenate()([pooled_branch(width) for width in kernel_sizes])

# model just for binary classification
# (disabled: the definition below sits inside a string literal, so get_bin_model is never defined)
"""
def get_bin_model( static=True, initial_embedding=google_W ):
    inputs = Input(shape=(64,))
    x = Embedding( input_dim=vocab_size, 
              output_dim=vec_size, 
              input_length=sent_length, 
              weights=[ initial_embedding ], 
              trainable=(not static) )(inputs)
    x = conv( x )
    predictions = Dense(1, kernel_initializer='normal', 
            #kernel_regularizer=regularizers.l2(3.), 
            activation='sigmoid',
            name='final_layer')(x)
    model = Model( inputs=inputs, outputs=predictions )
    model.compile( optimizer='Adadelta',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'] )
    return model
"""

# model for multi-category classification
def get_model( num_classes, static=True, initial_embedding=google_W ):
    """
    Build and compile the single-channel CNN sentence classifier.

    Args:
        num_classes: number of output classes (width of the final Dense layer).
        static: if True the embedding weights are frozen; otherwise they are
            trained along with the rest of the model.
        initial_embedding: matrix used to initialise the Embedding layer.

    Returns:
        A compiled Keras ``Model`` whose output layer is named ``'final_layer'``.
    """
    # Use the shared sent_length so the Input and the Embedding's input_length agree.
    inputs = Input(shape=(sent_length,))
    x = Embedding( input_dim=vocab_size,
              output_dim=vec_size,
              input_length=sent_length,
              weights=[ initial_embedding ],
              trainable=(not static) )(inputs)
    x = conv( x )
    # softmax (not sigmoid): categorical_crossentropy with one-hot targets
    # expects the outputs to form a probability distribution over classes.
    predictions = Dense(num_classes, kernel_initializer='normal',
            #kernel_regularizer=regularizers.l2(3.),
            activation='softmax',
            name='final_layer')(x)
    model = Model( inputs=inputs, outputs=predictions )
    model.compile( optimizer='Adadelta',
              loss='categorical_crossentropy',
              metrics=['accuracy'] )
    return model

def multi_channel_conv( x1, x2 ):
    """
    Two-channel variant of the convolution block.

    For every entry of ``kernel_sizes`` a single Conv1D layer (named
    ``'filter_<kernel_size>'``, no activation) is applied to both ``x1`` and
    ``x2``. The two results are added, passed through ReLU and global
    max-pooled. The pooled outputs of all kernel sizes are concatenated.
    """
    def pooled_branch(width):
        # One layer object applied twice, so both channels share its weights.
        channel_conv = Conv1D(
            filters=num_filters,
            kernel_size=width,
            padding="valid",
            strides=1,
            name='filter_{}'.format(width),
        )
        first_channel = channel_conv(x1)
        second_channel = channel_conv(x2)

        summed = Add()([first_channel, second_channel])
        activated = Activation('relu')(summed)
        return GlobalMaxPooling1D()(activated)

    return Concatenate()([pooled_branch(width) for width in kernel_sizes])

# multi-channel model just for binary classification
# (disabled: the definition below sits inside a string literal, so it is never defined)
"""
def get_bin_multi_channel_model( initial_embedding=google_W ):
    inputs = Input(shape=(64,))
    x_static = Embedding( input_dim=vocab_size,
              output_dim=vec_size,
              input_length=sent_length,
              weights=[ initial_embedding ],
              trainable=False )(inputs)

    x_non_static = Embedding( input_dim=vocab_size,
              output_dim=vec_size,
              input_length=sent_length,
              weights=[ initial_embedding ],
              trainable=True )(inputs)

    x = multi_channel_conv( x_static, x_non_static )
    predictions = Dense(1, kernel_initializer='normal',
            #kernel_regularizer=regularizers.l2(3.),
            activation='sigmoid',
            name='final_layer')(x)
    model = Model( inputs=inputs, outputs=predictions )
    model.compile( optimizer='Adadelta',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'] )
    return model
"""

def get_multi_channel_model( num_classes, initial_embedding=google_W  ):
    """
    Build and compile the two-channel CNN sentence classifier.

    Both channels start from ``initial_embedding``; one Embedding layer is
    frozen and the other is trainable. They are combined by
    ``multi_channel_conv``.

    Args:
        num_classes: number of output classes (width of the final Dense layer).
        initial_embedding: matrix used to initialise both Embedding layers.

    Returns:
        A compiled Keras ``Model`` whose output layer is named ``'final_layer'``.
    """
    # Use the shared sent_length so the Input and the Embeddings' input_length agree.
    inputs = Input(shape=(sent_length,))
    x_static = Embedding( input_dim=vocab_size,
              output_dim=vec_size,
              input_length=sent_length,
              weights=[ initial_embedding ],
              trainable=False )(inputs)

    x_non_static = Embedding( input_dim=vocab_size,
              output_dim=vec_size,
              input_length=sent_length,
              weights=[ initial_embedding ],
              trainable=True )(inputs)

    x = multi_channel_conv( x_static, x_non_static )

    # softmax (not sigmoid): categorical_crossentropy with one-hot targets
    # expects the outputs to form a probability distribution over classes.
    predictions = Dense(num_classes, kernel_initializer='normal',
            #kernel_regularizer=regularizers.l2(3.),
            activation='softmax',
            name='final_layer')(x)
    model = Model( inputs=inputs, outputs=predictions )
    model.compile( optimizer='Adadelta',
              loss='categorical_crossentropy',
              metrics=['accuracy'] )
    return model

In [None]:
# Convert each split into an index matrix plus one-hot labels.
# NOTE(review): `train`, `dev` and `test` are not assigned in the cells shown
# here; they are expected to be lists of {'y': ..., 'text': ...} rows — confirm
# which dataset they should come from.
train_X, train_y = make_idx_data( train, word_idx_map )
# num_classes must be computed after train_y exists (it was previously also
# read before this point, which fails on a fresh kernel).
num_classes = len(np.unique(train_y))
train_y_onehot = keras.utils.to_categorical( train_y, num_classes )


if dev:
    dev_X, dev_y = make_idx_data( dev, word_idx_map )
    dev_y_onehot = keras.utils.to_categorical( dev_y, num_classes )

if test:
    test_X, test_y = make_idx_data( test, word_idx_map )
    test_y_onehot = keras.utils.to_categorical( test_y, num_classes )


In [None]:
# Disabled (kept inside a string literal, so nothing here runs): 10-fold
# stratified cross-validation that trains get_model with google_W for 25 epochs
# per fold and prints the mean of score[1] over the folds.
"""
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=10)
splits = kf.split( train_X, train_y )
    
scores = []
for train_indices, val_indices in splits:
    X_cv_train, X_cv_val = train_X[train_indices], train_X[val_indices]
    y_cv_train, y_cv_val = train_y_onehot[train_indices], train_y_onehot[val_indices]
    
    model = get_model( num_classes, initial_embedding=google_W )
    model.fit( X_cv_train, y_cv_train, batch_size=50, epochs=25)
    
    score = model.evaluate( X_cv_val, y_cv_val )
    scores.append( score[1] )
print('final cross validation scores: {}'.format(sum(scores)/len(scores)))
"""


# ignore from this point forward

In [None]:
# Re-seed NumPy, then train the two-channel model on the full training split.
np.random.seed(2017)
num_classes = np.unique(train_y).size
model = get_multi_channel_model(
    num_classes=num_classes,
    initial_embedding=google_W,
)
model.fit(x=train_X, y=train_y_onehot, epochs=10, batch_size=50)

In [None]:

### get the top contributors for each class,
"""
    assumptions about the model
        (1) one conv layer per entry of kernel_sizes, named 'filter_<kernel_size>'
        (2) num_filters filters for each conv layer, concatenated in kernel_sizes order

    for a class
        (1) sort the final-layer weights of that class and take the first
            num_filters_per_class of them
        (2) for each weight, figure out which convolutional filter it came from
        (3) for each component in the filter, find the words that match the component best
"""
#def get_top_filters( model, W, num_filters_per_class ):
num_filters_per_class = 5
num_words_per_component = 10

logistic_weights = model.get_layer('final_layer').get_weights()[0]
num_classes = logistic_weights.shape[1]

filters = {}
for output_class in range( num_classes ):
    # NOTE(review): argsort is ascending, so this selects the most NEGATIVE
    # weights for the class; confirm whether the most positive ones are wanted.
    sorted_filter_indices = np.argsort( logistic_weights[:,output_class] )

    class_filters = []
    for filter_index in sorted_filter_indices[:num_filters_per_class]:
        # The pooled features are concatenated per kernel size, num_filters each.
        kernel_size = kernel_sizes[ filter_index // num_filters ]
        conv_kernel = model.get_layer('filter_{}'.format( kernel_size )).get_weights()[0]
        conv_filter = conv_kernel[:,:,filter_index % num_filters]

        # Rank the vocabulary once per filter component, best match first
        # (argsort is ascending, hence the reversal).
        word_rankings = [ np.argsort( google_W @ component )[::-1]
                          for component in conv_filter ]

        filter_in_english = [
            ' - '.join( idx_word_map[ranking[rank]] for ranking in word_rankings )
            for rank in range( num_words_per_component )
        ]
        class_filters.append( filter_in_english )
    filters[output_class] = class_filters
filters

In [None]:
# Sanity check: print the 5 words whose normalised GloVe vectors are most
# similar to the vector for 'sad'.
# NOTE(review): glove_word_idx_map is not assigned in the cells shown here —
# confirm where it is defined.
glove_idx_word_map = { v:k for (k,v) in glove_word_idx_map.items() }
vec = norm_glove_W[glove_word_idx_map['sad']]
# argsort is ascending; reverse it so the highest similarity comes first.
best_match_indices = np.argsort( norm_glove_W @ vec )[::-1]
for k in range(5):
    print( (glove_idx_word_map[best_match_indices[k]] ) )