# This is a document that tries keras on jupyter notebook

In [153]:
import numpy as np
import pandas as pd
import gensim
import re
from keras.models import Sequential, Model, load_model, save_model
from keras.layers import Dense
from keras.layers import LSTM, GRU, Conv1D, MaxPooling1D, Flatten
from keras.layers import GaussianNoise, BatchNormalization, Dropout
from keras.layers import Activation, merge, Input, concatenate
from keras.optimizers import Adam
from keras.regularizers import l1, l2
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import Callback, LambdaCallback, TensorBoard, ReduceLROnPlateau, EarlyStopping
from keras.utils import np_utils

In [118]:
def is_desired_letter(char):
    return ord(char) >= 97 and ord(char) < 123 or ord(char) >= 48 and ord(char) < 58 or ord(char) == ord(" ")


def get_train_data(train_portion):
    # load the dataset but only keep the top n words, zero the rest
    train_data = pd.read_csv("input/kickstarter_train.csv")
    train_texts_and_results = train_data.iloc[:, [2, -1]]
    # get split point for train and test data
    split_point = int(train_portion * len(train_data))
    # do preliminary preprocessing:remove all symbols
    train_data["desc"] = [[char for char in str(text).lower() if is_desired_letter(char)] for
                          text in train_data["desc"]]
    train_data["desc"] = [''.join(text).split() for text in train_data["desc"]]
    # remove too short desc
    drop_index = []
    for i in range(len(train_data)):
        if len(train_data.iloc[i, 2]) <= 8:
            drop_index.append(i)
    train_data.drop(train_data.index[drop_index])
    # get descriptions data
    train_texts = np.array(train_data.iloc[:split_point, 2])
    test_texts = np.array(train_data.iloc[split_point:, 2])
    # get num data
    train_num = np.array(train_data.iloc[:split_point, [3, 12]])
    test_num = np.array(train_data.iloc[split_point:, [3, 12]])
    # get result data
    train_results = np.array(train_data.iloc[:split_point, -1])
    test_results = np.array(train_data.iloc[split_point:, -1])
    
    return train_texts, train_num, train_results, test_texts, test_num, test_results


def get_bad_word_portion(data):
    bad_word_num = 0.0
    all_word = 0.0
    for text in data:
        for word in text:
            all_word += 1
            if word not in all_words:
                bad_word_num += 1
    return bad_word_num / all_word


def convert_to_onehot(data, num_features):
    new_data = []
    for item in data:
        new_data.append(np_utils.to_categorical(item, num_classes=num_features))
    return np.array(new_data)

In [119]:
# get training testing data from disk
train_data_portion = 0.9
trainX_desc, trainX_num, trainY, testX_desc, testX_num, testY = get_train_data(train_data_portion)
print("data grabbed")

# convert char to int, and 
all_words = set([word for text in trainX_desc for word in text])
n_vacab = len(all_words) + 1
word_to_int = dict((word, float(i+1)) for i, word in enumerate(all_words))
trainX_desc = [[word_to_int[word] for word in text] for text in trainX_desc]
# print bad word portion for test data before tokenization
print(get_bad_word_portion(testX_desc))
testX_desc = [[word_to_int[word] for word in text if word in all_words] for text in testX_desc]

print("tokenizing and normalizing is done")

data grabbed
0.02938531311469891
tokenizing and normalizing is done


In [120]:
# preprocessing description data
# truncate and pad input sequences
max_desc_length = 40
trainX_desc = sequence.pad_sequences(list(trainX_desc), maxlen=max_desc_length, truncating="post")
testX_desc = sequence.pad_sequences(list(testX_desc), maxlen=max_desc_length, truncating="post")
print("padding finished")

padding finished


In [121]:
# # reshape trainX to multi_timestep single feature
# time_steps = max_desc_length
# num_features = 1
# testX_desc = np.array(testX_desc)
# testX_desc = testX_desc.reshape((-1, time_steps, num_features))
# trainX_desc = np.array(trainX_desc)
# trainX_desc = trainX_desc.reshape((-1, time_steps, num_features))
# print("reshaping data with shape {}".format(trainX_desc.shape))

In [200]:
# generate model for descriptions
model_input = Input(shape=(40,))
x = model_input
x = Embedding(input_dim=n_vacab, output_dim=64, input_length=40)(x)
x = GaussianNoise(stddev=0.1)(x)
x = Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = Dropout(0.5)(x)
x = Conv1D(filters=64, kernel_size=3, activation="relu")(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(64)(x)
x = BatchNormalization()(x)
x = Activation("sigmoid")(x)
x = Dense(1, activation='sigmoid')(x)
description_model = Model(inputs=[model_input], outputs=[x])

# configurate model training
description_model.compile(loss='binary_crossentropy', optimizer="adam", metrics=["accuracy"])
print("model building finished\n", description_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_33 (InputLayer)        (None, 40)                0         
_________________________________________________________________
embedding_28 (Embedding)     (None, 40, 64)            5240192   
_________________________________________________________________
gaussian_noise_16 (GaussianN (None, 40, 64)            0         
_________________________________________________________________
conv1d_36 (Conv1D)           (None, 36, 64)            20544     
_________________________________________________________________
dropout_76 (Dropout)         (None, 36, 64)            0         
_________________________________________________________________
conv1d_37 (Conv1D)           (None, 34, 64)            12352     
_________________________________________________________________
dropout_77 (Dropout)         (None, 34, 64)            0         
__________

In [190]:
# do training
epoch_num = 2
for i in range(epoch_num):
    description_model.fit(trainX_desc, trainY, epochs=5, batch_size=256, shuffle=True, 
                      verbose=1, validation_split=0.1, callbacks=[ReduceLROnPlateau(), EarlyStopping(patience=3)])
    # see actual result
    scores = description_model.evaluate(testX_desc, testY, verbose=1)
    print("Accuracy:{}".format(np.array(scores)))
    print("actual epoch num is: ", i)

# save model
filepath = "description_model_weights.h5"
description_model.save_weights(filepath)

Train on 87584 samples, validate on 9732 samples
Epoch 1/5
15616/87584 [====>.........................] - ETA: 38s - loss: 0.2205 - acc: 0.9064

KeyboardInterrupt: 

In [221]:
# preprocessing input nums
def preprocess_num_data(data_num):
    data_num_0 = [float(item[0]) for item in data_num]
    data_num_1 = [float(item[1]) for item in data_num]
    max0 = np.array(data_num_0).max()
    max1 = np.array(data_num_1).max()
    data_num_0 = [item / max0 * 2 - 1 for item in data_num_0]
    data_num_1 = [item / max1 * 2 - 1 for item in data_num_1]
    data_num= zip(data_num_0, data_num_1)
    data_num = np.array(data_num)

trainX_num = preprocess_num_data(trainX_num)
testX_num = preprocess_num_data(testX_num)
print(testX_num)

TypeError: 'NoneType' object is not iterable

In [222]:
print(testX_num)

None


In [201]:
# generate model for num data
model_input = Input(shape=(2,))
x = model_input
x = Dense(units=12, input_shape=(2,))(x)
x = BatchNormalization()(x)
x = Activation("sigmoid")(x)
x = Dropout(0.5)(x)
x = Dense(units=1, activation="sigmoid")(x)
num_model = Model(inputs=[model_input], outputs=[x])

# configure network for training
num_model.compile(loss='binary_crossentropy', optimizer="adam", metrics=["accuracy"])
print("model building finished\n", num_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_34 (InputLayer)        (None, 2)                 0         
_________________________________________________________________
dense_97 (Dense)             (None, 32)                96        
_________________________________________________________________
batch_normalization_42 (Batc (None, 32)                128       
_________________________________________________________________
activation_44 (Activation)   (None, 32)                0         
_________________________________________________________________
dropout_78 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_98 (Dense)             (None, 16)                528       
_________________________________________________________________
batch_normalization_43 (Batc (None, 16)                64        
__________

In [193]:
# do training
num_model.fit(trainX_num, trainY, epochs=100, batch_size=256, shuffle=True, 
                  verbose=1, validation_split=0.1, callbacks=[ReduceLROnPlateau(), EarlyStopping(patience=3)])

# save model
filepath = "num_model_weights.h5"
num_model.save_weights(filepath)

Train on 87584 samples, validate on 9732 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


In [202]:
# load models from file
num_model.load_weights("num_model_weights.h5")
description_model.load_weights("description_model_weights.h5")
print("models weights loaded")

models weights loaded


In [203]:
# pop the original output value and replace with a dropout layer
num_model.layers.pop()
# x = Dropout(0.25)(num_model.layers[-1].output)
# num_model = Model(inputs=[num_model.layers[0].input], outputs=[x])
description_model.layers.pop()
# x = Dropout(0.5)(description_model.layers[-1].output)
# description_model = Model(inputs=[description_model.layers[0].input], outputs=[x])

<keras.layers.core.Dense at 0x1c851257b8>

In [204]:
num_model.compile(loss='binary_crossentropy', optimizer="adam", metrics=["accuracy"])
description_model.compile(loss='binary_crossentropy', optimizer="adam", metrics=["accuracy"])
description_model.layers[-1].output

<tf.Tensor 'activation_43/Sigmoid:0' shape=(?, 64) dtype=float32>

In [213]:
# build hybrid model
# merge models
hybrid_output = concatenate([num_model.layers[-1].output, description_model.layers[-1].output])
# create lower part of the model
model_input = Input(shape=(80,))
# output = Dense(units=64)(model_input)
# output = Activation("sigmoid")(output)
# output = Dropout(0.5)(output)
# output = BatchNormalization()(output)
# output = Activation("sigmoid")(output)
# output = Dense(units=1, activation="sigmoid")(output)
output = Dense(units=1, activation="sigmoid")(model_input)
lower_model = Model(inputs=[model_input], outputs=[output])
# concatenate two models
hybrid_model = Model(inputs=[num_model.layers[0].input, description_model.layers[0].input], outputs=[lower_model(hybrid_output)])

# compile for training
hybrid_model.compile(loss='binary_crossentropy', optimizer="adam", metrics = ['accuracy'])
print("num_model summary", num_model.summary())
print("desc_model summary", description_model.summary())
print("hybrid model summary\n", hybrid_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_34 (InputLayer)        (None, 2)                 0         
_________________________________________________________________
dense_97 (Dense)             (None, 32)                96        
_________________________________________________________________
batch_normalization_42 (Batc (None, 32)                128       
_________________________________________________________________
activation_44 (Activation)   (None, 32)                0         
_________________________________________________________________
dropout_78 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_98 (Dense)             (None, 16)                528       
_________________________________________________________________
batch_normalization_43 (Batc (None, 16)                64        
__________

In [214]:
# do training
# do pretraining:
hybrid_model.fit([trainX_num, trainX_desc], trainY, epochs=1, batch_size=256, shuffle=True, 
                  verbose=1, validation_split=0.1, callbacks=[ReduceLROnPlateau(), EarlyStopping(patience=2)])
# see actual result
scores = hybrid_model.evaluate([testX_num, testX_desc], testY, verbose=1)
print("Accuracy:{}".format(np.array(scores)))
print("actual epoch num is: ", i)

hybrid_model.layers[-1].trainable = False
hybrid_model.compile(loss='binary_crossentropy', optimizer="adam", metrics = ['accuracy'])
print(hybrid_model.summary())

epoch_num = 5
for i in range(epoch_num):
    hybrid_model.fit([trainX_num, trainX_desc], trainY, epochs=10, batch_size=256, shuffle=True, 
                  verbose=1, validation_split=0.1, callbacks=[ReduceLROnPlateau(), EarlyStopping(patience=2)])
    # see actual result
    scores = hybrid_model.evaluate([testX_num, testX_desc], testY, verbose=1)
    print("Accuracy:{}".format(np.array(scores)))
    print("actual epoch num is: ", i)

# save model
filepath = "hybrid_model_weights.h5"
hybrid_model.save_weights(filepath)

Train on 87584 samples, validate on 9732 samples
Epoch 1/1
Accuracy:[ 0.74291515  0.73735319]
actual epoch num is:  0
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_28 (Embedding)        (None, 40, 64)       5240192     input_33[0][0]                   
__________________________________________________________________________________________________
gaussian_noise_16 (GaussianNois (None, 40, 64)       0           embedding_28[0][0]               
__________________________________________________________________________________________________
conv1d_36 (Conv1D)              (None, 36, 64)       20544       gaussian_noise_16[0][0]  

KeyboardInterrupt: 

In [189]:
hybrid_model.load_weights("hybrid_model_weights.h5")
scores = hybrid_model.evaluate([testX_num, testX_desc], testY, verbose=1)
print("Accuracy:{}".format(np.array(scores)))

Accuracy:[ 1.27852129  0.71247572]
