In [1]:
import numpy as np
import pandas as pd
from gensim.models import Phrases
from gensim.corpora import Dictionary
from stop_words import get_stop_words
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from keras import backend as K
from keras.models import Sequential, Model, load_model, save_model
from keras.layers import Dense
from keras.layers import LSTM, GRU, Conv1D, MaxPooling1D, Flatten
from keras.layers import GaussianNoise, BatchNormalization, Dropout
from keras.layers import Activation, Input, concatenate, Reshape, merge, dot
from keras.optimizers import Adam, RMSprop, SGD
from keras.regularizers import l1, l2
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams, make_sampling_table
from keras.callbacks import Callback, LambdaCallback, TensorBoard, ReduceLROnPlateau, EarlyStopping
from keras.utils import np_utils
from random import shuffle
import time
import pylab as pl
from ipywidgets import FloatProgress
from IPython import display
import matplotlib.pyplot as plt
%matplotlib notebook

Using TensorFlow backend.


In [2]:
# Create the TensorFlow session (intra_op_parallelism_threads=8) and register
# it as the session Keras should use.
session_config = tf.ConfigProto(intra_op_parallelism_threads=8)
sess = tf.Session(config=session_config)
K.set_session(sess)

In [3]:
def is_desired_letter(char):
    """Tell whether a single character is kept during text cleaning.

    Accepted characters are lowercase ASCII letters, ASCII digits, the space
    and the apostrophe. Everything else, uppercase letters included, is
    rejected.

    Parameters
    ----------
    char : str
        A one-character string; ``ord`` raises ``TypeError`` for anything else.

    Returns
    -------
    bool
        True when the character should be kept.
    """
    code_point = ord(char)
    if ord("a") <= code_point <= ord("z"):
        return True
    if ord("0") <= code_point <= ord("9"):
        return True
    return code_point in (ord(" "), ord("'"))


def get_text_data(csv_path="input/kickstarter_train.csv", text_column="desc", min_tokens=4):
    """Load free-text descriptions and return them as cleaned, tokenized sentences.

    Each string cell of ``text_column`` is split into sentences with
    ``sent_tokenize``; every sentence is lowercased, stripped of all characters
    rejected by ``is_desired_letter`` and split on whitespace.

    Parameters
    ----------
    csv_path : str
        CSV file to read. Defaults to the path originally hard-coded here.
    text_column : str
        Name of the column holding the raw text.
    min_tokens : int
        Sentences with fewer tokens than this are dropped. The default of 4
        matches the original ``len(sent) > 3`` filter.

    Returns
    -------
    list of list of str
        One token list per retained sentence.
    """
    train_data = pd.read_csv(csv_path)

    train_texts = []
    for text in train_data[text_column]:
        # Skip cells that are not strings (e.g. missing values).
        if not isinstance(text, str):
            continue
        for sent in sent_tokenize(text):
            # Undesired characters are deleted, not replaced by a space, so
            # "well-known" becomes the single token "wellknown".
            # NOTE(review): confirm that merging is intended.
            cleaned = ''.join(char for char in sent.lower() if is_desired_letter(char))
            tokens = cleaned.split()
            if len(tokens) >= min_tokens:
                train_texts.append(tokens)

    return train_texts

In [4]:
# get training texts from disk
# get_text_data() returns a list of sentences, each one a list of cleaned word tokens
train_texts = get_text_data()
print("data grabbed")

data grabbed


### Sentence segmentation improved the results
### Passing common terms to `Phrases` makes the detected phrases more reasonable

In [5]:
# train bigram phrases
# NOTE(review): an earlier note recorded min_count 15, threshold 0.8, scorer 'npmi',
# max_vocab_size 50000 as working well; the call below uses min_count=20 and
# max_vocab_size=40000 -- confirm which setting is intended.
#
# Build a new list rather than extending the one returned by get_stop_words in place:
# that list may be shared/cached by the library, in which case every re-run of this
# cell would append the extra words to it again.
common_words = list(get_stop_words('en')) + ["of", "with", "without", "and", "or", "the", "a"]
bigram = Phrases(common_terms=common_words, sentences=train_texts, scoring='npmi', min_count=20, threshold=0.8, max_vocab_size=40000)
print("bigram vocabulary size: ", len(bigram.vocab))

bigram vocabulary size:  15941


In [7]:
# Persist the trained phrase model; the file name records its vocabulary size.
bigram_vocab_size = len(bigram.vocab)
bigram.save("my_bigram_with_vocab_{}.pkl".format(bigram_vocab_size))

In [6]:
# Build a dictionary over the phrase-merged corpus and filter it by document frequency.
no_below = 8  # passed to filter_extremes as its no_below threshold
non_sense = "9898989898i98989i89"  # placeholder token, added to the dictionary in a later cell
phrased_texts = bigram[train_texts]
dictionary = Dictionary(documents=phrased_texts)
dictionary.filter_extremes(no_below=no_below)
dictionary.compactify()
print("current dictionary length is: ", len(dictionary))
print("dictionary vocabulary adding finished, now start non_sense rotation...")



current dictionary length is:  13151
dictionary vocabulary adding finished, now start non_sense rotation...


In [7]:
# Rebuild the dictionary so that the non_sense placeholder gets index 0 and every
# real token follows in order of decreasing document frequency. This prepares for
# later padding and for the sampling table.
#
# The placeholder is looked up by token instead of being assumed to sit at the
# last position, so running this cell a second time gives the same dictionary.
if non_sense not in dictionary.token2id:
    dictionary.add_documents([[non_sense]])
non_sense_id = dictionary.token2id[non_sense]

# (token, document frequency) for every real token, visited in id order so that
# the stable sort below breaks frequency ties by the current id.
index_dfs_list = [
    (token, dictionary.dfs[token_id])
    for token, token_id in sorted(dictionary.token2id.items(), key=lambda item: item[1])
    if token_id != non_sense_id
]
index_dfs_list = sorted(index_dfs_list, key=lambda x: -x[1])

dictionary.filter_tokens(good_ids=[non_sense_id])  # preserve only non_sense
assert dictionary.token2id[non_sense] == 0, "placeholder token must end up at index 0"

# add vocabs back in sequence
for token, doc_freq in index_dfs_list:
    dictionary.add_documents([[token]])
    # add_documents counts the one-token document above; restore the real frequency
    dictionary.dfs[dictionary.token2id[token]] = doc_freq

n_vocab = len(dictionary)
print("dictionary vocabulary length:", n_vocab)

dictionary vocabulary length: 13152


In [8]:
# Store the dictionary on disk; the file name encodes no_below and the vocabulary size.
dictionary_path = "dictionary_least_dfs_{}_vocab_{}.pkl".format(no_below, n_vocab)
dictionary.save(dictionary_path)
print("dictionary saved")

dictionary saved


In [9]:
# Apply the phrase model, map every token known to the dictionary onto its id,
# then discard sequences that contain five or fewer ids.
token_to_id = dictionary.token2id
train_texts = [
    [token_to_id[word] for word in text if word in token_to_id]
    for text in bigram[train_texts]
]
train_texts = [ids for ids in train_texts if len(ids) > 5]
print("tokenizing is done")
print("left texts num is: ", len(train_texts))
# length statistics of the remaining sequences
t = np.array([len(ids) for ids in train_texts])
print("The max len is {}, the average len is {}, the min len is {}".format(t.max(), t.mean(), t.min()))



tokenizing is done
left texts num is:  133686
The max len is 34, the average len is 13.97717786454827, the min len is 6


In [11]:
# generate and save training data
# progress bar running from 0 to 100
f = FloatProgress(min=0, max=100)
display.display(f)

# The sampling table depends only on the vocabulary size, so build it once
# instead of rebuilding it for every text.
sampling_table = make_sampling_table(n_vocab)

# per-text arrays, concatenated after the loop
training_targets = []
training_contexts = []
training_labels = []
shuffle(train_texts)
n_texts = len(train_texts)
for i, text in enumerate(train_texts):
    # advance the bar for every text, including those that yield no pairs,
    # so that it reaches 100 when the loop finishes
    f.value = 100 * float(i + 1) / float(n_texts)
    pairs, labels = skipgrams(sampling_table=sampling_table, sequence=text, vocabulary_size=n_vocab,
                              negative_samples=1., window_size=3)
    if not pairs:
        continue
    # split [(target, context), ...] into two parallel sequences
    targets, contexts = zip(*pairs)
    training_targets.append(np.array(targets))
    training_contexts.append(np.array(contexts))
    training_labels.append(np.array(labels))

if not training_labels:
    # np.hstack would otherwise fail with a less helpful message
    raise ValueError("no skip-gram pairs were generated; check train_texts and n_vocab")

training_pairs = [np.hstack(training_targets), np.hstack(training_contexts)]
training_labels = np.hstack(training_labels)
# save the arrays to file, then read them back
np.savez("training pairs and labels.npz", target=training_pairs[0], context=training_pairs[1], labels=training_labels)
# the context manager closes the archive once the arrays have been read into memory
with np.load("training pairs and labels.npz") as training_pairs_labels:
    training_pairs = [training_pairs_labels['target'], training_pairs_labels['context']]
    training_labels = training_pairs_labels['labels']
print("preparing training data is done")
print("data no is: ", len(training_labels))

A Jupyter Widget

preparing training data is done


In [12]:
# Write the target, context and label arrays to a single .npz archive.
# NOTE(review): the previous cell already saves these arrays to the same file,
# so this call looks redundant -- confirm before removing.
np.savez("training pairs and labels.npz", target=training_pairs[0], context=training_pairs[1], labels=training_labels)