In [1]:
import pandas as pd

# Read the HN posts CSV (parsing `created_at` as datetimes) and keep only the
# four columns listed below, in this order.
data = pd.read_csv(
    "/Users/Shared/data/HN_posts_year_to_Sep_26_2016.csv",
    parse_dates=['created_at'],
).loc[:, ["id", "title", "created_at", "num_points"]]

In [24]:
from DataLoader import GloVe

# Vector dimensionality; it is substituted into the GloVe file name below.
WORD_DIM = 300
# GloVe.load2 is a project helper that is not shown in this notebook; the
# recorded cell output reports 400000 rows loaded. Presumably it returns a
# symbol -> vector mapping, since the result is handed to Embedding(...) below,
# which indexes it by key — TODO confirm.
glove = GloVe.load2('./data/GloVe/glove.6B.{}d.txt'.format(WORD_DIM))
# emb: Symbol to float32 of fixed DIMENSION
# Create an index mapping, index to symbol, symbol to index

class Embedding:
    """Symbol <-> index <-> vector lookup built from a symbol-to-vector mapping.

    Three special symbols are always added to the vocabulary:
    '<UNK>' (all 0.0), '<PAD>' (all 1.0) and '<GO>' (all -1.0).
    Indices are assigned in sorted symbol order.

    Args:
        emb: Non-empty mapping from symbol to vector. The dimension is taken
            from the first entry; all entries are expected to share it (this
            is not checked). The mapping itself is not modified.
        verbose: When true, print a short summary after construction.

    Raises:
        ValueError: If `emb` is empty, because the dimension cannot be inferred.
    """

    def __init__(self, emb, verbose = False):
        if not emb:
            raise ValueError('emb must contain at least one symbol')

        # Shallow copy: adding the special symbols must not leak into the
        # caller's mapping.
        self.emb = dict(emb)
        # Read one vector to get the dimension without materialising all keys.
        self.dim = len(next(iter(self.emb.values())))
        self.emb['<UNK>'] = [0.] * self.dim
        self.emb['<PAD>'] = [1.] * self.dim
        self.emb['<GO>'] = [-1.] * self.dim

        self.build_dicts()

        if verbose:
            self.describe()

    def describe(self):
        """Print the dimension, the vocabulary size and the first few index/symbol pairs."""
        print('Embedding Dimension: {}'.format(self.dim))
        print('Embedding Symbols: {}'.format(len(self.emb)))
        # Show at most 10 entries; small vocabularies have fewer indices.
        preview = min(10, len(self.idx2Sym))
        print('Index to symbol: {}'.format([(i, self.idx2Sym[i]) for i in range(preview)]))

    def getIndex(self, symbol):
        """Return the index of `symbol`, or the index of '<UNK>' if it is unknown."""
        if symbol in self.sym2Idx:
            return self.sym2Idx[symbol]
        return self.sym2Idx['<UNK>']

    def getEmb(self, symbol):
        """Return the vector of `symbol`, or the '<UNK>' vector if it is unknown."""
        return self.emb[self.idx2Sym[self.getIndex(symbol)]]

    def getSymbols(self, indices):
        """Map an iterable of indices back to their symbols (KeyError on an unknown index)."""
        return [self.idx2Sym[idx] for idx in indices]

    def getNumpyArray(self):
        """Return all vectors stacked as an array whose row i is the vector of index i."""
        # Imported here so the method does not depend on a later notebook cell
        # having already run `import numpy as np`.
        import numpy as np

        return np.array([self.emb[self.idx2Sym[idx]] for idx in range(len(self.emb))])

    def build_dicts(self):
        """(Re)build sym2Idx and idx2Sym from the current keys of self.emb."""
        self.sym2Idx = {key: index for index, key in enumerate(sorted(self.emb.keys()))}
        self.idx2Sym = {index: key for key, index in self.sym2Idx.items()}

# Build the symbol/index lookup over the loaded GloVe vectors and print a summary.
glove_emb = Embedding(glove, verbose=True)
# NOTE: `glove` is rebound here from the loaded mapping to the stacked matrix
# (row i = vector of index i); create_baseline() reads `glove` in this form.
glove = glove_emb.getNumpyArray()

Start: Loading Glove Model
End: Loaded 400000 rows.
Embedding Dimension: 300
Embedding Symbols: 400003
Index to symbol: [(0, '!'), (1, '!!'), (2, '!!!'), (3, '!!!!'), (4, '!!!!!'), (5, '!?'), (6, '!?!'), (7, '"'), (8, '#'), (9, '##')]


In [3]:
# num_points cutoff: prepareData labels a post 1 when num_points >= this value.
GOOD_THRESHOLD = 100
# Length every title is padded/truncated to; also the model's input length.
MAX_SEQUENCE_LENGTH = 30
# Mini-batch size passed to model.fit for the baseline model.
BATCH_SIZE = 64

In [4]:
import numpy as np

# Seed for numpy's global RNG; the same value is reused as random_state for
# StratifiedKFold in the cross-validation cells.
seed = 7
np.random.seed(seed)

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Titles of all loaded posts.
title = data["title"]

# Fit a Keras Tokenizer on the titles and report how many distinct tokens it found.
# NOTE(review): in the visible notebook `word_index` is only referenced again
# inside the disabled snippet in the next cell; the active pipeline indexes
# tokens through glove_emb instead.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(title)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 87282 unique tokens.


In [6]:
# Disabled snippet kept as a bare string literal: it is never executed, the
# cell merely echoes the string.
# NOTE(review): it references EMBEDDING_DIM and embeddings_index, neither of
# which is defined in the visible notebook; the active pipeline takes its
# embedding matrix from glove_emb.getNumpyArray() instead.
"""
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
"""

'\nembedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))\nfor word, i in word_index.items():\n    embedding_vector = embeddings_index.get(word)\n    if embedding_vector is not None:\n        # words not found in embedding index will be all-zeros.\n        embedding_matrix[i] = embedding_vector\n'

In [21]:
from TextPreprocess.Tokenizer.RegExp import tokenize

def preprocessStrings(strings):
    """Convert each string into the list of glove_emb indices of its lower-cased tokens.

    Tokens come from the project's `tokenize`; unknown tokens receive whatever
    index glove_emb.getIndex returns for them.
    """
    indexed = []
    for text in strings:
        row = []
        for token in tokenize(text):
            row.append(glove_emb.getIndex(token.lower()))
        indexed.append(row)
    return indexed

def prepareData(df):
    """Turn a posts frame into padded token-index inputs and binary labels.

    Args:
        df: DataFrame with a "title" column (strings) and a "num_points"
            column (numeric).

    Returns:
        Tuple (x_train, y_train, y_original):
            x_train: result of pad_sequences, one row per title, padded or
                truncated to MAX_SEQUENCE_LENGTH glove_emb indices; padding
                positions hold the '<PAD>' index.
            y_train: int array of shape (n, 2); column 0 is 1 where
                num_points < GOOD_THRESHOLD, column 1 is 1 where
                num_points >= GOOD_THRESHOLD.
            y_original: int array of shape (n,), equal to column 1 of y_train.
    """
    # Read from the argument, not from the notebook-level `data` frame.
    num_points = df["num_points"].values

    is_good = num_points >= GOOD_THRESHOLD
    y_original = is_good.astype(int)
    y_train = np.zeros((len(num_points), 2), dtype=int)
    # Both comparisons are evaluated (rather than 1 - is_good) so that a NaN
    # score yields 0 in both columns, as the element-wise loop did.
    y_train[:, 0] = num_points < GOOD_THRESHOLD
    y_train[:, 1] = is_good

    sequences = preprocessStrings(df["title"])
    # pad_sequences pads with 0 by default, but index 0 is a real symbol in
    # glove_emb; pad with the dedicated '<PAD>' index instead.
    x_train = pad_sequences(
        sequences,
        maxlen=MAX_SEQUENCE_LENGTH,
        value=glove_emb.getIndex('<PAD>'),
    )

    return x_train, y_train, y_original


In [22]:
# Sanity check: show which columns survived the selection in the load cell.
print(data.columns)

Index(['id', 'title', 'created_at', 'num_points'], dtype='object')


In [23]:
# Build model inputs and labels for the whole data set. The middle return value
# (the two-column one-hot labels) is discarded; y_full is the 1-D 0/1 label array.
X_full, _, y_full = prepareData(data)
# NOTE(review): `train` and `test` in the two disabled lines below are not
# defined anywhere in the visible notebook.
#x_train, y_train, y2_train = prepareData(train)
#x_test, y_test, y2_test = prepareData(test)

print(X_full.shape)

(293119, 30)


In [11]:
from sklearn.metrics import precision_score, recall_score

def validate(model, x_test, y_test):
    """Print and return precision and recall of `model` on a one-hot labelled set.

    Args:
        model: Object with a `predict(x)` method returning one row of class
            scores per sample.
        x_test: Inputs forwarded unchanged to `model.predict`.
        y_test: 2-D array of one-hot (or score) rows; the true class of each
            sample is the argmax of its row.

    Returns:
        Tuple (precision, recall) as computed by precision_score / recall_score.
    """
    # Vectorised row-wise argmax instead of a Python lambda per row.
    test_truth = np.argmax(y_test, axis=1)
    test_pred = np.argmax(model.predict(x_test), axis=1)
    precision = precision_score(test_truth, test_pred)
    recall = recall_score(test_truth, test_pred)
    print(precision)
    print(recall)
    return precision, recall

def validate_2(truth, pred):
    """Print and return precision and recall for already-computed predictions.

    Args:
        truth: 2-D array of one-hot (or score) rows; the true class of each
            sample is the argmax of its row.
        pred: 2-D array of predicted class scores, one row per sample.

    Returns:
        Tuple (precision, recall) as computed by precision_score / recall_score.
    """
    # Vectorised row-wise argmax instead of a Python lambda per row.
    truth = np.argmax(truth, axis=1)
    pred = np.argmax(pred, axis=1)
    precision = precision_score(truth, pred)
    recall = recall_score(truth, pred)
    print(precision)
    print(recall)
    return precision, recall

In [29]:
from keras.layers import Input, Convolution1D, MaxPooling1D, Dense, Flatten, Dropout, Embedding
from keras.models import Model, Sequential
from keras.regularizers import l2

def create_model():
    """Build and compile a small dense classifier over 5 input features.

    The network is Dense(10) followed by a 2-way softmax, compiled with
    rmsprop and categorical cross-entropy, so it expects two-column one-hot
    labels such as the `y_train` array returned by prepareData.

    Returns:
        The compiled Keras Sequential model.
    """
    model = Sequential()
    model.add(Dense(10, input_shape=(5,)))
    # Two output units: a softmax over a single unit is constantly 1.0 and can
    # neither learn nor match the one-hot labels / argmax used in validate().
    model.add(Dense(2, activation='softmax'))
    # NOTE(review): whether the 'precision' metric string is accepted depends
    # on the installed Keras version — confirm.
    model.compile(optimizer='rmsprop',
          loss='categorical_crossentropy',
          metrics=['accuracy', 'precision'])
    return model
    

def create_baseline():
    """Build and compile the baseline title classifier.

    Architecture: frozen embedding layer initialised from the `glove` matrix
    -> Flatten -> Dropout(0.3) -> Dense(64, tanh) -> Dropout(0.3) ->
    Dense(1, sigmoid). Compiled with adam and binary cross-entropy, so it
    expects 1-D 0/1 labels.

    Returns:
        The compiled Keras Model taking int32 sequences of length
        MAX_SEQUENCE_LENGTH.
    """
    # The notebook also defines its own `Embedding` class; bind the Keras layer
    # under a local alias so this function does not depend on which of the two
    # definitions was executed last.
    from keras.layers import Embedding as KerasEmbedding

    embedding_layer = KerasEmbedding(glove.shape[0],
                            glove.shape[1],
                            weights=[glove],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Flatten()(embedded_sequences)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='tanh')(x)
    x = Dropout(0.3)(x)
    preds = Dense(1, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
    return model

In [30]:
from keras.callbacks import EarlyStopping
# Early-stopping callback monitoring 'val_loss' with patience=3.
es = EarlyStopping('val_loss', patience=3)

In [31]:
# Balance the two classes: choose weights so that w_0 * count_0 == w_1 * count_1.
# Fixing w_0 = 1 gives w_1 = count_0 / count_1. y_full holds 0/1 labels, so
# sum(y_full) is count_1 and len(y_full) - sum(y_full) is count_0.

class_weight = {
    0: 1,
    1: (len(y_full) - sum(y_full)) / sum(y_full)
}

print(class_weight)

{0: 1, 1: 24.400259965337955}


In [None]:
# Train the baseline on the full data set with class weighting and early
# stopping; validation_split=0.2 asks Keras to hold out part of it for validation.
model = create_baseline()
model.fit(
    X_full,
    y_full,
    validation_split=0.2,
    class_weight=class_weight,
    callbacks=[es],
    shuffle=True,
    epochs=200,
    batch_size=BATCH_SIZE)

Train on 234495 samples, validate on 58624 samples
Epoch 1/200

In [None]:


from sklearn.model_selection import StratifiedKFold

# Stratified N-fold cross-validation of create_model(), accumulating the
# per-fold precision/recall returned by validate() and printing their means.
# NOTE(review): x_group_train, y_train, y2_train, x_group_test and y_group_test
# are not defined anywhere in the visible notebook, so this cell cannot run
# as-is on a fresh kernel — confirm where they are supposed to come from.
N = 5

kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=seed)
# The return value is discarded, so this call has no visible effect here.
kfold.get_n_splits(x_group_train, y_train)

print(y2_train)

precision = 0
recall = 0
# Folds are stratified on y2_train, while the model is trained on y_train.
for train_index, test_index in kfold.split(x_group_train, y2_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_k_train, x_k_test = x_group_train[train_index], x_group_train[test_index]
    y_k_train, y_k_test = y_train[train_index], y_train[test_index]
    model = create_model()
    # NOTE(review): `nb_epoch` is used here while the baseline fit cell uses
    # `epochs` — confirm which spelling the installed Keras expects.
    model.fit(x_k_train, y_k_train, nb_epoch=10, batch_size=32, validation_data=(x_k_test, y_k_test), callbacks=[])
    p, r = validate(model, x_k_test, y_k_test)
    precision += p
    recall += r
    # Additional report on the separate x_group_test / y_group_test set; its
    # result is printed by validate() but not accumulated.
    validate(model, x_group_test, y_group_test)
    
print("Precision: %.2f" % (precision / N))
print("Recall: %.2f" % (recall / N))

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified N-fold cross-validation of the baseline model on the full data set
# (X_full / y_full as built by prepareData above); prints per-fold and mean
# precision and recall.
N = 5

kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=seed)

precision = 0
recall = 0
# y_full is the 1-D 0/1 label array, used both for stratification and training.
for train_index, test_index in kfold.split(X_full, y_full):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = X_full[train_index], X_full[test_index]
    y_train, y_test = y_full[train_index], y_full[test_index]
    model = create_baseline()
    # `epochs` matches the keyword used by the baseline fit cell above.
    model.fit(x_train, y_train, epochs=100, batch_size=128, validation_data=(x_test, y_test), callbacks=[es])
    # create_baseline() ends in a single sigmoid unit, so the predicted class is
    # the score thresholded at 0.5; validate() takes an argmax over axis 1 and
    # cannot be used with 1-D labels and a one-column prediction.
    y_pred = (model.predict(x_test) > 0.5).astype(int).ravel()
    p = precision_score(y_test, y_pred)
    r = recall_score(y_test, y_pred)
    print(p)
    print(r)
    precision += p
    recall += r

print("Precision: %.2f" % (precision / N))
print("Recall: %.2f" % (recall / N))