In [1]:
import pandas as pd

FILE = "/Users/Shared/data/HN_posts_year_to_Sep_26_2016.csv"

data = pd.read_csv(FILE)
data = data[["id", "title", "num_points"]]

In [2]:
import numpy as np

seed = 7
np.random.seed(seed)

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

title = data["title"]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(title)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 87282 unique tokens.


In [19]:
GOOD_THRESHOLD = 100
MAX_SEQUENCE_LENGTH = 24

train = data.sample(frac=0.8)
test = data.drop(train.index)

In [20]:
import os
import numpy as np

EMBEDDING_DIM = 300

embeddings_index = {}
f = open(os.path.join('/Users/Shared/data/glove.6B/', 'glove.6B.%dd.txt' % EMBEDDING_DIM))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [21]:


embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [22]:
def prepareData(df):    
    good = df[df["num_points"] >= GOOD_THRESHOLD]
    bad = df[df["num_points"] < GOOD_THRESHOLD]
    bad = bad.sample(n=good.shape[0])
    data = good.append(bad)
    data = data.sample(frac=1).reset_index(drop=True)
    
    num_points = data["num_points"].values

    y_train = np.zeros((len(num_points), 2), dtype=int)
    y_original = np.zeros((len(num_points)), dtype=int)
    for i in range(0, len(num_points)):
        y_train[i, 1] = int(num_points[i] >= GOOD_THRESHOLD)
        y_train[i, 0] = int(num_points[i] < GOOD_THRESHOLD)
        y_original[i] = int(num_points[i] >= GOOD_THRESHOLD)
        
    sequences = tokenizer.texts_to_sequences(data["title"])
    x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return x_train, y_train, y_original


In [23]:
x_full, y_full, y2_full = prepareData(data)
x_train, y_train, _ = prepareData(train)
x_test, y_test, _ = prepareData(test)

print(y2_full)

[0 0 0 ..., 0 1 0]


In [45]:
from keras.layers import Input, Convolution1D, MaxPooling1D, Dense, Flatten, Dropout, Embedding
from keras.models import Model
from keras.regularizers import l2, activity_l2

def create_baseline():
    embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Convolution1D(16, 5, activation='relu')(embedded_sequences)
    #x = Dropout(0.5)(x)
    x = MaxPooling1D()(x)
    x = Convolution1D(16, 5, activation='relu')(embedded_sequences)
    #x = Dropout(0.5)(x)
    x = MaxPooling1D()(x)
    #x = Dropout(0.5)(x)
    #x = Flatten()(embedded_sequences)
    x = Flatten()(x)
    x = Dense(50, init='uniform', activation='relu')(x)
    x = Dense(50, init='uniform', activation='relu')(x)

    preds = Dense(2, activation='softmax')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc', 'precision'])
    return model

In [46]:
from sklearn.metrics import precision_score, recall_score

def validate(model, x_test, y_test):
    test_truth = np.apply_along_axis(lambda x: np.argmax(x), 1, y_test)
    test_pred = model.predict(x_test)
    test_pred = np.apply_along_axis(lambda x: np.argmax(x), 1, test_pred)
    precision = precision_score(test_truth, test_pred)
    recall = recall_score(test_truth, test_pred)
    print(precision)
    print(recall)
    return precision, recall

In [47]:
from keras.callbacks import EarlyStopping
es = EarlyStopping('val_precision', patience=1, mode='max')

In [None]:
from sklearn.model_selection import StratifiedKFold

N = 5

kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=seed)
kfold.get_n_splits(x_full, y_full)

print(y2_full)

precision = 0
recall = 0
for train_index, test_index in kfold.split(x_full, y2_full):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = x_full[train_index], x_full[test_index]
    y_train, y_test = y_full[train_index], y_full[test_index]
    model = create_baseline()
    model.fit(x_train, y_train, nb_epoch=100, batch_size=128, validation_data=(x_test, y_test), callbacks=[])
    p, r = validate(model, x_test, y_test)
    precision += p
    recall += r
    
print("Precision: %.2f" % (precision / N))
print("Recall: %.2f" % (recall / N))

[0 0 0 ..., 0 1 0]
TRAIN: [    1     2     3 ..., 23074 23076 23077] TEST: [    0     6    12 ..., 23075 23078 23079]
Train on 18464 samples, validate on 4616 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
 3200/18464 [====>.........................] - ETA: 4s - loss: 0.0922 - acc: 0.9706 - precision: 0.9706