In [1]:
import pandas as pd

FILE = "/Users/Shared/data/HN_posts_year_to_Sep_26_2016.csv"

data = pd.read_csv(FILE)
data = data[["id", "title", "num_points"]]

In [2]:
import numpy as np

seed = 7
np.random.seed(seed)

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

title = data["title"]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(title)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 87282 unique tokens.


In [25]:
num_points = data['num_points'].values
print(data['num_points'].describe())
cur_dict = {}
for i in range(0, 6000):
    cur_dict[i] = len(num_points[num_points < i]) / len(num_points)

data['cur_num_points'] = data['num_points'].apply(lambda x: cur_dict[x])

count    293119.000000
mean         15.025324
std          58.504103
min           1.000000
25%           1.000000
50%           2.000000
75%           4.000000
max        5771.000000
Name: num_points, dtype: float64


In [26]:
data['cur_num_points'].describe()

count    293119.000000
mean          0.402988
std           0.344124
min           0.000000
25%           0.000000
50%           0.328904
75%           0.707586
max           0.999997
Name: cur_num_points, dtype: float64

In [27]:
GOOD_THRESHOLD = 100
MAX_SEQUENCE_LENGTH = 24

train = data.sample(frac=0.8)
test = data.drop(train.index)

In [28]:
def prepareData(df):  
    good = df[df["num_points"] >= GOOD_THRESHOLD]
    bad = df[df["num_points"] < GOOD_THRESHOLD]
    bad = bad.sample(n=good.shape[0])
    dt = good.append(bad)
    dt = dt.sample(frac=1).reset_index(drop=True)
    
    num_points = dt["num_points"].values
    cur_num_points = dt["cur_num_points"].values

    y_train = np.zeros((len(num_points), 2), dtype=int)
    y_original = np.zeros((len(num_points)), dtype=int)
    for i in range(0, len(num_points)):
        y_train[i, 1] = int(num_points[i] >= GOOD_THRESHOLD)
        y_train[i, 0] = int(num_points[i] < GOOD_THRESHOLD)
        y_original[i] = int(num_points[i] >= GOOD_THRESHOLD)
        
    sequences = tokenizer.texts_to_sequences(dt["title"])
    x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return x_train, y_train, y_original, cur_num_points


In [29]:
x_full, y_full, y2_full, y_cur = prepareData(data)
x_train, y_train, _, _ = prepareData(train)
x_test, y_test, _, _ = prepareData(test)

In [30]:
y_cur

array([ 0.79786025,  0.83267547,  0.99638713, ...,  0.92781771,
        0.97819998,  0.70758634])

In [31]:
import os
import numpy as np

embeddings_index = {}
f = open(os.path.join('/Users/Shared/data/glove.6B/', 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [32]:
EMBEDDING_DIM = 100

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [99]:
from keras.layers import Input, Convolution1D, MaxPooling1D, Dense, Flatten, Dropout, Embedding, LSTM,BatchNormalization
from keras.models import Model
from keras.regularizers import l2, activity_l2

def create_baseline():
    embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    #x = LSTM(64)(embedded_sequences)
    x = Convolution1D(64, 5, activation='relu')(embedded_sequences)
    #x = Dropout(0.5)(x)
    #x = MaxPooling1D()(x)
    #x = Convolution1D(16, 5, activation='relu')(embedded_sequences)
    #x = Dropout(0.5)(x)
    #x = MaxPooling1D()(x)
    #x = Dropout(0.5)(x)
    x = Flatten()(x)
    x = Dense(32, init='uniform', activation='relu')(x)
    x = Dense(32, init='uniform', activation='relu')(x)
    preds = Dense(1)(x)
    model = Model(sequence_input, preds)
    model.compile(loss='mse',
              optimizer='adam')
    return model

In [100]:
from sklearn.metrics import precision_score, recall_score
from scipy.stats import describe

def validate(model, x_test, y_test):
    test_truth = y_test
    test_pred = model.predict(x_test)
    print(describe(test_truth))
    print(describe(test_pred))
    print(cur_dict[GOOD_THRESHOLD])
    test_pred[test_pred >= cur_dict[GOOD_THRESHOLD]] = 1
    test_pred[test_pred < cur_dict[GOOD_THRESHOLD]] = 0
    print(describe(test_pred))
    precision = precision_score(test_truth, test_pred)
    recall = recall_score(test_truth, test_pred)
    print(precision)
    print(recall)
    return precision, recall

In [101]:
from keras.callbacks import EarlyStopping
es = EarlyStopping('val_loss', patience=3, mode='min')

In [102]:
from sklearn.model_selection import StratifiedKFold

N = 5

kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=seed)
kfold.get_n_splits(x_full, y_full)

precision = 0
recall = 0
for train_index, test_index in kfold.split(x_full, y2_full):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = x_full[train_index], x_full[test_index]
    y_train, y_test = y_full[train_index], y_full[test_index]
    y_cur_train, y_cur_test = y_cur[train_index], y_cur[test_index] 
    y2_full_train, y2_full_test = y2_full[train_index], y2_full[test_index] 
    model = create_baseline()
    model.fit(x_train, y_cur_train, nb_epoch=10, batch_size=128, validation_data=(x_test, y_cur_test), callbacks=[])
    p, r = validate(model, x_test, y2_full_test)
    precision += p
    recall += r
    
print("Precision: %.2f" % (precision / N))
print("Recall: %.2f" % (recall / N))

TRAIN: [    1     2     3 ..., 23074 23076 23077] TEST: [    0     9    10 ..., 23075 23078 23079]
Train on 18464 samples, validate on 4616 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
DescribeResult(nobs=4616, minmax=(0, 1), mean=0.5, variance=0.25005417118093176, skewness=0.0, kurtosis=-2.0)
DescribeResult(nobs=4616, minmax=(array([ 0.12903117], dtype=float32), array([ 1.50590515], dtype=float32)), mean=array([ 0.75892794], dtype=float32), variance=array([ 0.03249393], dtype=float32), skewness=array([ 0.10973765], dtype=float32), kurtosis=array([ 0.62314844], dtype=float32))
0.9606303242027981
DescribeResult(nobs=4616, minmax=(array([ 0.], dtype=float32), array([ 1.], dtype=float32)), mean=array([ 0.1215338], dtype=float32), variance=array([ 0.10678646], dtype=float32), skewness=array([ 2.31657362], dtype=float32), kurtosis=array([ 3.36651087], dtype=float32))
0.509803921569
0.123916811092
TRAIN: [    0     1  