In [27]:
import pandas as pd

# Path to the posts CSV read below.
FILE = "/Users/Shared/data/HN_posts_year_to_Sep_26_2016.csv"

# Load the CSV and keep only the three columns the rest of the notebook uses.
data = pd.read_csv(FILE)[["id", "title", "num_points"]]

In [28]:
import numpy as np

# Single seed shared by the notebook: it seeds NumPy's global generator here
# and is passed as random_state to the StratifiedKFold splitters further down.
seed = 7
np.random.seed(seed)

In [29]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Text column used to build the vocabulary.
title = data["title"]

# Fit the tokenizer on every title in the full frame (train and test rows alike).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(title)
# Mapping word -> integer index learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 87282 unique tokens.


In [30]:
# Posts with num_points at or above this value are labelled "good" in prepareData.
GOOD_THRESHOLD = 100
# Length every token sequence is padded/truncated to (pad_sequences maxlen).
MAX_SEQUENCE_LENGTH = 24

# Random 80% of the rows for training; the remaining rows (by index) for testing.
# No random_state is passed, so the draw depends on the current RNG state.
train = data.sample(frac=0.8)
test = data.drop(train.index)

In [31]:
def prepareData(df):
    """Build a class-balanced, shuffled dataset from a frame of posts.

    Rows with ``num_points >= GOOD_THRESHOLD`` form the "good" class and rows
    below the threshold the "bad" class. The bad class is randomly
    down-sampled to the size of the good class, both are concatenated, and
    the result is shuffled. Titles are converted to padded integer sequences
    with the module-level ``tokenizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the ``"title"`` and ``"num_points"`` columns.

    Returns
    -------
    x_train : numpy.ndarray
        Output of ``pad_sequences`` with ``maxlen=MAX_SEQUENCE_LENGTH``,
        one row per title.
    y_train : numpy.ndarray of int, shape (n, 2)
        One-hot labels: column 0 is 1 for bad rows, column 1 is 1 for good rows.
    y_original : numpy.ndarray of int, shape (n,)
        Flat labels: 1 for good rows, 0 otherwise.

    Notes
    -----
    ``bad.sample(n=...)`` samples without replacement, so pandas is expected
    to raise ``ValueError`` when ``df`` has fewer bad rows than good rows.
    The sampling and shuffling use no explicit ``random_state``.
    """
    good = df[df["num_points"] >= GOOD_THRESHOLD]
    bad = df[df["num_points"] < GOOD_THRESHOLD]
    bad = bad.sample(n=good.shape[0])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on old and new versions.
    balanced = pd.concat([good, bad])
    balanced = balanced.sample(frac=1).reset_index(drop=True)

    num_points = balanced["num_points"].values

    # Vectorised label construction (replaces a per-row Python loop).
    is_good = num_points >= GOOD_THRESHOLD
    is_bad = num_points < GOOD_THRESHOLD
    y_train = np.column_stack((is_bad, is_good)).astype(int)
    y_original = is_good.astype(int)

    sequences = tokenizer.texts_to_sequences(balanced["title"])
    x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return x_train, y_train, y_original


In [43]:
# Each call balances and shuffles its input independently:
#   *_full  -> every row (used by the baseline-network cross-validation)
#   *_train -> the 80% split, *_test -> the remaining 20%.
# y_* are one-hot (n, 2) labels, y2_* the flat 0/1 labels.
x_full, y_full, y2_full = prepareData(data)
x_train, y_train, y2_train = prepareData(train)
x_test, y_test, y2_test = prepareData(test)

print(y2_full)

[0 0 1 ..., 0 1 1]


In [33]:
import os
import numpy as np

# Parse the GloVe text file into a word -> float32 vector lookup.
# Each line is "<word> <v1> <v2> ...", split on whitespace.
embeddings_index = {}
# The context manager closes the file even if parsing a line raises, and the
# explicit encoding avoids depending on the platform default (the GloVe text
# dump is UTF-8).
with open(os.path.join('/Users/Shared/data/glove.6B/', 'glove.6B.100d.txt'),
          encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [34]:
# Width of each word vector; matches the 100-d GloVe file read into
# embeddings_index.
EMBEDDING_DIM = 100

# One row per tokenizer index plus one extra row. Presumably the tokenizer's
# indices start at 1, leaving row 0 for padding -- TODO confirm.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Copy the pretrained vector; words missing from embeddings_index
        # keep their all-zero row.
        embedding_matrix[i] = embedding_vector

In [36]:
from sklearn.metrics import precision_score, recall_score

def validate(model, x_test, y_test):
    """Print and return precision and recall of ``model`` on a labelled set.

    Parameters
    ----------
    model : object exposing ``predict(x)`` that returns one score per class
        for every row (2-D output).
    x_test : inputs forwarded unchanged to ``model.predict``.
    y_test : 2-D one-hot ground-truth labels, one row per sample.

    Returns
    -------
    (precision, recall) : tuple
        Values from ``precision_score`` / ``recall_score`` computed on the
        arg-max class of each row.
    """
    # np.argmax over axis 1 is the vectorised form of applying argmax to
    # every row; it yields the class index per sample.
    test_truth = np.argmax(y_test, axis=1)
    test_pred = np.argmax(model.predict(x_test), axis=1)
    precision = precision_score(test_truth, test_pred)
    recall = recall_score(test_truth, test_pred)
    print(precision)
    print(recall)
    return precision, recall

def validate_2(truth, pred):
    """Print and return precision and recall for two 2-D score arrays.

    Parameters
    ----------
    truth : 2-D array, one row per sample (e.g. one-hot labels).
    pred : 2-D array of per-class scores with the same number of rows.

    Both arguments are reduced to class indices with an arg-max over axis 1,
    so flat 1-D label vectors are not accepted.

    Returns
    -------
    (precision, recall) : tuple
        Values from ``precision_score`` / ``recall_score``.
    """
    # Vectorised row-wise arg-max instead of a Python callback per row.
    truth = np.argmax(truth, axis=1)
    pred = np.argmax(pred, axis=1)
    precision = precision_score(truth, pred)
    recall = recall_score(truth, pred)
    print(precision)
    print(recall)
    return precision, recall

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Random forest fitted on the padded token-id sequences against the flat 0/1
# labels. The bare last expression makes the notebook display the fitted
# estimator's repr.
rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion="entropy", random_state=1)
rf.fit(x_train, y2_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [43]:
# Hard class predictions from the random forest on the held-out split.
res = rf.predict(x_test)

#validate_2(y_test, res)
# Score against the flat 0/1 labels (y2_test), the same label format the
# forest was fitted on.
precision = precision_score(y2_test, res)
recall = recall_score(y2_test, res)
print(precision)
print(recall)

0.520553872782
0.496696944674


In [29]:
from sklearn.ensemble import GradientBoostingClassifier

# Sanity-check the shapes: features are (n, MAX_SEQUENCE_LENGTH), one-hot
# labels are (n, 2).
print(x_train.shape)
print(y_train.shape)

# Gradient boosting with depth-1 trees, fitted on the flat 0/1 labels.
gbc = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=1)
gbc.fit(x_train, y2_train)

(18236, 24)
(18236, 2)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, presort='auto', random_state=1,
              subsample=1.0, verbose=0, warm_start=False)

In [30]:
# Hard class predictions from the gradient-boosting model on the held-out split.
res = gbc.predict(x_test)
print(res)

# Precision / recall against the flat 0/1 test labels.
precision = precision_score(y2_test, res)
recall = recall_score(y2_test, res)
print(precision)
print(recall)

[0 0 0 ..., 1 0 0]
0.527832609554
0.552023121387


In [31]:
import xgboost as xgb

# Booster hyper-parameters handed unchanged to xgb.train.
params = {"objective": "binary:logistic",
          "eta": 0.15,
          "max_depth": 7,
          "min_child_weight": 10,
          "silent": 1,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "seed": 1}
# Third positional argument of xgb.train (number of boosting rounds).
num_trees=500
# Train on the padded sequences with the flat 0/1 labels.
gbm = xgb.train(params, xgb.DMatrix(x_train, y2_train), num_trees)

In [32]:
# Raw xgboost scores for the held-out split (one float per row).
res = gbm.predict(xgb.DMatrix(x_test))

print(res)
# Threshold the scores at 0.5 to get 0/1 class predictions; the vectorised
# comparison replaces an element-by-element Python loop.
res2 = (res >= 0.5).astype(int)
print(res2)

precision = precision_score(y2_test, res2)
recall = recall_score(y2_test, res2)
print(precision)
print(recall)

[ 0.57553232  0.23762675  0.47189417 ...,  0.55435169  0.23429342
  0.38367593]
[1 0 0 ..., 1 0 0]
0.541924824453
0.541701073493


In [57]:
# Base-model outputs on the training rows, used as meta-features for stacking.
rf_x_group_train = rf.predict_proba(x_train)
gbc_x_group_train = gbc.predict_proba(x_train)
gbm_x_group_train = gbm.predict(xgb.DMatrix(x_train))

print(rf_x_group_train.shape)
print(gbc_x_group_train.shape)
print(gbm_x_group_train.shape)

# NOTE(review): rf, gbc and gbm were fitted on these same x_train rows, so
# these are in-sample predictions. The recorded cross-validation output
# (~0.996 precision on the folds vs ~0.53 on x_group_test) suggests the
# meta-learner is seeing leaked labels; consider out-of-fold predictions
# for the training meta-features.
#
# Columns: rf proba (2), gbc proba (2), xgboost score (1) -> 5 features.
# column_stack replaces the row-by-row copy loop.
x_group_train = np.column_stack(
    [rf_x_group_train, gbc_x_group_train, gbm_x_group_train])

print(x_group_train.shape)

(18236, 2)
(18236, 2)
(18236,)
(18236, 5)


In [68]:
# Base-model outputs on the held-out rows, laid out exactly like the
# training meta-features.
rf_x_group_test = rf.predict_proba(x_test)
gbc_x_group_test = gbc.predict_proba(x_test)
gbm_x_group_test = gbm.predict(xgb.DMatrix(x_test))

print(rf_x_group_test.shape)
print(gbc_x_group_test.shape)
print(gbm_x_group_test.shape)

# Columns: rf proba (2), gbc proba (2), xgboost score (1) -> 5 features.
# column_stack replaces the row-by-row copy loop.
x_group_test = np.column_stack(
    [rf_x_group_test, gbc_x_group_test, gbm_x_group_test])

print(x_group_test.shape)

# One-hot test labels, in the format the stacked network is trained against.
y_group_test = y_test
    


#print(zip(rf_x_group_train,gbc_x_group_train))
#print(np.stack([rf_x_group_train,gbm_x_group_train]))


(4844, 2)
(4844, 2)
(4844,)
(4844, 5)


In [63]:
from keras.layers import Input, Convolution1D, MaxPooling1D, Dense, Flatten, Dropout, Embedding
from keras.models import Model, Sequential
from keras.regularizers import l2, activity_l2

def create_model():
    """Return a freshly compiled stacking network.

    The network takes 5 input features (the width of the stacked
    base-model outputs), passes them through a 10-unit Dense layer declared
    without an activation argument, and ends in a 2-unit softmax. It is
    compiled with rmsprop and categorical cross-entropy, tracking accuracy
    and precision.
    """
    stacker = Sequential([
        Dense(10, input_shape=(5,)),
        Dense(2, activation='softmax'),
    ])
    stacker.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy', 'precision'],
    )
    return stacker
    

def create_baseline():
    """Return a freshly compiled baseline network over padded title sequences.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    Flatten -> Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5) ->
    Dense(2, softmax). Input is an int32 sequence of length
    ``MAX_SEQUENCE_LENGTH``.

    Relies on the module-level ``word_index``, ``EMBEDDING_DIM``,
    ``MAX_SEQUENCE_LENGTH`` and ``embedding_matrix``.
    """
    # The layer is not trainable, so it must be given the pretrained vectors
    # explicitly; without `weights` it would stay at its random
    # initialisation and embedding_matrix would never be used.
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Flatten()(embedded_sequences)
    x = Dropout(0.5)(x)
    x = Dense(64, init='uniform', activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(2, activation='softmax')(x)
    model = Model(sequence_input, preds)
    # NOTE(review): create_model uses categorical_crossentropy for the same
    # one-hot targets; consider aligning the two losses.
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc', 'precision'])
    return model

In [64]:
from keras.callbacks import EarlyStopping
# Early-stopping callback: watches validation precision (higher is better)
# with a patience of one epoch.
es = EarlyStopping(monitor='val_precision', patience=1, mode='max')

In [69]:
from sklearn.model_selection import StratifiedKFold

# Number of cross-validation folds.
N = 5

# Stratified, shuffled folds over the stacked meta-features.
kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=seed)
# Return value is discarded; this call has no effect on the loop below.
kfold.get_n_splits(x_group_train, y_train)

print(y2_train)

# Running sums of the per-fold scores, averaged after the loop.
precision = 0
recall = 0
# Stratify on the flat 0/1 labels; train/validate against the one-hot labels.
for train_index, test_index in kfold.split(x_group_train, y2_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_k_train, x_k_test = x_group_train[train_index], x_group_train[test_index]
    y_k_train, y_k_test = y_train[train_index], y_train[test_index]
    # A new network per fold so no weights carry over between folds.
    model = create_model()
    model.fit(x_k_train, y_k_train, nb_epoch=10, batch_size=32, validation_data=(x_k_test, y_k_test), callbacks=[])
    # Fold score (accumulated) ...
    p, r = validate(model, x_k_test, y_k_test)
    precision += p
    recall += r
    # ... and the score on the separate test meta-features (printed only).
    validate(model, x_group_test, y_group_test)
    
print("Precision: %.2f" % (precision / N))
print("Recall: %.2f" % (recall / N))

[1 1 0 ..., 1 0 1]
TRAIN: [    0     1     2 ..., 18233 18234 18235] TEST: [    3    13    15 ..., 18225 18228 18232]
Train on 14588 samples, validate on 3648 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.996158068057
0.995065789474
0.530671859786
0.450041288192
TRAIN: [    0     1     3 ..., 18233 18234 18235] TEST: [    2     8    11 ..., 18213 18214 18224]
Train on 14588 samples, validate on 3648 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.996162280702
0.996162280702
0.532807104095
0.445912469034
TRAIN: [    0     1     2 ..., 18232 18234 18235] TEST: [    6    10    12 ..., 18227 18229 18233]
Train on 14588 samples, validate on 3648 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.996712328767
0.99725877193
0.522765598651
0.511973575557
TRAIN: [    0     1     2

In [None]:
from sklearn.model_selection import StratifiedKFold

# Number of cross-validation folds.
N = 5

# Stratified, shuffled folds over the full balanced dataset.
kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=seed)

print(y2_full)

# Running sums of the per-fold scores, averaged after the loop.
precision = 0
recall = 0
# Stratify on the flat 0/1 labels; train/validate against the one-hot labels.
for train_index, test_index in kfold.split(x_full, y2_full):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Fold-local names: assigning to x_train / x_test / y_train / y_test here
    # would overwrite the notebook-wide train/test split that the earlier
    # cells (rf, gbc, gbm, stacking) read.
    x_k_train, x_k_test = x_full[train_index], x_full[test_index]
    y_k_train, y_k_test = y_full[train_index], y_full[test_index]
    # A new network per fold so no weights carry over between folds.
    model = create_baseline()
    model.fit(x_k_train, y_k_train, nb_epoch=100, batch_size=128, validation_data=(x_k_test, y_k_test), callbacks=[es])
    p, r = validate(model, x_k_test, y_k_test)
    precision += p
    recall += r

print("Precision: %.2f" % (precision / N))
print("Recall: %.2f" % (recall / N))