In [1]:
import pandas as pd

# Read the posts CSV (parsing `created_at` as datetimes) and immediately
# narrow the frame to the four columns kept for the rest of the notebook.
data = pd.read_csv(
    "./data/HN/HN_posts_year_to_Sep_26_2016.csv",
    parse_dates=['created_at'],
)[["id", "title", "created_at", "num_points"]]

In [2]:
from DataLoader import GloVe
import numpy as np

# Dimensionality of the word vectors; it selects the file and is passed to the loader.
WORD_DIM = 300

# Resolve the vector file for the chosen dimensionality, then load it.
glove_path = './data/GloVe/glove.840B.{}d.txt'.format(WORD_DIM)
glove = GloVe.load2(glove_path, WORD_DIM)


Start: Loading Glove Model
End: Loaded 2195884 rows.


In [4]:
# emb: Symbol to float32 of fixed DIMENSION
# Create an index mapping, index to symbol, symbol to index

class Embedding:
    """Index a ``symbol -> vector`` mapping and add three reserved symbols.

    The reserved symbols ``'<UNK>'`` (all 0.0), ``'<PAD>'`` (all 1.0) and
    ``'<GO>'`` (all -1.0) are added with the same dimension as the supplied
    vectors. Indices are assigned by sorted symbol order, so index 0 is the
    lexicographically smallest symbol, not a reserved one.

    NOTE: a later cell of this notebook runs ``from keras.layers import ...
    Embedding``, which rebinds the bare name ``Embedding`` in the notebook
    namespace; instances created beforehand keep working.

    Parameters
    ----------
    emb : dict
        Maps each symbol to a fixed-length sequence of floats. All entries
        are expected to share one dimension (not verified).
    verbose : bool
        When true, print a short summary after indexing.

    Raises
    ------
    ValueError
        If ``emb`` is empty, since the dimension cannot be inferred.
    """

    def __init__(self, emb, verbose = False):
        if not emb:
            raise ValueError('emb must contain at least one symbol to infer the dimension')

        # Shallow copy: the reserved symbols are added to our own mapping and
        # the caller's dict is left untouched (the vectors are not copied).
        self.emb = dict(emb)
        # Read one vector without materialising a list of every key.
        self.dim = len(next(iter(self.emb.values())))
        self.emb['<UNK>'] = [0.] * self.dim
        self.emb['<PAD>'] = [1.] * self.dim
        self.emb['<GO>'] = [-1.] * self.dim

        self.build_dicts()

        if verbose:
            self.describe()

    def describe(self):
        """Print the dimension, the symbol count and the first few index/symbol pairs."""
        # Cap the preview so small vocabularies do not raise KeyError.
        preview_size = min(10, len(self.idx2Sym))
        print('Embedding Dimension: {}'.format(self.dim))
        print('Embedding Symbols: {}'.format(len(self.emb)))
        print('Index to symbol: {}'.format([(i, self.idx2Sym[i]) for i in range(preview_size)]))

    def getIndex(self, symbol):
        """Return the index of ``symbol``, or the index of ``'<UNK>'`` if it is unknown."""
        return self.sym2Idx.get(symbol, self.sym2Idx['<UNK>'])

    def getEmb(self, symbol):
        """Return the vector of ``symbol``; unknown symbols get the ``'<UNK>'`` vector."""
        return self.emb[self.idx2Sym[self.getIndex(symbol)]]

    def getSymbols(self, indices):
        """Map an iterable of indices back to their symbols (KeyError on an unknown index)."""
        return [self.idx2Sym[idx] for idx in indices]

    def getNumpyArray(self):
        """Return a 2-D array whose row ``i`` is the vector of the symbol with index ``i``."""
        return np.array([self.emb[self.idx2Sym[idx]] for idx in range(len(self.emb))])

    def build_dicts(self):
        """(Re)build ``sym2Idx`` and ``idx2Sym`` from the sorted symbols of ``self.emb``."""
        self.sym2Idx = {symbol: index for index, symbol in enumerate(sorted(self.emb))}
        self.idx2Sym = {index: symbol for symbol, index in self.sym2Idx.items()}

# Index the loaded `glove` mapping (reserved symbols are added by the class)
# and print a short summary of the resulting vocabulary.
glove_emb = Embedding(glove, verbose=True)
# Dense matrix with one row per symbol: row i holds the vector of index i.
# With the vocabulary size reported below this is a very large array.
glove_np = glove_emb.getNumpyArray()

Embedding Dimension: 300
Embedding Symbols: 2195887
Index to symbol: [(0, '!'), (1, '!!'), (2, '!!!'), (3, '!!!!'), (4, '!!!!!'), (5, '!!!!!!'), (6, '!!!!!!!'), (7, '!!!!!!!!'), (8, '!!!!!!!!!'), (9, '!!!!!!!!!!')]


In [5]:
# Posts with num_points >= GOOD_THRESHOLD are labelled as the positive class
# by prepareData.
GOOD_THRESHOLD = 100
# Length every title index sequence is padded/truncated to; also used as the
# model's input length.
MAX_SEQUENCE_LENGTH = 60

In [6]:
import numpy as np

# Fixed seed for NumPy's global random number generator.
# NOTE(review): no other library's RNG is seeded in the cells visible here,
# so training runs may still differ between executions — confirm if needed.
seed = 7
np.random.seed(seed)

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Title column of the posts frame.
# NOTE(review): `title` is not referenced by the later cells visible here
# (prepareData reads the column itself) — possibly a leftover.
title = data["title"]

Using TensorFlow backend.


In [8]:
from TextPreprocess.Tokenizer.RegExp import tokenize

def preprocessStrings(strings):
    """Convert each string into the list of vocabulary indices of its tokens.

    Every string is split with ``tokenize``; each token is lower-cased and
    looked up through ``glove_emb.getIndex``. Returns one list of indices
    per input string, in input order.
    """
    encoded = []
    for text in strings:
        indices = []
        for token in tokenize(text):
            indices.append(glove_emb.getIndex(token.lower()))
        encoded.append(indices)
    return encoded

def prepareData(df):
    """Turn a posts frame into padded title index sequences and binary labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``"title"`` column (strings) and a ``"num_points"``
        column (numeric). Only this frame is read; the notebook-level
        ``data`` variable is not consulted.

    Returns
    -------
    x_train : numpy.ndarray
        One row per post, ``MAX_SEQUENCE_LENGTH`` columns of vocabulary
        indices produced by ``preprocessStrings`` and ``pad_sequences``.
    y_train : numpy.ndarray of int, shape (n, 2)
        One-hot labels: column 0 is ``num_points < GOOD_THRESHOLD``,
        column 1 is ``num_points >= GOOD_THRESHOLD``.
    y_original : numpy.ndarray of int, shape (n,)
        1 where ``num_points >= GOOD_THRESHOLD``, else 0.

    Date/time features (day of week, hour, month, day) were explored in an
    earlier revision of this function but are not part of the output.
    """
    num_points = df["num_points"].values

    # Both comparisons are evaluated independently so the two one-hot columns
    # are exactly what the element-wise definition above says.
    is_good = num_points >= GOOD_THRESHOLD
    y_original = is_good.astype(int)
    y_train = np.zeros((len(num_points), 2), dtype=int)
    y_train[:, 0] = num_points < GOOD_THRESHOLD
    y_train[:, 1] = is_good

    sequences = preprocessStrings(df["title"])
    # pad_sequences pads with 0 by default, but index 0 is an ordinary symbol
    # in this vocabulary (indices follow sorted symbol order). Pad with the
    # dedicated '<PAD>' index instead.
    x_train = pad_sequences(
        sequences,
        maxlen=MAX_SEQUENCE_LENGTH,
        value=glove_emb.getIndex('<PAD>'),
    )

    return x_train, y_train, y_original


In [9]:
# Sanity check: list the columns currently present in `data`.
print(data.columns)

Index(['id', 'title', 'created_at', 'num_points'], dtype='object')


In [10]:
# Encode the whole data set. The second return value (the one-hot labels) is
# discarded; only the padded index sequences and the 0/1 label vector are kept.
X_full, _, y_full = prepareData(data)
# Earlier variant that encoded separate train/test frames:
#x_train, y_train, y2_train = prepareData(train)
#x_test, y_test, y2_test = prepareData(test)

# Expect (number of posts, MAX_SEQUENCE_LENGTH).
print(X_full.shape)

(293119, 60)


In [None]:
from keras.layers import Input, Convolution1D, MaxPooling1D, Dense, Flatten, Dropout, Embedding,BatchNormalization
from keras.models import Model, Sequential
from keras.regularizers import l2

def create_model():
    """Build and compile a small dense binary classifier.

    The network takes 5 input features, has one hidden ``Dense(10)`` layer
    and a single sigmoid output unit. It is not called by the other cells
    visible in this notebook.

    Returns
    -------
    A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(10, input_shape=(5,)))
    # A softmax over a single unit is constantly 1.0 and categorical
    # cross-entropy on one column gives no useful gradient, so a one-unit
    # output needs sigmoid + binary cross-entropy to be trainable.
    model.add(Dense(1, activation='sigmoid'))
    # NOTE(review): the string metric 'precision' is not accepted by every
    # Keras release — confirm against the installed version.
    model.compile(optimizer='rmsprop',
          loss='binary_crossentropy',
          metrics=['accuracy', 'precision'])
    return model
    

def create_baseline(dropout=0, branching=5):
    """Build and compile the baseline title classifier.

    Architecture: frozen embedding layer initialised from ``glove_np`` ->
    flatten -> three ``Dense(tanh)`` stages of 3000, 1500 and 750 units, each
    followed by batch normalisation and dropout -> one sigmoid output unit.
    Compiled with binary cross-entropy, the Adam optimizer and the
    ``binary_accuracy`` metric.

    Parameters
    ----------
    dropout : float
        Dropout rate applied after every dense stage.
    branching : int
        Currently unused. It was the kernel size of convolutional layers
        that have since been removed; kept so existing calls keep working.

    Returns
    -------
    A compiled Keras ``Model`` mapping ``(MAX_SEQUENCE_LENGTH,)`` int32 index
    sequences to a single probability.
    """
    # Import under an alias inside the function: this notebook also defines
    # its own class named `Embedding`, so the bare name may refer to either
    # one depending on which cell ran last.
    from keras.layers import Embedding as KerasEmbedding

    embedding_layer = KerasEmbedding(glove_np.shape[0],
                                     glove_np.shape[1],
                                     weights=[glove_np],
                                     input_length=MAX_SEQUENCE_LENGTH,
                                     trainable=False)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    x = Flatten()(embedded_sequences)

    # Three identical dense stages of decreasing width.
    for units in (3000, 1500, 750):
        x = Dense(units, activation='tanh')(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout)(x)

    preds = Dense(1, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])
    return model

In [None]:
from keras.callbacks import EarlyStopping
# Early-stopping callback monitoring the validation loss with a patience of
# 3 epochs. It only takes effect if it is passed to `fit` via `callbacks=`.
es = EarlyStopping('val_loss', patience=3)

In [13]:
# Weight the classes so both contribute equally to the total loss:
#   w_0 * c_0 = w_1 * c_1  with  w_0 = 1   =>   w_1 = c_0 / c_1
# where c_0 / c_1 are the counts of negative / positive labels in y_full.

# np.sum is a single vectorised pass; the builtin sum() would iterate the
# array element by element in Python.
n_positive = int(np.sum(y_full))
n_negative = len(y_full) - n_positive
if n_positive == 0:
    raise ValueError('y_full contains no positive labels; cannot derive class weights')

class_weight = {
    0: 1,
    1: n_negative / n_positive
}

print(class_weight)

{0: 1, 1: 24.400259965337955}


In [14]:
from keras_tqdm import TQDMNotebookCallback

BATCH_SIZE = 1024
DROPOUT = 0.1
# Passed through to create_baseline, which currently ignores it.
BRANCHING = 4

model = create_baseline(DROPOUT, BRANCHING)
# NOTE(review): per the Keras docs, validation_split holds out the *last* 20%
# of the rows before shuffling; if the CSV is ordered (e.g. by date) the
# validation set is not a random sample — confirm this is intended.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'dict' object has no attribute 'shape'"; the cause is not
# evident from this cell — check how the installed Keras handles class_weight.
model.fit(
    x=X_full,
    y=y_full,
    batch_size=BATCH_SIZE,
    epochs=200,  # upper bound; the early-stopping callback may end training sooner
    validation_split=0.2,
    shuffle=True,
    class_weight=class_weight,
    verbose=0,
    # `es` is the EarlyStopping callback created in an earlier cell; it was
    # defined but never handed to fit, so all 200 epochs always ran.
    callbacks=[es, TQDMNotebookCallback()],
)

AttributeError: 'dict' object has no attribute 'shape'