In [11]:
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [12]:
# YOU MUST SET THIS TO THE ROOT DATA DIRECTORY FROM THE DOWNLOADED ZIP FILE
DATA_DIR = "./data"

## Data loading and pre-processing....

In [13]:
# loading handcrafted features
meta_features = {}
meta_labels = {}
with open(DATA_DIR+"/detailed_data/handcrafted_features.tsv") as fp:
    for line in fp:
        info = line.split()
        meta_features[info[0]] = np.array(map(float, info[-1].split(",")))
        meta_labels[info[0]] = 1 if info[1] == "burst" else 0

In [14]:
# loading the user, source, and target community embeddings for all examples
with open(DATA_DIR + "/detailed_data/full_ids.txt") as fp:
    ids = {id.strip():i for i, id in enumerate(fp.readlines())}
all_embeds = np.load(open(DATA_DIR + "/detailed_data/full_embeds.npy", 'rb'))

In [16]:
# loading the post embeddings from the LSTM 
lstm_embeds = np.load(open(DATA_DIR + "/detailed_data/lstm_embeds.npy", 'rb'))
lstm_ids = pickle.load(open(DATA_DIR + "/detailed_data/lstm_embeds-ids.pkl", 'rb'))
lstm_ids = {id:i for i, id in enumerate(lstm_ids)}

In [18]:
# loading preprocessed lstm data to ensure identical train/val/test splits
train_data = pickle.load(open(DATA_DIR + "/preprocessed_train_data.pkl",'rb'))
val_data = pickle.load(open(DATA_DIR + "/preprocessed_val_data.pkl", 'rb'))
test_data = pickle.load(open(DATA_DIR + "/preprocessed_test_data.pkl", 'rb'))

  train_data = pickle.load(open(DATA_DIR + "/preprocessed_train_data.pkl",'rb'))


In [35]:
# flattening the preprocessed LSTM data (no need for minibatching here....)
def flatten(data):
    ids, text, users, subreddits, lengths,sfs, labels = [], [], [], [], [], [], []
    for batch in data:
        bids, btext, busers, bsubreddits, blengths, bsfs, blabels = batch
        ids.extend(bids)
        text.extend(btext.numpy().tolist())
        users.extend(busers.numpy().tolist())
        subreddits.extend(bsubreddits.numpy().tolist())
        lengths.extend(blengths)
        labels.extend(blabels)
        sfs.extend(bsfs)
    return (ids, text, users, subreddits, lengths, labels)
print(train_data)
flat_train_data = flatten(train_data)
flat_val_data = flatten(val_data)
flat_test_data = flatten(test_data)

[([b'24ga5l', b'45394y', b'4xtquo', b'2khwmo', b'3o4ciu', b'337r83', b'35rj34', b'1x8t83', b'3xgjdc', b'5uxs6c', b'2gus55', b'3pc1sp', b'3m6me1', b'3igvek', b'1wsirv', b'5o85n6', b'21l9ax', b'4qi0mv', b'1y4va6', b'4l6slu', b'2g2b8g', b'469ddu', b'575i3t', b'3tfywb', b'4euyok', b'5mioit', b'5fxg6d', b'42v30g', b'64swie', b'20l080', b'3ed5bo', b'315uvr', b'4x5rwh', b'53svlq', b'4jtuqw', b'53tc06', b'3ddh5c', b'4d408e', b'3hck37', b'4eiv1c', b'3aq1n7', b'44wupw', b'3mcib8', b'3yeop5', b'2ef5hh', b'26bq7m', b'5gv7y1', b'1wnnp5', b'55trbd', b'2dc4qp', b'3h3907', b'4r7l77', b'49jd50', b'4768pm', b'3omkp3', b'3tt6a2', b'34bvmq', b'4ord0q', b'2kaq9r', b'3du3ck', b'5x1z8y', b'64sghr', b'2swpnv', b'4jsn6x', b'5kxr06', b'56jkh2', b'2q7qj0', b'2lmxj6', b'4wv96j', b'3jg2la', b'5a67ml', b'4fi5or', b'1x772b', b'4qch4a', b'3c1zx8', b'5a8fen', b'51mlqa', b'25arxi', b'2do7n1', b'4a2x8j', b'2ni0nm', b'28avh9', b'1v9dsf', b'65i33v', b'3ydq2l', b'53q3zp', b'3sz3zv', b'32favv', b'2ve4w4', b'3ae57r', b'5g276

In [33]:
print(flat_train_data[0])
print(flat_val_data[0])
print(flat_test_data[0])
print(meta_features.keys())
print(all_embeds)
print(lstm_embeds)

# train_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in flat_train_data[0]])
# val_X =  np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in flat_val_data[0] if i in meta_features])
# test_X =  np.stack([np.concatenate([meta_features[i],all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in flat_test_data[0] if i in meta_features])

# train_Y = np.stack([meta_labels[i] for i in flat_train_data[0] if i in meta_features])
# val_Y =  np.stack([meta_labels[i] for i in flat_val_data[0] if i in meta_features])
# test_Y =  np.stack([meta_labels[i] for i in flat_test_data[0] if i in meta_features])

[b'24ga5l', b'45394y', b'4xtquo', b'2khwmo', b'3o4ciu', b'337r83', b'35rj34', b'1x8t83', b'3xgjdc', b'5uxs6c', b'2gus55', b'3pc1sp', b'3m6me1', b'3igvek', b'1wsirv', b'5o85n6', b'21l9ax', b'4qi0mv', b'1y4va6', b'4l6slu', b'2g2b8g', b'469ddu', b'575i3t', b'3tfywb', b'4euyok', b'5mioit', b'5fxg6d', b'42v30g', b'64swie', b'20l080', b'3ed5bo', b'315uvr', b'4x5rwh', b'53svlq', b'4jtuqw', b'53tc06', b'3ddh5c', b'4d408e', b'3hck37', b'4eiv1c', b'3aq1n7', b'44wupw', b'3mcib8', b'3yeop5', b'2ef5hh', b'26bq7m', b'5gv7y1', b'1wnnp5', b'55trbd', b'2dc4qp', b'3h3907', b'4r7l77', b'49jd50', b'4768pm', b'3omkp3', b'3tt6a2', b'34bvmq', b'4ord0q', b'2kaq9r', b'3du3ck', b'5x1z8y', b'64sghr', b'2swpnv', b'4jsn6x', b'5kxr06', b'56jkh2', b'2q7qj0', b'2lmxj6', b'4wv96j', b'3jg2la', b'5a67ml', b'4fi5or', b'1x772b', b'4qch4a', b'3c1zx8', b'5a8fen', b'51mlqa', b'25arxi', b'2do7n1', b'4a2x8j', b'2ni0nm', b'28avh9', b'1v9dsf', b'65i33v', b'3ydq2l', b'53q3zp', b'3sz3zv', b'32favv', b'2ve4w4', b'3ae57r', b'5g276w'

## Running the models

#### Baseline model 

In [20]:
# First we run the Random Forest with only the metadata/handcrafted features...
baseline_mod = RandomForestClassifier(n_estimators=500, n_jobs=100, random_state=0)
# note that the first 263 features are the handcrafted ones... 
baseline_mod.fit(train_X[:, :263], train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=100,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [21]:
# For reference, on the authors server we get 0.682
print roc_auc_score(val_Y, baseline_mod.predict_proba(val_X[:, :263])[:,1])

0.6823965206409768


In [22]:
# For reference, on the authors server we get 0.667
print roc_auc_score(test_Y, baseline_mod.predict_proba(test_X[:, :263])[:,1])

0.6656650084718871


In [23]:
# First we run the Random Forest with only the metadata/handcrafted features...
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=100, random_state=0)
# note that the first 263 features are the handcrafted ones... 
ensemble_mod.fit(train_X[:, :], train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=100,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [24]:
# For reference, on the authors server we get 0.765
print roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])

0.7648275951540929


In [25]:
# For reference, on the authors server we get 0.756
print roc_auc_score(test_Y, ensemble_mod.predict_proba(test_X[:, :])[:,1])

0.7564078921461626
