# Baseline method
BERT get sentence embeddings, then use a dense layer and softmax to output directly

## Import files to workspace

In [1]:
import pandas as pd 
import numpy as np 

use_dev_set = True

# load original data file
original_data = pd.read_csv('./data/train_tweets.txt', sep='\t', header=None)

train = pd.read_csv('./data/train_set_v1.txt', sep='\t', header=None)
dev = pd.read_csv('./data/dev_set_v1.txt', sep='\t', header=None)

In [2]:

# load embeddings
x_train = np.load('./data/BERT_encoding/train_encode.npy')
x_dev = np.load('./data/BERT_encoding/dev_encode.npy')

if not use_dev_set:
    x_test = np.load('./data/BERT_encoding/test_encode.npy')

In [3]:
train.head(5)

Unnamed: 0,0,1
0,5592,Do you enjoy the games you are playing in life...
1,8940,Most fun part of owning team. Charging court a...
2,5149,"WSJ: Fannie, Freddie Woes Hurt Apartments #rea..."
3,4562,"45% of execs, managers spend 3+ hours a day us..."
4,8061,loves audiobooks


In [4]:
dev.head(5)

Unnamed: 0,0,1
0,8363,"Or, one of these? This @handle shot top 10 wor..."
1,8687,Blog Post: Chris Brown Speaking to MTV
2,2228,No Vegas for me either. Although it will proba...
3,3088,Salvation Army Expecting to Help More During H...
4,1013,"Bummed about the Charger loss, SD? Come down t..."


In [5]:
print(len(train))
print(len(dev))

print(x_train.shape)
print(x_dev.shape)
print(x_test.shape)


291862
36124
(291862, 1024)
(36124, 1024)


NameError: name 'x_test' is not defined

## Construct projection dictionary
Construct the user_id - class_id hash dictionary, for softmax

In [6]:
len(original_data)

328195

In [7]:
projection_dict = {}
inverse_projection_dict = {}
counter = 0
for row in train.itertuples():
    if row[1] not in projection_dict.keys():
        projection_dict.update({
            row[1]: counter
        })
        inverse_projection_dict.update({
            counter: row[1]
        })
        counter += 1
        

In [8]:
train_class_id = []
for row in train.itertuples():
    class_id = int(projection_dict.get(row[1]))
    train_class_id.append(class_id)
                   
dev_class_id = []
for row in dev.itertuples():
    dev_class_id.append(str(projection_dict.get(row[1])))

In [9]:
train['class_id'] = train_class_id
dev['class_id'] = dev_class_id

y_train = train['class_id']
y_dev = dev['class_id']

In [10]:
# change training target to one_hot
nb_classes = len(projection_dict)
targets = np.array([y_train]).reshape(-1)
y_train_one_hot = np.eye(nb_classes)[targets]

In [11]:
y_train_one_hot

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Constrtuct Keras MLP model

In [13]:
import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import Adam


seed = 7
np.random.seed(seed)


model = Sequential()
model.add(Dense(units=1024, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(units=512, activation='relu', input_dim=x_train.shape[1]))
model.add(Dropout(0.3))
model.add(Dense(units=len(projection_dict), activation='softmax'))
# optimizer = Adam(lr=0.01)
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam', metrics=['accuracy'])

model.summary()


# callbacks
filepath="best_weights_tough_head.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
earlyStopping = EarlyStopping(monitor='val_acc', patience=2, verbose=0, mode='auto')

callbacks_list = [checkpoint, earlyStopping]

# 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_5 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 9266)              4753458   
Total params: 6,327,858
Trainable params: 6,327,858
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(x=x_train, y=y_train_one_hot, batch_size=32, epochs=50, validation_split=0.1, callbacks=callbacks_list)

Instructions for updating:
Use tf.cast instead.
Train on 262675 samples, validate on 29187 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.03669, saving model to best_weights_tough_head.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.03669 to 0.05492, saving model to best_weights_tough_head.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.05492 to 0.06702, saving model to best_weights_tough_head.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.06702 to 0.06948, saving model to best_weights_tough_head.hdf5
Epoch 5/50

Epoch 00005: val_acc improved from 0.06948 to 0.07507, saving model to best_weights_tough_head.hdf5
Epoch 6/50

Epoch 00006: val_acc improved from 0.07507 to 0.07863, saving model to best_weights_tough_head.hdf5
Epoch 7/50

Epoch 00007: val_acc improved from 0.07863 to 0.08291, saving model to best_weights_tough_head.hdf5
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.08291
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.08

<keras.callbacks.History at 0x7f8fa53149e8>

## Apply keras model

In [15]:
# load from file
model.load_weights("best_weights_tough_head.hdf5")
if use_dev_set:
    y_test = model.predict(x_dev, batch_size=128, verbose=1)
else:
    y_test = model.predict(x_test, batch_size=128, verbose=1)
y_test


# for i in range(len(test_df)):
#     if np.argmax(y_test[i]) == 0:
#         label = "NOT ENOUGH INFO"
# #         test_df['evidence'][i] = []
#     elif np.argmax(y_test[i]) == 1:
#         label = "REFUTES"
#     else:
#         label = "SUPPORTS"
#     key = test_df['key'][i]
#     result_dict.update({
#         key:{
#             "claim": test_df['claim'][i],
#             "label": label,
#             "evidence": test_df['evidence'][i]
#         }
#     })
    
# with open('dev_top7_test.json', 'w') as outfile:
#     json.dump(result_dict, outfile, indent=4)




array([[2.52128229e-04, 4.78287664e-04, 1.03502953e-06, ...,
        4.78794334e-07, 1.42071102e-08, 1.39120191e-08],
       [1.27373085e-06, 8.65117784e-07, 7.70490715e-07, ...,
        1.09587525e-07, 2.42102693e-08, 2.21087131e-08],
       [3.53152209e-05, 6.06618472e-04, 2.42241953e-07, ...,
        3.62016195e-08, 2.07881556e-09, 2.00881356e-09],
       ...,
       [3.01541422e-05, 1.62081251e-05, 5.67836356e-09, ...,
        5.43668897e-08, 2.98618801e-08, 2.67783307e-08],
       [1.45513541e-05, 5.93461709e-06, 2.71375598e-08, ...,
        1.38389865e-07, 3.00772229e-09, 2.87059065e-09],
       [1.93709257e-05, 1.10128494e-04, 5.90464151e-05, ...,
        1.42818153e-05, 3.41318014e-08, 3.53840868e-08]], dtype=float32)

In [16]:
y_test.shape

(36124, 9266)

In [17]:
predict_result = list(np.argmax(y_test, axis=1))

In [18]:
projected_result = []
for r in predict_result:
    projected_result.append(inverse_projection_dict.get(r))

In [19]:
np.save("./data/result/BERT_MLP_DEV", projected_result)