# Baseline method
Use BERT to obtain sentence embeddings, then feed them through dense layers with a softmax output to predict the class directly.

## Import files to workspace

In [42]:
import pandas as pd 
import numpy as np 

# Every input file is tab-separated with no header row, so the read options
# are shared through one small helper.
# NOTE(review): the files hold free-form tweet text; pandas' default quote
# handling may merge rows that contain unbalanced quote characters — confirm
# the row counts against the raw files.
def _read_tsv(path):
    """Read a header-less, tab-separated file into a DataFrame with integer column labels."""
    return pd.read_csv(path, sep='\t', header=None)


# load original data file
original_data = _read_tsv('./data/train_tweets.txt')

# train / dev splits (version 1)
train = _read_tsv('./data/train_set_v1.txt')
dev = _read_tsv('./data/dev_set_v1.txt')


In [79]:

# Load the pre-computed embedding matrices for the train and dev splits
# (one row per tweet; the shapes are printed further down).
x_train, x_dev = (
    np.load(encoding_path)
    for encoding_path in (
        './data/BERT_encoding/train_encode.npy',
        './data/BERT_encoding/dev_encode.npy',
    )
)

In [44]:
# Preview the first five training rows (DataFrame.head defaults to n=5).
train.head()

Unnamed: 0,0,1
0,5592,Do you enjoy the games you are playing in life...
1,8940,Most fun part of owning team. Charging court a...
2,5149,"WSJ: Fannie, Freddie Woes Hurt Apartments #rea..."
3,4562,"45% of execs, managers spend 3+ hours a day us..."
4,8061,loves audiobooks


In [45]:
# Preview the first five dev rows (DataFrame.head defaults to n=5).
dev.head()

Unnamed: 0,0,1
0,8363,"Or, one of these? This @handle shot top 10 wor..."
1,8687,Blog Post: Chris Brown Speaking to MTV
2,2228,No Vegas for me either. Although it will proba...
3,3088,Salvation Army Expecting to Help More During H...
4,1013,"Bummed about the Charger loss, SD? Come down t..."


In [46]:
# Sanity check: report the number of tweets and the embedding shape per split.
print(len(train))
print(len(dev))

# The embedding matrices are loaded as `x_train` / `x_dev`; the names
# `train_embedding` / `dev_embedding` are never defined in this notebook, so
# referring to them fails on a fresh kernel.
print(x_train.shape)
print(x_dev.shape)

# Each tweet must have exactly one embedding row, otherwise features and
# labels would be misaligned during training.
assert x_train.shape[0] == len(train), "train embeddings / rows mismatch"
assert x_dev.shape[0] == len(dev), "dev embeddings / rows mismatch"


291862
36124
(291862, 768)
(36124, 768)


## Construct projection dictionary
Build the user_id → class_id lookup dictionary (and its inverse), so that every user corresponds to exactly one softmax output class.

In [135]:
# Number of rows in the original (unsplit) tweet file.
original_data.shape[0]

328195

In [139]:
# Assign each distinct value of the first column of `train` a consecutive
# class index, in order of first appearance, and keep the reverse lookup so a
# predicted class index can be mapped back to the original id.
projection_dict = {}
inverse_projection_dict = {}
counter = 0
for user_id in train.iloc[:, 0]:
    if user_id in projection_dict:
        continue  # id already has a class index
    projection_dict[user_id] = counter
    inverse_projection_dict[counter] = user_id
    counter += 1
        

In [140]:
# Translate the id in the first column of each split into its class index.
#
# Both lists hold plain ints so that train and dev labels are directly
# comparable (previously the dev labels were strings, and ids missing from the
# training set became the literal string "None").

# Sentinel for dev ids that never occur in `train` and therefore have no class.
UNKNOWN_CLASS_ID = -1

# Every training id is in `projection_dict` by construction, so index directly;
# a KeyError here would mean the dictionary was built from different data.
train_class_id = [projection_dict[user_id] for user_id in train.iloc[:, 0]]

dev_class_id = [
    projection_dict.get(user_id, UNKNOWN_CLASS_ID)
    for user_id in dev.iloc[:, 0]
]

In [142]:
# Store the class indices next to the raw rows of each split ...
for frame, labels in ((train, train_class_id), (dev, dev_class_id)):
    frame['class_id'] = labels

# ... and expose them as the target vectors.
y_train, y_dev = train['class_id'], dev['class_id']

In [148]:
# change training target to one_hot
nb_classes = len(projection_dict)

# Flat integer vector of class indices, one entry per training row.
targets = np.asarray(y_train, dtype=np.int64).reshape(-1)

# Row i of the identity matrix is the one-hot vector of class i, so indexing it
# with `targets` yields an array of shape (n_samples, nb_classes).
# np.eye defaults to float64; with the sizes printed in this notebook
# (291862 rows x 9266 classes) that is about 21.6 GB. float32 halves the
# footprint while the values stay exactly 0.0 / 1.0.
y_train_one_hot = np.eye(nb_classes, dtype=np.float32)[targets]

## Construct Keras MLP model

In [153]:
import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import Adam


# Seed NumPy's global RNG.
# NOTE(review): this does not seed the Keras backend's own random generator,
# so weight initialisation and dropout may still vary between runs — confirm.
seed = 7
np.random.seed(seed)


# MLP on top of the pre-computed embeddings:
# 2048 -> 1024 -> dropout -> softmax over all classes.
model = Sequential()
# Only the first layer needs the input size. The second layer previously passed
# `input_dim` as well, but its parameter count in the summary
# (2048 * 1024 + 1024 = 2,098,176) shows the argument had no effect.
model.add(Dense(units=2048, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(units=1024, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=len(projection_dict), activation='softmax'))
# optimizer = Adam(lr=0.01)
# The metric is requested as 'acc' so that the logged keys are 'acc' /
# 'val_acc', which is what the callbacks below monitor. With 'accuracy', newer
# Keras releases log 'val_accuracy' instead and the callbacks would never see
# the monitored value.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam', metrics=['acc'])

model.summary()


# callbacks
# Keep only the weights of the best epoch (by validation accuracy) and stop
# once validation accuracy has not improved for two consecutive epochs.
filepath="best_weights_tough_head.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
earlyStopping = EarlyStopping(monitor='val_acc', patience=2, verbose=0, mode='max')

callbacks_list = [checkpoint, earlyStopping]

# 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 2048)              1574912   
_________________________________________________________________
dense_14 (Dense)             (None, 1024)              2098176   
_________________________________________________________________
dropout_5 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_15 (Dense)             (None, 9266)              9497650   
Total params: 13,170,738
Trainable params: 13,170,738
Non-trainable params: 0
_________________________________________________________________


In [154]:
# Train on the embeddings against the one-hot targets; 10% of the training data
# is held out for validation, which feeds the checkpoint / early-stopping
# callbacks.
fit_options = dict(
    batch_size=32,
    epochs=50,
    validation_split=0.1,
    callbacks=callbacks_list,
)
model.fit(x_train, y_train_one_hot, **fit_options)

Train on 262675 samples, validate on 29187 samples
Epoch 1/50
  2592/262675 [..............................] - ETA: 29:10 - loss: 9.1403 - acc: 0.0000e+00

KeyboardInterrupt: 

## Apply keras model

In [None]:
# NOTE(review): this cell is entirely commented out. The names it refers to
# (`x_test`, `test_df`, `result_dict`, `json`) are not defined or imported in
# the visible part of this notebook, and it loads "best_weights_head.hdf5"
# whereas the checkpoint callback above writes "best_weights_tough_head.hdf5".
# Its three-way labels (NOT ENOUGH INFO / REFUTES / SUPPORTS) do not correspond
# to the user-id classes built above — presumably left over from another task;
# confirm and adapt before re-enabling.
# # load from file
# model.load_weights("best_weights_head.hdf5")
# y_test = model.predict(x_test, batch_size=128, verbose=1)
# y_test


# for i in range(len(test_df)):
#     if np.argmax(y_test[i]) == 0:
#         label = "NOT ENOUGH INFO"
# #         test_df['evidence'][i] = []
#     elif np.argmax(y_test[i]) == 1:
#         label = "REFUTES"
#     else:
#         label = "SUPPORTS"
#     key = test_df['key'][i]
#     result_dict.update({
#         key:{
#             "claim": test_df['claim'][i],
#             "label": label,
#             "evidence": test_df['evidence'][i]
#         }
#     })
    
# with open('dev_top7_test.json', 'w') as outfile:
#     json.dump(result_dict, outfile, indent=4)
