# ELECTRA Fine-Tuned + GLoVe Embeddings

## 0. Taking a Look at the Data

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('train.csv')
data[2010:].head(20)

Unnamed: 0,id,keyword,location,text,target
2010,2886,damage,,Beach did damage to my shit,1
2011,2887,damage,,@WonderousAllure crosses her arms to cover her...,0
2012,2888,damage,,New post on my blog: http://t.co/Avu9b4k2rv \n...,0
2013,2889,damage,Charlotte NC,REPORTED: HIT &amp; RUN-IN ROADWAY-PROPERTY DA...,1
2014,2890,damage,,Devil May Cry 4 Special Edition Vergil Vs Agnu...,0
2015,2891,damage,"Rockville, Maryland",#Glaucoma occurs when fluid builds up pressure...,1
2016,2893,damage,,#JSunNews Storm damage reported in Madison Cou...,1
2017,2895,damage,,S61.231A Puncture wound without foreign body o...,1
2018,2896,damage,Australia,Thank you @RicharkKirkArch @AusInstArchitect f...,0
2019,2898,damage,Your Conversation,This real shit will damage a bitch,0


---
### NOTE
By looking at the data, we can see the *location* data is actually very useful,  
because a serious Tweet will generally have a proper location,      
whereas a jokeful Tweet will have something silly.  
So let's keep the *location* data.  

## 1. Pre-Processing

In [2]:
# let's apply regex to clean the strings

data['text'] = data['text'].str.replace('http\S+', 'url', regex=True)  # replace all URLs with 'url'
data['text'] = data['text'].str.replace('&\S+', '', regex=True)  # remove all html junks ex) &amp;
data['text'] = data['text'].str.replace('[0-9]','0', regex=True)  # replace all integer values with 0
data['text'] = data['text'].str.replace('[^a-zA-Z0 ]', ' ', regex=True)  # replace all non-numerics, non-alphabets with space
data['text'] = data['text'].str.lower()

data['location'] = data['location'].str.replace('http\S+', 'http', regex=True)
data['location'] = data['location'].str.replace('&\S+', '', regex=True)
data['location'] = data['location'].str.replace('[0-9]','0', regex=True)
data['location'] = data['location'].str.replace('[^a-zA-Z0 ]', ' ', regex=True)
data['location'] = data['location'].str.lower()
data['location'] = data['location'].fillna('')  # replace NaN values with empty string

data[2010:].head(20)

Unnamed: 0,id,keyword,location,text,target
2010,2886,damage,,beach did damage to my shit,1
2011,2887,damage,,wonderousallure crosses her arms to cover her...,0
2012,2888,damage,,new post on my blog url thesensualeye mode...,0
2013,2889,damage,charlotte nc,reported hit run in roadway property damage ...,1
2014,2890,damage,,devil may cry 0 special edition vergil vs agnu...,0
2015,2891,damage,rockville maryland,glaucoma occurs when fluid builds up pressure...,1
2016,2893,damage,,jsunnews storm damage reported in madison cou...,1
2017,2895,damage,,s00 000a puncture wound without foreign body o...,1
2018,2896,damage,australia,thank you richarkkirkarch ausinstarchitect f...,0
2019,2898,damage,your conversation,this real shit will damage a bitch,0


In [3]:
from nltk.stem import WordNetLemmatizer

lemm = WordNetLemmatizer()

def lemmatize(pd_series): # lemmatize words to a more general term
    ret = []
    for entry in pd_series:
        sent = ''
        words = entry.split()
        for word in words:
            sent = sent + lemm.lemmatize(word) + ' '
        ret.append(sent[:-1])
    return np.asarray(ret)

lemmatize(['Kevin drinks', 'Yoon lives again'])  # some examples

array(['Kevin drink', 'Yoon life again'], dtype='<U15')

In [4]:
data['text'] = lemmatize(data['text'])
data['location'] = lemmatize(data['location'])
data[2010:].head(20)

Unnamed: 0,id,keyword,location,text,target
2010,2886,damage,,beach did damage to my shit,1
2011,2887,damage,,wonderousallure cross her arm to cover her han...,0
2012,2888,damage,,new post on my blog url thesensualeye model ca...,0
2013,2889,damage,charlotte nc,reported hit run in roadway property damage at...,1
2014,2890,damage,,devil may cry 0 special edition vergil v agnus...,0
2015,2891,damage,rockville maryland,glaucoma occurs when fluid build up pressure i...,1
2016,2893,damage,,jsunnews storm damage reported in madison coun...,1
2017,2895,damage,,s00 000a puncture wound without foreign body o...,1
2018,2896,damage,australia,thank you richarkkirkarch ausinstarchitect for...,0
2019,2898,damage,your conversation,this real shit will damage a bitch,0


In [5]:
# shuffle the data, make sure to set the random_state(the seed for shuffling) to a fixed integer,
# so the validation set can have the same data for proper validation every time you run the code. 
data = data.sample(frac=1, random_state=1).reset_index(drop=True)  
data[2010:].head(20)

Unnamed: 0,id,keyword,location,text,target
2010,8431,sandstorm,usa,watch this airport get swallowed up by a sands...,1
2011,1485,body%20bags,westside of philly 0 block,ain t no bag in the trunk it s a body,0
2012,704,attacked,0 0 of the blam squad,i m feeling so attacked url,0
2013,5580,flood,new york,0pcs 00w cree led work light offroad lamp car ...,1
2014,3957,devastation,newport wale uk,cllrraymogford indeed ray devastation would be...,1
2015,7694,panic,torry alvarez love forever,panic at the disco te amo,0
2016,10731,wreck,canada bc,raineishida lol im just a nervous wreck p,0
2017,5035,eyewitness,india,read a schoolboy s eyewitness account of hiros...,1
2018,10260,war%20zone,,they turned jasmine house into a war zone litt...,0
2019,9600,thunder,macon ga,thunder outside my house this afternoon gawx,1


## 2. Tokenizing and Embedding

---
### NOTE
We are going to tokenize the main text data with ELECTRA Tokenzier, so we can actually feed the data into ELECTRA.  
We are only going to use GLoVe embeddings on location texts, because they are shorter, and the sequence of the words are less important.  
i.e. we only need to know if the words are actually describing a real location or not.

In [6]:
from transformers import ElectraTokenizer
# Huggingface Transformers ELECTRA: https://huggingface.co/transformers/model_doc/electra.html

# tokenize the main text data with ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
texts = data['text'].tolist()
texts = tokenizer(texts, truncation=True, padding=True)  # pads to longest sentence, but truncates if it exceeds max_len of the specific model, in ELECTRA's case 512

token_input = np.asarray(texts['input_ids'])  # sentence with words converted to token
mask_input = np.asarray(texts['attention_mask'])  # masking: if token is <pad> then 0, else 1

print(token_input.shape)  # in our case 53 is the longest sentence
print(mask_input.shape)

emb_len = token_input.shape[-1]  # save 53 for future use

(7613, 53)
(7613, 53)


In [7]:
import pickle

with open('glove.pickle', 'rb') as p:
    glove_dict = pickle.load(p)  # dictionary containing pre-trained GLoVe, with 100 dimension vector representing a word

# Stanford GLoVe : https://nlp.stanford.edu/projects/glove/

def gloveAndPad(pd_series, max_length):  # converts words in sentence to GLoVe vectors, and pads to meet the max_length, truncs if longer
    ret = []
    for entry in pd_series:
        emb_sent = []
        words = entry.split()
        for word in words:
            try:  # if the word is in the GLoVe data
                emb_sent.append(glove_dict[word])
            except KeyError:
                pass
        while len(emb_sent) < max_length:  # pad to max_length
            emb_sent.append(np.zeros(100, dtype='float32'))
        ret.append(np.asarray(emb_sent)[:max_length])  # trunc to max_length
    return np.asarray(ret)

In [8]:
config = {
    'loc_max_len': 5,  # I picked 5 as max length for location but you can try out different lengths
    'glove_emb_dim' : 100
}

In [9]:
loc_input = gloveAndPad(data['location'], config['loc_max_len'])

labels = np.asarray(data['target']) # prepare the label as well

print(loc_input.shape)
print(labels.shape)

(7613, 5, 100)
(7613,)


## 3. Modeling

In [10]:
from transformers import logging

logging.set_verbosity_error() 
# this removes warnings from the transformers module, which occurs because we are loading the discriminator weights on a ordinary Model class,
# and therefore some of the weights that are only used during pre-training the discriminator doesn't have a place to load.
# However, this is a normal behavior.

In [11]:
from transformers import TFElectraModel
import tensorflow as tf

# three inputs
input_tokens = tf.keras.layers.Input(shape=(emb_len, ), name='tokens', dtype='int32')
input_masks = tf.keras.layers.Input(shape=(emb_len, ), name='masks', dtype='int32')
input_locs = tf.keras.layers.Input(shape=(config['loc_max_len'], config['glove_emb_dim']), name='locs', dtype='float32')

electra_model = TFElectraModel.from_pretrained('google/electra-base-discriminator')
electra_output = electra_model(input_tokens, attention_mask=input_masks).last_hidden_state  # we only need the last hidden outputs

x1 = tf.keras.layers.GlobalAveragePooling1D()(electra_output)
x1 = tf.keras.layers.Dense(256, activation='relu')(x1)
x1 = tf.keras.layers.Dropout(0.2)(x1)

x2 = tf.keras.layers.GlobalAveragePooling1D()(input_locs)
x2 = tf.keras.layers.Dense(32, activation='relu')(x2)
x2 = tf.keras.layers.Dropout(0.2)(x2)

x = tf.keras.layers.Concatenate()([x1, x2])
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)

x = tf.keras.layers.Dense(16, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)

y = tf.keras.layers.Dense(1, activation='sigmoid')(x) # output

model = tf.keras.models.Model(inputs=[input_tokens, input_masks, input_locs], outputs=y)
adam = tf.keras.optimizers.Adam(learning_rate=2e-5)  # choose a low learning rate to fine-tune, a high learning rate will disrupt all the pre-trained weights

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tokens (InputLayer)             [(None, 53)]         0                                            
__________________________________________________________________________________________________
masks (InputLayer)              [(None, 53)]         0                                            
__________________________________________________________________________________________________
tf_electra_model (TFElectraMode TFBaseModelOutput(la 108891648   tokens[0][0]                     
                                                                 masks[0][0]                      
__________________________________________________________________________________________________
locs (InputLayer)               [(None, 5, 100)]     0                                 

## 4. Training

In [12]:
#model.fit(x={'tokens':token_input, 'masks':mask_input, 'locs':loc_input}, y=labels, epochs=10, batch_size=16, validation_split=0.2) # for validation
model.fit(x={'tokens':token_input, 'masks':mask_input, 'locs':loc_input}, y=labels, epochs=2, batch_size=16) # for actual training

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x2036d1c46a0>

---
### NOTE
You can test out different numbers of epochs and check the validaiton loss, in my case 2 epoch was the sweetspot.   
Generally fewer epochs will work well to fine-tune a transformer model 

## 5. Prediction

In [13]:
# Apply Preprocssing to Test Data

test_data = pd.read_csv('test.csv')

test_data['text'] = test_data['text'].str.replace('http\S+', 'http', regex=True)
test_data['text'] = test_data['text'].str.replace('&\S+', '', regex=True)
test_data['text'] = test_data['text'].str.replace('[0-9]','0', regex=True)
test_data['text'] = test_data['text'].str.replace('[^a-zA-Z0 ]', ' ', regex=True)
test_data['text'] = test_data['text'].str.lower()

test_data['location'] = test_data['location'].str.replace('http\S+', 'http', regex=True) 
test_data['location'] = test_data['location'].str.replace('&\S+', '', regex=True)  
test_data['location'] = test_data['location'].str.replace('[0-9]','0', regex=True)  
test_data['location'] = test_data['location'].str.replace('[^a-zA-Z0 ]', ' ', regex=True)
test_data['location'] = test_data['location'].str.lower()
test_data['location'] = test_data['location'].fillna('')

test_texts = test_data['text'].tolist()

test_texts = tokenizer(test_texts, truncation=True, padding='max_length', max_length=emb_len)  # set the max_length to emb_len, so it has the same dimension as the model input
token_input_test = np.asarray(test_texts['input_ids'])
mask_input_test = np.asarray(test_texts['attention_mask'])

loc_input_test = gloveAndPad(test_data['location'], config['loc_max_len'])

print(token_input_test.shape)
print(mask_input_test.shape)
print(loc_input_test.shape)

(3263, 53)
(3263, 53)
(3263, 5, 100)


In [14]:
pred = model.predict(x={'tokens':token_input_test, 'masks':mask_input_test, 'locs':loc_input_test})
pred = np.asarray(np.rint(pred), dtype=int)  # round to the closest integer, since we did a Sigmoid on the output layer
pred[:10]

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0]])

In [15]:
submission = pd.DataFrame()
submission['id'] = test_data['id']
submission['target'] = pred
submission.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [16]:
submission.to_csv('submission8.csv', index=False)
# Accuracy Score : 0.84737

## Thank you for reading my notebook!
- Please checkout my repository where I have different models(LSTM, CNN, ELMo, BERT etc.) tested on the same dataset.
- https://github.com/MattYoon/NLP-Disaster_Tweets