In [1]:
import re
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
from os import listdir
from os.path import isfile, join

import tensorflow as tf
import transformers
from transformers import BertTokenizer
from transformers import TFAutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PATH = "./semeval-2017-tweets_Subtask-A/downloaded/"
FILES = [PATH+f for f in listdir(PATH) if isfile(join(PATH, f))]
DFS_train = pd.concat([pd.read_csv(file,sep="\t",names=['ID',"label",'text'],encoding="UTF-8") for file in FILES if 'test' not in file])
DFS_test = pd.concat([pd.read_csv(file,sep="\t",names=['ID',"label",'text'],encoding="UTF-8") for file in FILES if 'test' in file and '2016' not in file])

print(len(DFS_train),len(DFS_test))

19710 7790


In [3]:
# Use regex to clean the data
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def decontraction(text):
    text = re.sub(r"won\'t", " will not", text)
    text = re.sub(r"won\'t've", " will not have", text)
    text = re.sub(r"can\'t", " can not", text)
    text = re.sub(r"don\'t", " do not", text)
    
    text = re.sub(r"can\'t've", " can not have", text)
    text = re.sub(r"ma\'am", " madam", text)
    text = re.sub(r"let\'s", " let us", text)
    text = re.sub(r"ain\'t", " am not", text)
    text = re.sub(r"shan\'t", " shall not", text)
    text = re.sub(r"sha\n't", " shall not", text)
    text = re.sub(r"o\'clock", " of the clock", text)
    text = re.sub(r"y\'all", " you all", text)

    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"n\'t've", " not have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'d've", " would have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ll've", " will have", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'re", " are", text)
    return text 

def seperate_alphanumeric(text):
    words = text
    words = re.findall(r"[^\W\d_]+|\d+", words)
    return " ".join(words)

def cont_rep_char(text):
    tchr = text.group(0) 
    
    if len(tchr) > 1:
        return tchr[0:2] 

def unique_char(rep, text):
    substitute = re.sub(r'(\w)\1+', rep, text)
    return substitute

def label_to_float(label):
    return {'positive':1.0,'neutral':0.0,'negative':-1.0}[label]

In [4]:
DFS_train['text'] = DFS_train['text'].apply(lambda x : remove_url(x))
DFS_train['text'] = DFS_train['text'].apply(lambda x : remove_punct(x))
DFS_train['text'] = DFS_train['text'].apply(lambda x : remove_emoji(x))
DFS_train['text'] = DFS_train['text'].apply(lambda x : decontraction(x))
DFS_train['text'] = DFS_train['text'].apply(lambda x : seperate_alphanumeric(x))
DFS_train['text'] = DFS_train['text'].apply(lambda x : unique_char(cont_rep_char,x))
DFS_train['label'] = DFS_train['label'].apply(lambda x : label_to_float(x))

DFS_test['text'] = DFS_test['text'].apply(lambda x : remove_url(x))
DFS_test['text'] = DFS_test['text'].apply(lambda x : remove_punct(x))
DFS_test['text'] = DFS_test['text'].apply(lambda x : remove_emoji(x))
DFS_test['text'] = DFS_test['text'].apply(lambda x : decontraction(x))
DFS_test['text'] = DFS_test['text'].apply(lambda x : seperate_alphanumeric(x))
DFS_test['text'] = DFS_test['text'].apply(lambda x : unique_char(cont_rep_char,x))
DFS_test['label'] = DFS_test['label'].apply(lambda x : label_to_float(x))

In [5]:
seq_len = 256
batch_size = 16
num_samples = len(DFS_train)
model_name = 'distilbert-base-uncased'

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

train_tokens = tokenizer(
    DFS_train['text'].tolist(), 
    max_length=seq_len, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True, 
    return_tensors='np'
)

labels = DFS_train['label'].values

dataset = tf.data.Dataset.from_tensor_slices(
    (
        train_tokens['input_ids'], 
        train_tokens['attention_mask'], 
        labels
    )
)

def map_func(input_ids, masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': masks
    }, labels

dataset = dataset.map(map_func)
dataset = dataset.shuffle(10000).batch(batch_size=batch_size, drop_remainder=True)

split = 0.7
size = int((train_tokens['input_ids'].shape[0] // batch_size) * split)

train_ds = dataset.take(size)
val_ds = dataset.skip(size)


In [7]:
model = TFAutoModel.from_pretrained(model_name)

# Two inputs
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# Transformer
# embeddings = model.bert(input_ids, attention_mask=mask)[1]
embeddings = model(input_ids, attention_mask=mask)[0]
embeddings = embeddings[:, 0, :]
# Classifier head
x = tf.keras.layers.Dense(512, activation='relu')(embeddings)
# x = tf.keras.layers.Dropout(0.1)(x)
y = tf.keras.layers.Dense(1, activation='tanh', name='outputs')(x)

bert_model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# freeze bert layers
bert_model.layers[2].trainable = False

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.MeanSquaredError()

bert_model.compile(optimizer=optimizer, loss=loss)


bert_model.summary()

bert_model.load_weights("./bertjuh/")

# history = bert_model.fit(
#     train_ds,
#     validation_data=val_ds,
#     epochs=10,
#     batch_size=batch_size
# )

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 tf_distil_bert_model_1 (TFDist  TFBaseModelOutput(l  66362880   ['input_ids[0][0]',              
 ilBertModel)                   ast_hidden_state=(N               'attention_mask[0][0]']         
                                one, 256, 768),                                                   
                                 hidden_states=None                                         

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x249c1655510>

In [13]:
large_test = pd.read_csv("./semeval-2017-tweets_Subtask-A/downloaded/twitter-2016test-A.tsv",sep="\t",names=['ID',"label",'text','nan']).drop(columns=['nan'])

large_test['text'] = large_test['text'].apply(lambda x : remove_url(x))
large_test['text'] = large_test['text'].apply(lambda x : remove_punct(x))
large_test['text'] = large_test['text'].apply(lambda x : remove_emoji(x))
large_test['text'] = large_test['text'].apply(lambda x : decontraction(x))
large_test['text'] = large_test['text'].apply(lambda x : seperate_alphanumeric(x))
large_test['text'] = large_test['text'].apply(lambda x : unique_char(cont_rep_char,x))
large_test['label'] = large_test['label'].apply(lambda x : label_to_float(x))

train_tokens = tokenizer(
    large_test['text'].tolist(), 
    max_length=seq_len, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True, 
    return_tensors='np'
)

dataset = tf.data.Dataset.from_tensor_slices(
    (
        train_tokens['input_ids'], 
        train_tokens['attention_mask'], 
        large_test['label']
    )
)

def map_func(input_ids, masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': masks
    }, large_test['label']

dataset = dataset.map(map_func)
dataset = dataset.shuffle(10000).batch(batch_size=batch_size, drop_remainder=True)

bert_model.evaluate(dataset)




0.8978973031044006

In [15]:
train_tokens = tokenizer(
    DFS_test['text'].tolist(), 
    max_length=seq_len, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True, 
    return_tensors='np'
)

dataset = tf.data.Dataset.from_tensor_slices(
    (
        train_tokens['input_ids'], 
        train_tokens['attention_mask'], 
        DFS_test['label']
    )
)

def map_func(input_ids, masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': masks
    }, DFS_test['label']

dataset = dataset.map(map_func)
dataset = dataset.shuffle(10000).batch(batch_size=batch_size, drop_remainder=True)

bert_model.evaluate(dataset)




0.8677359819412231