In [None]:
!pip install transformers focal_loss tensorflow-addons 

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.5 MB/s 
[?25hCollecting focal_loss
  Downloading focal_loss-0.0.7-py3-none-any.whl (19 kB)
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.16.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 39.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 33.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 34.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3

In [None]:
from transformers import AutoTokenizer, TFAutoModel, AutoConfig
import re
import io
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score,precision_score,recall_score
from zipfile import ZipFile
from IPython.display import FileLink 
import tensorflow as tf
from focal_loss import SparseCategoricalFocalLoss
import tensorflow_addons as tfa
import keras
# Seed NumPy's and TensorFlow's global random generators with the same fixed value.
np.random.seed(45)
tf.random.set_seed(45)
#tpu_strategy = tf.distribute.experimental.TPUStrategy(tf.distribute.cluster_resolver.TPUClusterResolver.connect())

In [None]:
# Load the training and validation splits from CSV files under the absolute /content directory.
train_data=pd.read_csv("/content/train.csv")
valid_data=pd.read_csv("/content/valid.csv")

In [None]:
# Rebuild the training frame from three groups: 1,100 sampled rows that are
# offensive but not hate speech, 1,100 sampled non-offensive rows, and every
# row with a non-zero HS_label. The result is then shuffled and re-indexed.
# NOTE: this cell overwrites `train_data`, so re-running it without re-loading
# the CSV samples from the already-reduced frame.
OFF=train_data.loc[(train_data["OFF_label"]==1)&(train_data["HS_label"]==0)].sample(1100)
NORM=train_data.loc[train_data["OFF_label"]==0].sample(1100)
HS=train_data.loc[train_data["HS_label"]!=0]
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat stacks the
# same frames in the same order.
train_data=pd.concat([OFF,NORM,HS])
train_data=train_data.sample(frac=1).reset_index(drop=True)

In [None]:
def encode_labels(data):
    """Collapse the OFF/HS label columns into one integer class id per row.

    A row with a non-zero ``HS_label`` maps to ``HS_label + 1``; otherwise a
    row with a non-zero ``OFF_label`` maps to that ``OFF_label`` value; every
    other row maps to ``0``.

    Args:
        data: DataFrame with ``tweet_text``, ``OFF_label`` and ``HS_label`` columns.

    Returns:
        A ``(texts, labels)`` pair: the tweets as a list of ``str`` and the
        class ids as an ``int32`` NumPy array.
    """
    def _class_id(record):
        # The hate-speech label takes precedence over the offensive label.
        if record["HS_label"] != 0:
            return int(record["HS_label"]) + 1
        if record["OFF_label"] != 0:
            return int(record["OFF_label"])
        return 0

    class_ids = [_class_id(record) for record in data.to_dict(orient="records")]
    tweets = data["tweet_text"].astype("str").tolist()
    return tweets, np.asarray(class_ids, dtype='int32')

# Build the tweet lists and combined integer class ids for both splits.
train_text,train_labels=encode_labels(train_data)
valid_text,valid_labels=encode_labels(valid_data)

In [None]:
def encode_vlabels(data):
    """Extract the tweets and the two per-task label columns from ``data``.

    Args:
        data: DataFrame with ``tweet_text``, ``OFF_label`` and ``HS_label`` columns.

    Returns:
        ``(texts, off_labels, hs_labels)`` where ``texts`` is a list of ``str``,
        ``off_labels`` is a TensorFlow tensor built from the int32 ``OFF_label``
        column, and ``hs_labels`` is the ``HS_label`` column cast to int32
        (left as a pandas Series, not converted to a tensor).
    """
    off_tensor = tf.convert_to_tensor(data["OFF_label"].astype("int32"))
    hs_series = data["HS_label"].astype("int32")
    tweets = data["tweet_text"].astype("str").tolist()
    return tweets, off_tensor, hs_series

# Per-task validation labels. This rebinds `valid_text`, which an earlier cell
# also assigned from encode_labels(valid_data).
valid_text,valid_OFF,valid_HS=encode_vlabels(valid_data)

In [None]:
def Binary_HS_feature_gen(encoded_HS):
    """Turn hate-speech labels into a tensor of "is non-zero" flags.

    Args:
        encoded_HS: Iterable of hate-speech labels.

    Returns:
        A TensorFlow tensor holding ``label != 0`` for each element, in order.
    """
    return tf.convert_to_tensor([label != 0 for label in encoded_HS])
# Binary "HS_label is non-zero" targets for both splits.
train_HS_bn=Binary_HS_feature_gen(train_data["HS_label"])
valid_HS_bn=Binary_HS_feature_gen(valid_data["HS_label"])

In [None]:
def tokenize(sentences,max_length=256,model_name="UBC-NLP/MARBERTv2"):
    """Tokenize sentences into fixed-length id and attention-mask arrays.

    Args:
        sentences: Iterable of strings to encode.
        max_length: Length every sequence is truncated or padded to. The
            default (256) is the value that was previously hard-coded.
        model_name: Name of the pretrained tokenizer to load. The default is
            the checkpoint that was previously hard-coded.

    Returns:
        ``(input_ids, input_masks)``: two ``int32`` NumPy arrays of shape
        ``(len(sentences), max_length)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_ids, input_masks = [],[]
    for sentence in sentences:
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True,max_length=max_length,truncation=True, padding='max_length',return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
    # reshape keeps the arrays 2-D even when `sentences` is empty; for
    # non-empty input every row already has exactly max_length entries.
    return (np.asarray(input_ids, dtype='int32').reshape(-1, max_length),
            np.asarray(input_masks, dtype='int32').reshape(-1, max_length))

# Token ids and attention masks for both splits (the tokenizer is loaded once per call).
train_input_ids,train_input_masks=tokenize(train_text)
valid_input_ids,valid_input_masks=tokenize(valid_text)

In [None]:
def report_gen(predictions,labels):
    """Compute macro F1, accuracy, macro precision and macro recall.

    Args:
        predictions: Predicted class ids.
        labels: Ground-truth class ids.

    Returns:
        Dict with keys ``F1_macro``, ``Accuracy``, ``Precision_macro`` and
        ``Recall_macro``.
    """
    # scikit-learn metrics take (y_true, y_pred). Passing the predictions
    # first leaves F1 and accuracy unchanged but swaps precision with recall,
    # so the ground truth must be the first argument.
    report={
    "F1_macro":f1_score(labels,predictions,average="macro"),
    "Accuracy":accuracy_score(labels,predictions),
    "Precision_macro":precision_score(labels,predictions,average="macro"),
    "Recall_macro":recall_score(labels,predictions,average="macro")
    }
    return report

def eval_taskA(predictions,labels_OFF,return_predictions=False):
    """Reduce per-class scores to binary offensive / not-offensive predictions.

    Each row of ``predictions`` is arg-maxed along axis 1; class 0 maps to 0
    and every other class maps to 1.

    Args:
        predictions: 2-D array-like of per-class scores.
        labels_OFF: Ground-truth binary labels (only used for the report).
        return_predictions: When true, return the binary prediction list
            instead of the metric report.

    Returns:
        The list of 0/1 predictions, or the dict produced by ``report_gen``.
    """
    predicted_classes = np.argmax(predictions, axis=1).tolist()
    predictions_OFF = [0 if class_id == 0 else 1 for class_id in predicted_classes]
    if return_predictions:
        return predictions_OFF
    return report_gen(predictions_OFF, labels_OFF)

def eval_taskB(predictions,labels_HS_bn,return_predictions=False):
    """Reduce per-class scores to binary hate-speech / not-hate-speech predictions.

    Each row of ``predictions`` is arg-maxed along axis 1; classes 0 and 1
    map to 0 and every other class maps to 1.

    Args:
        predictions: 2-D array-like of per-class scores.
        labels_HS_bn: Ground-truth binary labels (only used for the report).
        return_predictions: When true, return the binary prediction list
            instead of the metric report.

    Returns:
        The list of 0/1 predictions, or the dict produced by ``report_gen``.
    """
    predicted_classes = np.argmax(predictions, axis=1).tolist()
    predictions_HS_bn = [0 if class_id in (0, 1) else 1 for class_id in predicted_classes]
    if return_predictions:
        return predictions_HS_bn
    return report_gen(predictions_HS_bn, labels_HS_bn)


def eval_taskC(predictions,labels_HS,return_predictions=False):
    """Reduce per-class scores to fine-grained hate-speech predictions.

    Each row of ``predictions`` is arg-maxed along axis 1; classes 0 and 1
    map to 0 and any other class ``c`` maps to ``c - 1``.

    Args:
        predictions: 2-D array-like of per-class scores.
        labels_HS: Ground-truth hate-speech labels (only used for the report).
        return_predictions: When true, return the prediction list instead of
            the metric report.

    Returns:
        The list of integer predictions, or the dict produced by ``report_gen``.
    """
    predicted_classes = np.argmax(predictions, axis=1).tolist()
    # argmax indices are never negative, so ">= 2" means "neither 0 nor 1".
    predictions_HS = [class_id - 1 if class_id >= 2 else 0 for class_id in predicted_classes]
    if return_predictions:
        return predictions_HS
    return report_gen(predictions_HS, labels_HS)


In [None]:
def create_model(transformer,conv_units=128,qrnn_units=256,dense_units=64,num_classes=8,max_length=256):
    """Build the classifier: transformer -> BiGRU -> Conv1D -> pooled softmax head.

    The first output of ``transformer`` is passed through a bidirectional GRU
    and a 1-D convolution; global average- and max-pooled features are
    concatenated and projected to a softmax over ``num_classes`` classes.

    Args:
        transformer: Encoder called as ``transformer(input_ids, attention_mask=...)``;
            element ``[0]`` of its output is used as the token sequence.
        conv_units: Number of Conv1D filters (previously hard-coded to 128).
        qrnn_units: Units per direction of the GRU (previously hard-coded to 256).
        dense_units: Currently unused; kept so existing callers keep working.
        num_classes: Size of the softmax output. The default of 8 agrees with
            the class count ``gen_weights`` assumes (label ids 0..7); the old
            hard-coded 7-unit head could not represent label id 7.
        max_length: Token length of both inputs (previously hard-coded to 256).

    Returns:
        A ``tf.keras.Model`` taking ``[input_ids, input_masks]`` and returning
        the class probabilities.
    """
    # Use tf.keras throughout instead of mixing it with the standalone `keras` import.
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype='int32')
    input_masks = tf.keras.layers.Input(shape=(max_length,), dtype='int32')
    embedding_layer = transformer(input_ids, attention_mask=input_masks)[0]

    x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(qrnn_units, return_sequences=True, dropout=0.1))(embedding_layer)
    x = tf.keras.layers.Conv1D(conv_units, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    avg_pool_x = tf.keras.layers.GlobalAveragePooling1D()(x)
    max_pool_x = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.concatenate([avg_pool_x, max_pool_x])
    output = tf.keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    model = tf.keras.Model(inputs=[input_ids, input_masks], outputs = [output])

    return model

In [None]:
def gen_weights(labels, num_classes=8):
    """Compute inverse-frequency class weights.

    The weight of class ``c`` is ``len(labels) / (num_classes * count(c))``.

    Args:
        labels: Iterable (with ``len``) of integer class ids.
        num_classes: Number of classes to produce weights for; ids
            ``0 .. num_classes - 1`` each get an entry. Defaults to 8, the
            value that was previously hard-coded.

    Returns:
        Dict mapping each class id in ``range(num_classes)`` to a float
        weight. A class with no samples gets the neutral weight ``1.0``
        instead of raising ``KeyError``.
    """
    count = {}
    for label in labels:
        count[label] = count.get(label, 0) + 1
    total = len(labels)
    weights = {}
    for class_id in range(num_classes):
        occurrences = count.get(class_id, 0)
        # No samples for this class: the weight is never applied, so use a neutral value.
        weights[class_id] = total / (num_classes * occurrences) if occurrences else 1.0
    return weights
# Class weights derived from the combined training labels; passed to model.fit as class_weight.
weights=gen_weights(train_labels)

In [None]:
def build_model():
  """Load the pretrained MARBERTv2 encoder and wrap it with ``create_model``.

  Returns:
      The model returned by ``create_model(transformer)``.
  """
  #with tpu_strategy.scope():
  # BERT-style configs name their dropout rates `hidden_dropout_prob` and
  # `attention_probs_dropout_prob`. The previous `dropout` / `attention_dropout`
  # keywords are not attributes of such a config and are silently dropped, so
  # the intended 0.3 dropout never reached the encoder.
  # NOTE(review): assumes this checkpoint resolves to a BERT config — confirm.
  config =AutoConfig.from_pretrained("UBC-NLP/MARBERTv2",hidden_dropout_prob=0.3,seed=3,attention_probs_dropout_prob=0.3,output_hidden_states = True)
  transformer= TFAutoModel.from_pretrained("UBC-NLP/MARBERTv2",config=config)
  return create_model(transformer)

# Instantiate the model once at module level; later cells train and evaluate this object.
model=build_model()

def train_model(model,input_ids,input_masks,labels,weights,batch_size=64,epochs_frozen=2,epochs_unfrozen=10,verbose=1):
    """Train ``model`` in two phases: frozen first layers, then unfrozen.

    Phase 1 (``epochs_frozen``) marks ``model.layers[:3]`` as non-trainable
    and trains with Adam at 1e-3. Phase 2 (``epochs_unfrozen``) marks those
    layers trainable again and trains with Adam at 1e-4. A phase whose epoch
    count is falsy is skipped.

    Validation data comes from the module-level ``valid_input_ids``,
    ``valid_input_masks`` and ``valid_labels``.

    Args:
        model: Compilable Keras model taking ``[input_ids, input_masks]``.
        input_ids: Training token ids.
        input_masks: Training attention masks.
        labels: Integer class ids for the training rows.
        weights: Dict passed to ``fit`` as ``class_weight``.
        batch_size: Batch size for both phases. Previously this argument was
            ignored and 64 was always used.
        epochs_frozen: Epochs for the frozen phase.
        epochs_unfrozen: Epochs for the unfrozen phase.
        verbose: Verbosity forwarded to ``fit``.
    """
    def _run_phase(trainable, learning_rate, epochs):
        # model.layers[:3] is presumably the two inputs plus the transformer — TODO confirm.
        for layer in model.layers[:3]:
            layer.trainable = trainable

        # Recompile so the changed `trainable` flags and learning rate take effect.
        #with tpu_strategy.scope():
        model.compile(loss="sparse_categorical_crossentropy",
                      optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      metrics=["sparse_categorical_accuracy"]
                     )

        model.fit([input_ids, input_masks],labels,batch_size=batch_size,epochs=epochs,verbose=verbose,
                  validation_data=([valid_input_ids,valid_input_masks],valid_labels), class_weight=weights)

    if epochs_frozen:
        _run_phase(False, 1e-3, epochs_frozen)

    if epochs_unfrozen:
        _run_phase(True, 1e-4, epochs_unfrozen)

# Run both training phases with the default batch size and epoch counts.
# The saved output of this cell is a RuntimeError.
train_model(model,train_input_ids,train_input_masks,train_labels,weights)

RuntimeError: ignored

In [None]:
def evaluate_model(model,input_ids,input_masks,labels_OFF,labels_HS,labels_OFF_not_HS,labels_HS_bn):
    """Predict once with ``model`` and print the TaskA, TaskB and TaskC reports.

    Args:
        model: Object whose ``predict([input_ids, input_masks])`` returns per-class scores.
        input_ids: Token ids to predict on.
        input_masks: Attention masks to predict on.
        labels_OFF: Ground truth for TaskA.
        labels_HS: Ground truth for TaskC.
        labels_OFF_not_HS: Accepted but not used by this function.
        labels_HS_bn: Ground truth for TaskB.
    """
    class_scores = model.predict([input_ids, input_masks])

    report_a = eval_taskA(class_scores, labels_OFF)
    print(f"TaskA:{report_a}")

    report_b = eval_taskB(class_scores, labels_HS_bn)
    print(f"TaskB:{report_b}")

    report_c = eval_taskC(class_scores, labels_HS)
    print(f"TaskC:{report_c}")
# `valid_OFF_not_HS` is not defined by any cell visible in this notebook, so
# the call fails with NameError on a fresh kernel. evaluate_model never reads
# that argument, so None is passed in its place.
evaluate_model(model,valid_input_ids,valid_input_masks,valid_OFF,valid_HS,None,valid_HS_bn)

In [None]:
# `train_OFF`, `train_HS` and `train_OFF_not_HS` are not defined by any cell
# visible in this notebook. Derive the first two from the training frame the
# same way the validation labels are built; the third is never read by
# evaluate_model, so None is passed in its place.
_train_text,train_OFF,train_HS=encode_vlabels(train_data)
evaluate_model(model,train_input_ids,train_input_masks,train_OFF,train_HS,None,train_HS_bn)

In [None]:
def zip_file(task):
    """Create ``<task>.zip`` containing the file ``<task>.txt``.

    Both paths are built by appending the extension to ``task``, so they are
    resolved relative to the current working directory unless ``task``
    itself is a path.

    Args:
        task: Base name shared by the text file and the archive.
    """
    # The with-block closes the archive on exit; the extra close() that used
    # to follow it was redundant.
    with ZipFile(task+".zip", 'w') as myzip:
        myzip.write(task+".txt")
    
def submission_gen_taskA(predictions):
    """Write the TaskA submission file ``TaskA.txt``.

    Each prediction equal to 1 becomes the line ``OFF``; anything else becomes
    ``NOT_OFF``. The file has one tag per line, with no header or index.

    Args:
        predictions: Iterable of TaskA predictions.
    """
    tags = ["OFF" if flag == 1 else "NOT_OFF" for flag in predictions]
    pd.DataFrame(tags).to_csv("TaskA.txt", index=False, header=False)

def submission_gen_taskB(predictions):
    """Write the TaskB submission file ``TaskB.txt``.

    Each prediction equal to 1 becomes the line ``HS``; anything else becomes
    ``NOT_HS``. The file has one tag per line, with no header or index.

    Args:
        predictions: Iterable of TaskB predictions.
    """
    tags = ["HS" if flag == 1 else "NOT_HS" for flag in predictions]
    pd.DataFrame(tags).to_csv("TaskB.txt", index=False, header=False)

def submission_gen_taskC(predictions):
    """Write the TaskC submission file ``TaskC.txt``.

    A prediction equal to 0 becomes the line ``NOT_HS``; any other value ``v``
    becomes ``HS`` followed by ``str(v)``. The file has one tag per line, with
    no header or index.

    Args:
        predictions: Iterable of TaskC predictions.
    """
    tags = ["NOT_HS" if value == 0 else "HS" + str(value) for value in predictions]
    pd.DataFrame(tags).to_csv("TaskC.txt", index=False, header=False)
    
def submission_gen(predictions=None):
    """Write the TaskA, TaskB and TaskC submission text files.

    Args:
        predictions: Optional 2-D array of per-class scores. When given, the
            same array feeds all three tasks, as ``evaluate_model`` does. When
            omitted, the function falls back to the module-level
            ``predictions_OFF``, ``predictions_HS_bn`` and ``predictions_HS``
            that the original version read; no visible cell defines them, so
            they must exist before calling without an argument.
    """
    if predictions is None:
        scores_OFF, scores_HS_bn, scores_HS = predictions_OFF, predictions_HS_bn, predictions_HS
    else:
        scores_OFF = scores_HS_bn = scores_HS = predictions
    # The eval_task* helpers return before touching the labels when
    # return_predictions=True, so None is passed instead of label globals
    # that are not defined in this notebook.
    submission_gen_taskA(eval_taskA(scores_OFF,None,return_predictions=True))
    # Fixed: this used to call `eval_taskB2`, which does not exist.
    submission_gen_taskB(eval_taskB(scores_HS_bn,None,return_predictions=True))
    submission_gen_taskC(eval_taskC(scores_HS,None,return_predictions=True))

#submission_gen()