In [1]:
#importing libraries
import zipfile
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [4]:
zip_ref = zipfile.ZipFile('/content/sarcasm.zip')
zip_ref.extractall('/content')
zip_ref.close()

In [5]:
df = pd.read_csv('/content/train-balanced-sarcasm.csv')
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [6]:
df.shape

(1010826, 10)

In [7]:
df = df[:10000]
df = df[['label','comment']]
df.head()

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.


In [8]:
df.shape

(10000, 2)

In [9]:
df.isna().sum()

Unnamed: 0,0
label,0
comment,1


In [10]:
df.dropna(inplace = True)

In [11]:
df.isna().sum()

Unnamed: 0,0
label,0
comment,0


In [12]:
# removing unwanted numerals and symbols
df['comment'] = df['comment'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

In [13]:
# converting the data into lowercase
def lowercase(text):
  return text.lower()

df['comment'] = df['comment'].apply(lowercase)

In [14]:
df.head()

Unnamed: 0,label,comment
0,0,nc and nh
1,0,you do know west teams play against west teams...
2,0,they were underdogs earlier today but since gr...
3,0,this meme isnt funny none of the new york nigg...
4,0,i could use one of those tools


### Tokenization

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
#function for tokenization
def tokenized_data(text, max_length=100):
  return tokenizer(
      text.tolist(),
      max_length = max_length,
      truncation = True,
      padding = 'max_length',
      return_tensors = 'np'
  )


tokenized_data = tokenized_data(df['comment'])

In [17]:
tokenized_data

{'input_ids': array([[  101, 13316,  1998, ...,     0,     0,     0],
       [  101,  2017,  2079, ...,     0,     0,     0],
       [  101,  2027,  2020, ...,     0,     0,     0],
       ...,
       [  101,  5095,  2305, ...,     0,     0,     0],
       [  101, 29420,  2015, ...,     0,     0,     0],
       [  101,  2016, 28719, ...,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}

In [18]:
X = tokenized_data['input_ids']
y = df['label']

### Train Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
X_train.shape, X_test.shape

((7999, 100), (2000, 100))

### Model Building

In [21]:
class HierarchicalBERT(tf.keras.Model):
  def __init__(self, bert_model, lstm_units, cnn_filters, dense_units):
    super(HierarchicalBERT, self).__init__()
    self.bert = bert_model

    # sentence encoding layer
    self.dense_sentence = tf.keras.layers.Dense(768, activation='relu')

    # Context Summarization Layer
    self.mean_pooling = tf.keras.layers.GlobalAveragePooling1D()

    # Context Encoder Layer
    self.bilstm_encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True))

    #CNN layer
    self.conv = tf.keras.layers.Conv1D(cnn_filters, 2, activation='relu')
    self.pool = tf.keras.layers.GlobalMaxPool1D()

    #FC layer
    self.dense_df = tf.keras.layers.Dense(dense_units, activation='relu')

    #output layer
    self.dense_output = tf.keras.layers.Dense(1, activation='sigmoid')


  def call(self, inputs):
    #BERT embedding
    bert_output = self.bert(inputs) [0]

    #sentence encoding layer
    sentence_encoded = self.dense_sentence(bert_output)

    #context summarization
    context_summarized = self.mean_pooling(sentence_encoded)

    #expand the dimension
    context_summarized = tf.expand_dims(context_summarized , 1)

    #context encoder layer
    context_encoded = self.bilstm_encoder(context_summarized)

    #squeezing the dimension
    context_encoded_squeezed = tf.squeeze(context_encoded, axis = 1)


    #adding the channel dimension to match the required input shape by conv layer
    context_encoded_expanded = tf.expand_dims(context_encoded_squeezed, axis = -1)

    #CNN layer
    cnn_output = self.conv(context_encoded_expanded)

    #pooling layer
    pooled_output = self.pool(cnn_output)

    #FC layer
    dense_output = self.dense_df(pooled_output)

    #output layer
    final_output = self.dense_output(dense_output)

    return final_output

In [22]:
#loading the pretrained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were 

In [23]:
#defining the hierarchical bert model
model = HierarchicalBERT(bert_model, lstm_units=128, cnn_filters=64, dense_units=32)

In [24]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
model.fit(X_train, y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7e8a8c668b90>

In [27]:
loss, accuracy = model.evaluate(X_test,y_test)
print(f'Model Accuracy : {accuracy * 100}')

Model Accuracy : 63.749998807907104


In [28]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
