In [7]:
!pip install --upgrade transformers==4.44.2
!pip install --upgrade tensorflow==2.18.0



In [8]:
import zipfile
import pandas as pd
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [9]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle

In [10]:
!kaggle datasets download -d danofer/sarcasm

Dataset URL: https://www.kaggle.com/datasets/danofer/sarcasm
License(s): copyright-authors
Downloading sarcasm.zip to /content
 84% 182M/216M [00:00<00:00, 629MB/s] 
100% 216M/216M [00:00<00:00, 563MB/s]


In [11]:
zip_ref = zipfile.ZipFile('/content/sarcasm.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [30]:
df = pd.read_csv('/content/train-balanced-sarcasm.csv')
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [31]:
df.shape

(1010826, 10)

In [38]:
df = df.sample(10000, random_state=42)
df.shape

(10000, 10)

In [39]:
df = df[['label', 'comment']]
df.head()

Unnamed: 0,label,comment
608627,1,And he sure as hell is successful!
456977,1,"wait, you have to win the candidates to challe..."
803801,1,I mean how could anyone not see that it is ant...
926708,0,"Funny thing, most of the girl pants I've bough..."
129279,0,"Also Ranch dressing, because LBJ was Texan, an..."


In [40]:
df.isna().sum()

Unnamed: 0,0
label,0
comment,0


In [41]:
# dropping the 55 rows in the 1010826 doesn't make any impact
df.dropna(inplace=True)
df.isna().sum()

Unnamed: 0,0
label,0
comment,0


In [42]:
#removing the unwanted numerals and symbols
df['comment'] = df['comment'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
df.head()

Unnamed: 0,label,comment
608627,1,And he sure as hell is successful
456977,1,wait you have to win the candidates to challen...
803801,1,I mean how could anyone not see that it is ant...
926708,0,Funny thing most of the girl pants Ive bought ...
129279,0,Also Ranch dressing because LBJ was Texan and ...


In [43]:
#converting the data the data into lower case
def lower_case(text):
  return text.lower()

df['comment'] = df['comment'].apply(lower_case)
df.head()

Unnamed: 0,label,comment
608627,1,and he sure as hell is successful
456977,1,wait you have to win the candidates to challen...
803801,1,i mean how could anyone not see that it is ant...
926708,0,funny thing most of the girl pants ive bought ...
129279,0,also ranch dressing because lbj was texan and ...


In [44]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [45]:
def tokenize_data(text, max_length = 100):
  return tokenizer(
      text.tolist(),
      max_length = max_length,
      truncation = True,
      padding = 'max_length',
      return_tensors = 'np'
  )
tokenized_data = tokenize_data(df['comment'])


In [46]:
X = tokenized_data['input_ids']
y = df['label']

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
X_train.shape, X_test.shape

((8000, 100), (2000, 100))

In [49]:
class HierarchicalBERT(tf.keras.Model):
  def __init__(self, bert_model, lstm_units, cnn_filters, dense_units):
    super(HierarchicalBERT, self).__init__()
    self.bert = bert_model

    #sentence encoding layer
    self.dense_sentense = tf.keras.layers.Dense(768, activation='relu')

    #Context Summarization layer
    #adding or pooling above individual vectors into summarized context layer
    self.mean_pooling = tf.keras.layers.GlobalAveragePooling1D()

    #Context Encoder Layer
    #Here we are using the LSTM for capturing the temporal dependencies and context of summaried data from above from both sides
    self.bilstm_encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences = True))

    #CNN Layer
    #it will extract the local features
    self.conv = tf.keras.layers.Conv1D(cnn_filters, 2, activation='relu')
    self.pool = tf.keras.layers.GlobalMaxPooling1D()

    #Fully connected layer
    self.dense = tf.keras.layers.Dense(dense_units, activation='relu')
    #Output Layer
    self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
  def call(self, inputs):
    #BERT embedding
    bert_output = self.bert(inputs)[0]
    #sentence encoding layer
    sentence_encoded = self.dense_sentense(bert_output)

    #context summarization layer
    context_summarized = self.mean_pooling(sentence_encoded)

    #expand the dimension
    context_summarized = tf.expand_dims(context_summarized, 1)

    #context encoder layer
    context_encoded = self.bilstm_encoder(context_summarized)

    #squeezing the dimension
    context_encoded_squeezed = tf.squeeze(context_encoded, axis = 1)

    #adding the channel dimension as required input shapeby convlayer
    context_encoded_expanded = tf.expand_dims(context_encoded_squeezed, axis = -1)
    #CNN layer
    cnn_output = self.conv(context_encoded_expanded)
    cnn_output = self.pool(cnn_output)
    #Fully contected layer
    dense_output = self.dense(cnn_output)
    #output layer
    output = self.output_layer(dense_output)

    return output


In [50]:
#loading the ber model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [51]:
#defining the hierarchical bert model
model = HierarchicalBERT(bert_model, 128, 64, 32)

In [52]:
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

In [53]:

model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5
)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7cafa7ec6b70>

In [54]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.6931701898574829, Test Accuracy: 0.4984999895095825


In [55]:
model.save('Sarcasm_Model')

