In [None]:
# Install the `transformers` package, which provides the BertTokenizer and
# TFBertModel classes imported in the next cell.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
import csv
import xml.etree.ElementTree as ET
from google.colab import drive
# Mount Google Drive: later cells read the dataset CSV from, and save the
# trained model to, '/content/drive/My Drive'.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset from Google Drive.
# Each CSV row holds the label in column 0 ("-1" means negative, any other
# value is treated as positive) and the text sample in column 1.
# The first row of the file is a header and is skipped.

texts = []   # text samples (strings)
labels = []  # binary labels: 0 for "-1" rows, 1 otherwise

# newline='' is what the csv module asks for when opening files, so that
# newlines embedded inside quoted fields are parsed correctly.
with open("/content/drive/My Drive/puns_pos_neg_data.csv", mode='r', newline='') as file:
    csvFile = csv.reader(file)

    # Skip the header up front instead of parsing it as data and deleting it
    # afterwards; the default keeps an empty file from raising.
    next(csvFile, None)

    for line in csvFile:
        if not line:
            # csv.reader yields an empty list for blank lines; indexing it
            # below would raise IndexError.
            continue
        labels.append(0 if line[0] == "-1" else 1)
        texts.append(line[1])

print(len(texts))
print(len(labels))

4826
4826


In [None]:
# Train a BiLSTM + sigmoid head on top of frozen BERT features, evaluate it on
# a held-out validation split, and save the resulting model to Google Drive.

# Seed NumPy and TensorFlow so that weight initialisation and batch shuffling
# are repeatable between runs (the train/validation split is seeded as well).
# Some GPU kernels may still introduce small run-to-run differences.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset; every sample is padded or truncated to max_length tokens
max_length = 50
encodings = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')

# Split the dataset into train and validation sets (80% / 20%)
input_ids_train, input_ids_val, attention_mask_train, attention_mask_val, y_train, y_val = train_test_split(
    encodings['input_ids'].numpy(), encodings['attention_mask'].numpy(), labels, test_size=0.2, random_state=seed)

# Keras matches these dict keys against the names of the Input layers below
x_train = {'input_ids': input_ids_train, 'attention_mask': attention_mask_train}
x_val = {'input_ids': input_ids_val, 'attention_mask': attention_mask_val}

learning_rate = 2e-5
hidden_dim = 128

# Load the BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Set BERT layers to non-trainable, so only the BiLSTM and Dense head are updated
for layer in bert_model.layers:
    layer.trainable = False

# Define the custom model with BERT and a BiLSTM layer
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

bert_output = bert_model(input_ids, attention_mask=attention_mask)[0] # The last hidden-state is the first element of the output tuple
# NOTE(review): no mask is passed to the BiLSTM, so positions added by
# padding='max_length' appear to be processed like real tokens. Consider
# feeding attention_mask as the RNN mask -- confirm before changing, since it
# alters the trained model.
bilstm_output = Bidirectional(LSTM(hidden_dim, return_sequences=False))(bert_output)
output = Dense(1, activation='sigmoid')(bilstm_output)

model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model (batch_size is reused by the test-set evaluation cells)
batch_size = 32
epochs = 3
model.fit(x=x_train, y=np.array(y_train),
          batch_size=batch_size, epochs=epochs,
          validation_data=(x_val, np.array(y_val)))

# Evaluate the model on the validation split; score is [loss, accuracy]
score = model.evaluate(x_val, np.array(y_val), batch_size=batch_size)
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])
model.save('/content/drive/My Drive/my_TF_model.h5')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation loss: 0.38107603788375854
Validation accuracy: 0.8405796885490417




In [None]:
# Reload the model saved by the training cell. TFBertModel has to be
# registered as a custom object, otherwise the BERT layer cannot be loaded.
custom_objects = {'TFBertModel': TFBertModel}
loaded_model = load_model(
    '/content/drive/My Drive/my_TF_model.h5',
    custom_objects=custom_objects,
)



In [1]:
!wget https://alt.qcri.org/semeval2017/task7/data/uploads/semeval2017_task7.tar.xz
!tar -xf semeval2017_task7.tar.xz
#!tar -xvf semeval2017_task7.tar.xz
#%cd semeval2017_task7/
#%cd ..
%ls

--2023-05-02 20:29:25--  https://alt.qcri.org/semeval2017/task7/data/uploads/semeval2017_task7.tar.xz
Resolving alt.qcri.org (alt.qcri.org)... 80.76.166.231
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.231|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 748424 (731K) [application/x-xz]
Saving to: ‘semeval2017_task7.tar.xz’


2023-05-02 20:29:27 (869 KB/s) - ‘semeval2017_task7.tar.xz’ saved [748424/748424]

[0m[01;34msample_data[0m/  [01;34msemeval2017_task7[0m/  semeval2017_task7.tar.xz


In [None]:
# Parse the heterographic subtask-1 test set. Every <text> element under the
# root is one sample, and each of its children holds one token.
f = 'semeval2017_task7/data/test/subtask1-heterographic-test.xml'

mytree = ET.parse(f)
myroot = mytree.getroot()

# puns is a list of single-key dicts: {text id: {token id: token text}}
puns = []
for item in myroot.findall('./text'):
    tokens_by_id = {child.attrib['id']: child.text for child in item}
    puns.append({item.attrib['id']: tokens_by_id})

print(puns[0])

{'het_1': {'het_1_1': "'", 'het_1_2': "'", 'het_1_3': 'I', 'het_1_4': "'", 'het_1_5': 'm', 'het_1_6': 'halfway', 'het_1_7': 'up', 'het_1_8': 'a', 'het_1_9': 'mountain', 'het_1_10': ',', 'het_1_11': "'", 'het_1_12': "'", 'het_1_13': 'Tom', 'het_1_14': 'alleged', 'het_1_15': '.'}}


In [None]:
# Read the gold labels for the heterographic test set: the label is the second
# tab-separated field on each line (kept as a string here).
with open('semeval2017_task7/data/test/subtask1-heterographic-test.gold', 'r') as fin:
    gold = [row.strip().split('\t')[1] for row in fin]
print(gold)

['1', '1', '0', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1', '1', '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '0', '0', '1', '1', '1', '0', '1', '1', '1', '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '0', '0', '1', '1', '0', '1', '1', '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0', '1', '1', '0', '1', '0', '1', '1', '1', '1', '1', '1', '0', '0', '1', '0', '1', '0', '1', '1', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0', '1', '1', '0', '1', '1', '1', '1', '0', '0', '1', '0', '0', '1', '1', '0', '1', '0', '0', '0', '1', '0', '1', '0', '0', '1', '0', '1', '0', '1', '0', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '1', '1', '1', '0', '0', '1',

In [None]:
# Rebuild every sample as one space-separated string of its tokens.
subtask1_heterographic = []
for i in puns:
    for pun in i.values():
        # Dicts keep insertion order, so the tokens are joined in the order
        # they were read from the XML file.
        sentence = ' '.join(pun.values())
        subtask1_heterographic.append(sentence)

print(len(gold))
print(len(subtask1_heterographic))

# The evaluation cell pairs labels and sentences by position, so the two
# lists must have the same length.
assert len(gold) == len(subtask1_heterographic), "gold labels and sentences are misaligned"

1780
1780


In [None]:
# Evaluate the reloaded model on the heterographic test sentences.

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the test sentences, padding/truncating each to max_length tokens
max_length = 50
encodings = tokenizer(
    subtask1_heterographic,
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='tf',
)

input_ids_test = encodings['input_ids'].numpy()
attention_mask_test = encodings['attention_mask'].numpy()
y_test = np.array(gold, dtype=int)  # gold labels are strings; convert to ints

x_test = {'input_ids': input_ids_test, 'attention_mask': attention_mask_test}

# Evaluate the model; batch_size is the value defined in the training cell
score = loaded_model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.5562201738357544
Test accuracy: 0.7348314523696899


In [None]:
# Parse the homographic subtask-1 test set. Every <text> element under the
# root is one sample, and each of its children holds one token.
f = 'semeval2017_task7/data/test/subtask1-homographic-test.xml'

mytree = ET.parse(f)
myroot = mytree.getroot()

# puns is a list of single-key dicts: {text id: {token id: token text}}
puns = []
for item in myroot.findall('./text'):
    tokens_by_id = {child.attrib['id']: child.text for child in item}
    puns.append({item.attrib['id']: tokens_by_id})

print(puns[0])

{'hom_1': {'hom_1_1': 'They', 'hom_1_2': 'hid', 'hom_1_3': 'from', 'hom_1_4': 'the', 'hom_1_5': 'gunman', 'hom_1_6': 'in', 'hom_1_7': 'a', 'hom_1_8': 'sauna', 'hom_1_9': 'where', 'hom_1_10': 'they', 'hom_1_11': 'could', 'hom_1_12': 'sweat', 'hom_1_13': 'it', 'hom_1_14': 'out', 'hom_1_15': '.'}}


In [None]:
# Read the gold labels for the homographic test set: the label is the second
# tab-separated field on each line (kept as a string here).
with open('semeval2017_task7/data/test/subtask1-homographic-test.gold', 'r') as fin:
    gold_homo = [row.strip().split('\t')[1] for row in fin]
print(gold_homo)

['1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '0', '0', '0', '1', '1', '0', '1', '0', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '1', '0', '0', '1', '0', '1', '1', '1', '1', '0', '0', '0', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '0', '0', '1', '1', '0', '1', '1', '1', '1', '1', '0', '1', '1', '0', '1', '1', '1', '0', '1', '1', '0', '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1', '0', '1', '0', '1', '1', '0', '1', '1', '0', '1', '0', '1', '0', '1', '1', '1', '1', '1', '0',

In [None]:
# Rebuild every sample as one space-separated string of its tokens.
subtask1_homographic = []
for i in puns:
    for pun in i.values():
        # Dicts keep insertion order, so the tokens are joined in the order
        # they were read from the XML file.
        sentence = ' '.join(pun.values())
        subtask1_homographic.append(sentence)

print(len(gold_homo))
print(len(subtask1_homographic))

# The evaluation cell pairs labels and sentences by position, so the two
# lists must have the same length.
assert len(gold_homo) == len(subtask1_homographic), "gold labels and sentences are misaligned"

2250
2250


In [None]:
# Evaluate the reloaded model on the homographic test sentences.

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the test sentences, padding/truncating each to max_length tokens
max_length = 50
encodings = tokenizer(
    subtask1_homographic,
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='tf',
)

input_ids_test = encodings['input_ids'].numpy()
attention_mask_test = encodings['attention_mask'].numpy()
y_test = np.array(gold_homo, dtype=int)  # gold labels are strings; convert to ints

x_test = {'input_ids': input_ids_test, 'attention_mask': attention_mask_test}

# Evaluate the model; batch_size is the value defined in the training cell
score = loaded_model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.5362956523895264
Test accuracy: 0.7368888854980469
