In [None]:
# Install the third-party packages imported by the cells below.
!pip install transformers
# PyTorch Geometric extension wheels: the wheel index URL must match the installed
# Torch version and CUDA version (torch.__version__ is 2.0.0+cu118).
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-2.0.0+cu118.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-2.0.0+cu118.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-2.0.0+cu118.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-2.0.0+cu118.html
!pip install torch-geometric
# Download the small English spaCy pipeline that create_dataset loads by name.
!python3 -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
print(torch.__version__)
from torch_geometric.nn import GCNConv, global_mean_pool, global_max_pool
from torch_geometric.data import Data, DataLoader
import numpy as np
from transformers import BertModel, BertTokenizer, BertConfig
import csv
import spacy
import networkx as nx
#from gensim.models import Word2Vec
from tqdm import tqdm
import xml.etree.ElementTree as ET
from google.colab import drive
drive.mount('/content/drive')

2.0.0+cu118
Mounted at /content/drive


In [None]:
# Custom GNN and BiLSTM layers
class GNN(torch.nn.Module):
    """Two stacked ``GCNConv`` layers with ReLU and dropout in between.

    The first layer maps ``input_dim`` node features to ``hidden_dim`` and the
    second maps ``hidden_dim`` to ``output_dim``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # input_dim is the number of features carried by each node.
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index, edge_weight, batch):
        """Return per-node outputs of the second convolution.

        ``x``, ``edge_index`` and ``edge_weight`` are forwarded to both
        convolutions unchanged. ``batch`` is accepted but not used here.
        Dropout (p=0.5) is only active while the module is in training mode.
        """
        hidden = F.dropout(
            F.relu(self.conv1(x, edge_index, edge_weight)),
            p=0.5,
            training=self.training,
        )
        return self.conv2(hidden, edge_index, edge_weight)

class BiLSTM(nn.Module):
    """Thin wrapper around a batch-first, bidirectional ``nn.LSTM``."""

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x):
        """Return the per-step LSTM outputs; the final (h, c) state is discarded."""
        return self.lstm(x)[0]


class BERT_GNN_Classifier(nn.Module):
    """Binary sentence classifier: BERT -> GNN over word nodes -> BiLSTM -> linear head.

    For every sample, one BERT token embedding per word is selected (via
    ``data.corresponding``), refined by the GNN over the sample's graph,
    zero-padded to ``max_length`` steps, run through the BiLSTM, mean-pooled
    over all ``max_length`` steps and projected to a single logit.

    Args:
        num_classes: Accepted for interface compatibility; not used, because
            the head always produces one logit per sample.
        hidden_dim: Hidden size of each LSTM direction.
        num_lstm_layers: Number of stacked LSTM layers.
        gnn_hidden_dim: Output size of the first graph convolution.
        gnn_output_dim: Output size of the second graph convolution and input
            size of the BiLSTM.
        max_length: Number of token ids stored per sample in ``data.input_ids``.
        batch_size: Stored on the instance; the forward pass derives the
            actual batch size from the input instead.
    """

    def __init__(self, num_classes, hidden_dim, num_lstm_layers, gnn_hidden_dim, gnn_output_dim, max_length, batch_size):
        super().__init__()
        self.max_length = max_length
        self.batch_size = batch_size
        self.gnn_output_dim = gnn_output_dim
        self.bert_config = BertConfig.from_pretrained("bert-base-uncased")
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=self.bert_config)
        # The GNN consumes BERT token embeddings, so its input size is BERT's hidden size.
        self.gnn = GNN(self.bert_config.hidden_size, gnn_hidden_dim, gnn_output_dim)
        self.bilstm = BiLSTM(gnn_output_dim, hidden_dim, num_lstm_layers)
        # hidden_dim * 2 because the LSTM is bidirectional; 1 output logit (binary task).
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, data):
        """Compute one logit per sample of a batched graph object.

        ``data`` must expose ``input_ids`` and ``attention_mask`` (flattened,
        ``max_length`` entries per sample), ``corresponding`` (per word, the
        index of its token inside its own sample, not counting the leading
        start token), ``sentence_len`` (words per sample), ``edge_index``,
        ``edge_weight`` and ``batch``.

        Returns:
            Tensor of shape ``(actual_batch_size, 1)`` with raw logits
            (no sigmoid applied).
        """
        input_ids, attention_mask = data.input_ids, data.attention_mask
        corresponding_index = data.corresponding
        sentence_len = torch.as_tensor(data.sentence_len).reshape(-1)
        edge_index, edge_weight, batch = data.edge_index, data.edge_weight, data.batch

        total_nodes = len(corresponding_index)
        actual_batch_size = len(input_ids) // self.max_length
        assert int(sentence_len.sum()) == total_nodes
        assert len(sentence_len) == actual_batch_size

        input_ids = torch.reshape(input_ids, (actual_batch_size, self.max_length))
        attention_mask = torch.reshape(attention_mask, (actual_batch_size, self.max_length))

        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = bert_output.last_hidden_state  # (batch, max_length, bert hidden size)

        # Every index tensor is built on the device of the BERT output so the
        # model also works when it lives on a GPU.
        device = hidden_states.device
        sentence_len = sentence_len.to(device)
        corresponding_index = corresponding_index.to(device)

        # sentence_ids[k] is the sample that word node k belongs to.
        sentence_ids = torch.repeat_interleave(
            torch.arange(actual_batch_size, device=device), sentence_len)

        # + 1 because input_ids start with a start ID, which shifts every
        # word-piece position by one.
        gnn_input = hidden_states[sentence_ids, corresponding_index + 1]

        gnn_output = self.gnn(x=gnn_input, edge_index=edge_index, edge_weight=edge_weight, batch=batch)

        # Scatter the node outputs back into a zero-padded
        # (batch, max_length, gnn_output_dim) tensor: word k of a sample goes to step k.
        sentence_starts = torch.cumsum(sentence_len, dim=0) - sentence_len
        word_positions = torch.arange(total_nodes, device=device) - sentence_starts[sentence_ids]
        bilstm_input = gnn_output.new_zeros(
            (actual_batch_size, self.max_length, self.gnn_output_dim))
        bilstm_input[sentence_ids, word_positions] = gnn_output

        bilstm_output = self.bilstm(bilstm_input)

        # Mean pooling over all max_length steps (padded steps included).
        # An alternative would be to use only the last time step,
        # bilstm_output[:, -1], as the sequence representation.
        pooled_output = torch.mean(bilstm_output, 1)
        logits = self.fc(pooled_output)
        return logits

In [None]:
def _dependency_graph(doc):
    """Build a directed graph with one node per spaCy token and one head -> token edge each.

    A token whose head is itself produces a self-loop; create_dataset treats
    that self-loop as the ROOT of the parse.
    """
    graph = nx.DiGraph()
    for token in doc:
        graph.add_node(token.i, word=token.text)
        graph.add_edge(token.head.i, token.i, relation=token.dep_)
    return graph


def _longest_piece_offset(subwords):
    """Return the position of the longest sub-token in ``subwords`` (0 when it is empty).

    The '##' continuation marker of non-initial pieces is not counted towards
    the length. Ties resolve to the earliest piece.
    """
    if not subwords:
        return 0
    lengths = [len(subwords[0])]
    lengths.extend(
        len(piece[2:]) if piece.startswith('##') else len(piece)
        for piece in subwords[1:]
    )
    return int(np.argmax(lengths))


def create_dataset(texts, labels, tokenizer, max_length, dependency_edge_weight=10.0):
    """Turn raw texts into a list of graph samples (``Data`` objects).

    Each text is parsed with spaCy. Every word becomes a graph node that is
    represented by its longest sub-token. The graph is a complete directed
    graph without self-loops plus one self-loop for the ROOT word; edges that
    are part of the dependency parse get a larger weight.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer labels, same length as ``texts``.
        tokenizer: Tokenizer providing ``tokenize`` and ``__call__``
            (the notebook passes a ``BertTokenizer``).
        max_length: Fixed number of token ids per sample, including the start
            and end IDs. Words that do not fit are dropped from the graph.
        dependency_edge_weight: Weight of the ROOT self-loop and factor applied
            to edges that appear in the dependency parse (default 10.0).

    Returns:
        list of ``Data`` objects with the attributes ``input_ids``,
        ``attention_mask``, ``corresponding``, ``sentence_len``, ``edge_index``,
        ``edge_weight``, ``y`` and ``num_nodes``.
    """
    assert len(texts) == len(labels)
    dataset = []

    # Load the English language model
    nlp = spacy.load("en_core_web_sm")

    for data_index in tqdm(range(len(labels))):
        text, label = texts[data_index], labels[data_index]

        doc = nlp(text)
        graph = _dependency_graph(doc)
        words = [token.text for token in doc]
        sentence = ' '.join(words)

        # For every word: index (within the token sequence, start ID excluded)
        # of the sub-token that represents it. This assumes that tokenizing
        # the words one by one yields the same pieces as tokenizing `sentence`.
        corresponding_index = []
        current_index = 0
        for word_position, word in enumerate(words):
            subwords = tokenizer.tokenize((' ' if word_position > 0 else '') + word)
            # A word without any sub-token (e.g. a whitespace token) is mapped
            # to the position of the next piece instead of raising IndexError.
            longest_index = _longest_piece_offset(subwords)
            if 1 + current_index + longest_index < max_length - 1:
                # Must stay inside max_length: input_ids carry a start ID and
                # an end ID, hence the +1 and -1.
                corresponding_index.append(current_index + longest_index)
                current_index += len(subwords)
            else:
                # The sentence does not fit: parse again using only the words kept so far.
                graph = _dependency_graph(nlp(' '.join(words[:word_position])))
                break

        corresponding_index = torch.tensor(corresponding_index, dtype=torch.long)

        tokenized = tokenizer(sentence, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")

        num_nodes = len(corresponding_index)
        # Complete directed graph, no self-loop. Edges are ordered by target
        # node first, then by source node.
        source_nodes = [i for j in range(num_nodes) for i in range(num_nodes) if i != j]
        target_nodes = [j for j in range(num_nodes) for i in range(num_nodes) if i != j]

        # Placeholder (-1, -1) for the ROOT self-loop; it is always the last edge.
        source_nodes.append(-1)
        target_nodes.append(-1)

        edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
        edge_weight = torch.ones(edge_index.shape[1])

        has_ROOT = False
        # Emphasise the edges of the dependency parse tree.
        for source_index, target_index in graph.edges():
            if source_index >= num_nodes or target_index >= num_nodes:
                # The re-parse of a truncated sentence may yield more tokens
                # than kept words; such edges have no slot in edge_weight.
                continue
            if source_index == target_index:
                # ROOT. If the parse has several self-loops, the last one wins.
                edge_index[0, -1] = source_index
                edge_index[1, -1] = target_index
                edge_weight[-1] = dependency_edge_weight
                has_ROOT = True
            else:
                # Slot of (source -> target) in the complete graph built above:
                # each target owns (num_nodes - 1) consecutive slots, and the
                # skipped self-loop shifts larger sources down by one.
                slot = target_index * (num_nodes - 1) + source_index
                if source_index > target_index:
                    slot -= 1
                edge_weight[slot] *= dependency_edge_weight
        if not has_ROOT:
            print("No ROOT, something wrong!")

        y = torch.tensor(label, dtype=torch.long)

        if len(tokenized["input_ids"][0]) != max_length:
            print(len(tokenized["input_ids"][0]))
            print("The length is wrong!")
        # Do NOT name the attribute `corresponding_index`: for attribute names
        # ending in '_index' the Data/batching machinery treats the values as
        # node indices and offsets them when samples are batched, e.g.
        #   [1,2,5], [2,3,9], [3,6,1] -> [1,2,5,8,9,15,19,22,17]
        # instead of
        #   [1,2,5], [2,3,9], [3,6,1] -> [1,2,5,2,3,9,3,6,1]
        # num_nodes passed to Data() is summed over the batch (a single number, not a list).
        data = Data(input_ids=tokenized["input_ids"][0], attention_mask=tokenized["attention_mask"][0],
                    corresponding=corresponding_index, sentence_len=torch.tensor(len(corresponding_index), dtype=torch.long),
                    edge_index=edge_index, edge_weight=edge_weight, y=y, num_nodes=torch.tensor(num_nodes, dtype=torch.long))
        dataset.append(data)

    return dataset

In [None]:
# **sigmoid**
# If using sigmoid, we should use this accuracy function
def calculate_accuracy(output, target):
    """Return the fraction of entries where ``output``, thresholded at 0.5, equals ``target``.

    Values strictly greater than 0.5 count as class 1, everything else as class 0.
    """
    predicted = output.gt(0.5).to(torch.float)
    n_correct = predicted.eq(target).sum().item()
    return n_correct / target.numel()

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(batch)`` that returns one logit per sample.
        data_loader: Iterable of batches exposing ``.to(device)`` and ``.y``.
        loss_fn: Loss called as ``loss_fn(logits, labels)``. The aggregation
            assumes it returns the mean over the batch, as
            ``nn.BCEWithLogitsLoss()`` does by default.
        optimizer: Optimizer over the model parameters.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen. Both values are
        weighted by batch size, so a smaller final batch does not count as
        much as a full one. An empty loader yields ``(0.0, 0.0)``.
    """
    model.train()
    loss_sum = 0.0
    correct_sum = 0.0
    sample_count = 0
    for batch in tqdm(data_loader):
        batch = batch.to(device)
        labels = batch.y.unsqueeze(1).float().to(device)

        optimizer.zero_grad()
        logits = model(batch)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        batch_samples = labels.shape[0]
        probabilities = torch.sigmoid(logits.detach().view(-1)).unsqueeze(1)
        # Turn the per-batch means back into sums before accumulating.
        loss_sum += loss.item() * batch_samples
        correct_sum += calculate_accuracy(probabilities, labels) * batch_samples
        sample_count += batch_samples

    if sample_count == 0:
        return 0.0, 0.0
    return loss_sum / sample_count, correct_sum / sample_count


def eval_epoch(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        model: Module called as ``model(batch)`` that returns one logit per sample.
        data_loader: Iterable of batches exposing ``.to(device)`` and ``.y``.
        loss_fn: Loss called as ``loss_fn(logits, labels)``. The aggregation
            assumes it returns the mean over the batch, as
            ``nn.BCEWithLogitsLoss()`` does by default.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen. Both values are
        weighted by batch size, so a smaller final batch does not distort the
        reported metrics. An empty loader yields ``(0.0, 0.0)``.
    """
    model.eval()
    loss_sum = 0.0
    correct_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            batch = batch.to(device)
            labels = batch.y.unsqueeze(1).float().to(device)

            logits = model(batch)
            loss = loss_fn(logits, labels)

            batch_samples = labels.shape[0]
            probabilities = torch.sigmoid(logits.view(-1)).unsqueeze(1)
            # Turn the per-batch means back into sums before accumulating.
            loss_sum += loss.item() * batch_samples
            correct_sum += calculate_accuracy(probabilities, labels) * batch_samples
            sample_count += batch_samples

    if sample_count == 0:
        return 0.0, 0.0
    return loss_sum / sample_count, correct_sum / sample_count

In [None]:
# Shared configuration used by the cells below: tokenizer, sequence length,
# batch size and device. Texts and labels are loaded later as plain Python lists.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_length = 50  # Token ids per sample (start and end IDs included); adjust to your dataset
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
texts_PTD = []
labels_PTD = []

# Seed for the train/validation split so that re-running the notebook
# reproduces the same partition.
split_seed = 42

# Read the CSV file: column 0 is the label ("-1" = negative), column 1 the text.
# newline='' is what the csv module expects, so quoted fields that contain
# line breaks are parsed correctly.
with open("/content/drive/My Drive/puns_pos_neg_data.csv", mode='r', newline='', encoding='utf-8') as file:

    csvFile = csv.reader(file)

    # Skip the header row instead of parsing it and deleting it afterwards.
    next(csvFile, None)

    for line in csvFile:
        if not line:
            # Blank line in the file: nothing to read.
            continue
        labels_PTD.append(0 if line[0] == "-1" else 1)
        texts_PTD.append(line[1])

# Build the graph samples
dataset = create_dataset(texts=texts_PTD, labels=labels_PTD,
                         tokenizer=tokenizer, max_length=max_length)

# Split the dataset into training (80%) and validation (20%) sets
val_size = int(0.2 * len(labels_PTD))
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [len(labels_PTD) - val_size, val_size],
    generator=torch.Generator().manual_seed(split_seed))

# Create DataLoaders for each set with a batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

100%|██████████| 4826/4826 [01:34<00:00, 51.02it/s]


In [None]:
# Hyperparameters and output path for the BERT + GNN + BiLSTM classifier
num_classes = len(set(labels_PTD))  # Number of distinct labels; assumes labels are integers starting from 0
hidden_dim = 128  # hidden size of each LSTM direction
num_lstm_layers = 2
gnn_hidden_dim = 512  # output size of the first graph convolution
gnn_output_dim = 256  # output size of the second graph convolution (BiLSTM input size)
learning_rate = 2e-5
model_path = "/content/drive/My Drive/my_GNN_PT_model.pt"  # Choose your desired path and filename


# Set up the loss function
# loss_fn = nn.CrossEntropyLoss() # use by softmax
# BCEWithLogitsLoss takes raw logits, so the model does not apply a sigmoid itself.
loss_fn = nn.BCEWithLogitsLoss()

In [None]:
# Build the classifier (this loads the pretrained BERT weights) and move it to the device.
model = BERT_GNN_Classifier(num_classes, hidden_dim, num_lstm_layers,
                            gnn_hidden_dim, gnn_output_dim, max_length, batch_size).to(device)

# A single Adam optimizer updates all parameters (BERT, GNN, BiLSTM and head).
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
num_epochs = 5

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss, train_acc = train_epoch(model, train_loader, loss_fn, optimizer, device)
    # The checkpoint is written after every epoch, before validation, and
    # overwrites the previous one, so the file always holds the latest epoch.
    torch.save(model, model_path) # save the entire model, including the architecture
    val_loss, val_acc = eval_epoch(model, val_loader, loss_fn, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}\n")
print("Training complete.")

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/5


100%|██████████| 121/121 [23:48<00:00, 11.81s/it]
100%|██████████| 31/31 [01:50<00:00,  3.56s/it]


Train Loss: 0.5895, Validation Loss: 0.4559, Train Accuracy: 0.7923, Validation Accuracy: 0.8827

Epoch 2/5


100%|██████████| 121/121 [23:35<00:00, 11.70s/it]
100%|██████████| 31/31 [01:50<00:00,  3.56s/it]


Train Loss: 0.3710, Validation Loss: 0.3703, Train Accuracy: 0.9179, Validation Accuracy: 0.8833

Epoch 3/5


100%|██████████| 121/121 [23:39<00:00, 11.73s/it]
100%|██████████| 31/31 [01:50<00:00,  3.55s/it]


Train Loss: 0.2456, Validation Loss: 0.2680, Train Accuracy: 0.9434, Validation Accuracy: 0.9179

Epoch 4/5


100%|██████████| 121/121 [23:26<00:00, 11.63s/it]
100%|██████████| 31/31 [01:50<00:00,  3.55s/it]


Train Loss: 0.1511, Validation Loss: 0.2409, Train Accuracy: 0.9674, Validation Accuracy: 0.9143

Epoch 5/5


100%|██████████| 121/121 [23:25<00:00, 11.62s/it]
100%|██████████| 31/31 [01:48<00:00,  3.50s/it]

Train Loss: 0.0937, Validation Loss: 0.2463, Train Accuracy: 0.9773, Validation Accuracy: 0.9210

Training complete.





In [None]:
# model_path = "/content/drive/My Drive/my_GNN_PT_model.pt"
# The checkpoint was written with torch.save(model, ...), i.e. it is a pickled
# module, not a state dict:
#   * weights_only=False is required to unpickle it (newer torch versions
#     default to True). Unpickling executes code, so only load trusted files.
#   * map_location=device lets a checkpoint saved on a GPU load on a CPU-only
#     runtime (and vice versa).
loaded_model = torch.load(model_path, map_location=device, weights_only=False)

In [None]:
# Download the SemEval-2017 Task 7 archive and unpack it into the working directory.
!wget https://alt.qcri.org/semeval2017/task7/data/uploads/semeval2017_task7.tar.xz
!tar -xf semeval2017_task7.tar.xz
#!tar -xvf semeval2017_task7.tar.xz
#%cd semeval2017_task7/
#%cd ..
# List the working directory to confirm the archive was extracted.
%ls

--2023-05-02 20:27:45--  https://alt.qcri.org/semeval2017/task7/data/uploads/semeval2017_task7.tar.xz
Resolving alt.qcri.org (alt.qcri.org)... 80.76.166.231
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.231|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 748424 (731K) [application/x-xz]
Saving to: ‘semeval2017_task7.tar.xz’


2023-05-02 20:27:48 (823 KB/s) - ‘semeval2017_task7.tar.xz’ saved [748424/748424]

[0m[01;34msample_data[0m/  [01;34msemeval2017_task7[0m/  semeval2017_task7.tar.xz


In [None]:
f = 'semeval2017_task7/data/test/subtask1-heterographic-test.xml'

mytree = ET.parse(f)
myroot = mytree.getroot()

# One single-key dict per <text> element: {text id: {child id: child text}}.
puns = [
    {text_node.attrib['id']: {word_node.attrib['id']: word_node.text for word_node in text_node}}
    for text_node in myroot.findall('./text')
]

print(puns[0])

{'het_1': {'het_1_1': "'", 'het_1_2': "'", 'het_1_3': 'I', 'het_1_4': "'", 'het_1_5': 'm', 'het_1_6': 'halfway', 'het_1_7': 'up', 'het_1_8': 'a', 'het_1_9': 'mountain', 'het_1_10': ',', 'het_1_11': "'", 'het_1_12': "'", 'het_1_13': 'Tom', 'het_1_14': 'alleged', 'het_1_15': '.'}}


In [None]:
# Gold labels: the second tab-separated field of every line, as int.
gold = []
with open('semeval2017_task7/data/test/subtask1-heterographic-test.gold', 'r') as fin:
  for row in fin:
    fields = row.strip().split('\t')
    if len(fields) < 2:
      # Blank or incomplete line (e.g. a trailing newline): nothing to read.
      continue
    gold.append(int(fields[1]))
# Show a short preview instead of dumping every label into the cell output.
print(f"{len(gold)} gold labels; first 20: {gold[:20]}")

[1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 

In [None]:
# One space-joined sentence per pun; non-breaking spaces inside a word become '_'.
subtask1_heterographic = [
    ' '.join(word.replace(u'\xa0', '_') for word in pun.values())
    for i in puns
    for pun in i.values()
]

print(len(gold))
print(len(subtask1_heterographic))
print(subtask1_heterographic[0])

1780
1780
' ' I ' m halfway up a mountain , ' ' Tom alleged .


In [None]:
# Build graph samples for the heterographic test sentences, with the gold labels as targets.
test_dataset = create_dataset(texts=subtask1_heterographic, labels=gold,
                         tokenizer=tokenizer, max_length=max_length)

# Evaluate the reloaded model on the test set (no shuffling needed for evaluation).
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
test_loss, test_acc = eval_epoch(loaded_model, test_loader, loss_fn, device)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

100%|██████████| 1780/1780 [00:18<00:00, 98.09it/s]
100%|██████████| 56/56 [03:22<00:00,  3.62s/it]

Test Loss: 0.5280, Test Accuracy: 0.8258





In [None]:
f = 'semeval2017_task7/data/test/subtask1-homographic-test.xml'

mytree = ET.parse(f)
myroot = mytree.getroot()

# One single-key dict per <text> element: {text id: {child id: child text}}.
puns = [
    {text_node.attrib['id']: {word_node.attrib['id']: word_node.text for word_node in text_node}}
    for text_node in myroot.findall('./text')
]

print(puns[0])

{'hom_1': {'hom_1_1': 'They', 'hom_1_2': 'hid', 'hom_1_3': 'from', 'hom_1_4': 'the', 'hom_1_5': 'gunman', 'hom_1_6': 'in', 'hom_1_7': 'a', 'hom_1_8': 'sauna', 'hom_1_9': 'where', 'hom_1_10': 'they', 'hom_1_11': 'could', 'hom_1_12': 'sweat', 'hom_1_13': 'it', 'hom_1_14': 'out', 'hom_1_15': '.'}}


In [None]:
# Gold labels: the second tab-separated field of every line, as int.
gold = []
with open('semeval2017_task7/data/test/subtask1-homographic-test.gold', 'r') as fin:
  for row in fin:
    fields = row.strip().split('\t')
    if len(fields) < 2:
      # Blank or incomplete line (e.g. a trailing newline): nothing to read.
      continue
    gold.append(int(fields[1]))
# Show a short preview instead of dumping every label into the cell output.
print(f"{len(gold)} gold labels; first 20: {gold[:20]}")

[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 

In [None]:
# One space-joined sentence per pun; non-breaking spaces inside a word become '_'.
subtask1_homographic = [
    ' '.join(word.replace(u'\xa0', '_') for word in pun.values())
    for i in puns
    for pun in i.values()
]

print(len(gold))
print(len(subtask1_homographic))
print(subtask1_homographic[0])

2250
2250
They hid from the gunman in a sauna where they could sweat it out .


In [None]:
# Build graph samples for the homographic test sentences, with the gold labels as targets.
test_dataset = create_dataset(texts=subtask1_homographic, labels=gold,
                         tokenizer=tokenizer, max_length=max_length)

# Evaluate the reloaded model on the test set (no shuffling needed for evaluation).
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
test_loss, test_acc = eval_epoch(loaded_model, test_loader, loss_fn, device)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

100%|██████████| 2250/2250 [00:35<00:00, 63.73it/s]
100%|██████████| 71/71 [06:57<00:00,  5.88s/it]

Test Loss: 0.4749, Test Accuracy: 0.8577



