# Import and Config

In [1]:
import pandas as pd
import spacy
import benepar
from sentence_transformers import SentenceTransformer




In [2]:
# choose models
SPACY_MODEL = "en_core_web_sm"
BEP_MODEL = "benepar_en3"               # benepar model name (download below)
SENT_TRANS_MODEL = "all-MiniLM-L6-v2"  # small/faster SBERT model

In [3]:
# ensure benepar model is available
try:
    benepar.download(BEP_MODEL)
except Exception:
    # benepar may already be present or download may be disabled in some envs
    pass

nlp = spacy.load(SPACY_MODEL)
# add benepar to spaCy pipeline (spaCy v3 style)
nlp.add_pipe("benepar", config={"model": BEP_MODEL})

# sentence-transformers model
embedder = SentenceTransformer(SENT_TRANS_MODEL)

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\igeek\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Load Dataset

In [9]:
import wget as wget
wget.download("https://pilehvar.github.io/wic/package/WiC_dataset.zip")

'WiC_dataset (1).zip'

In [10]:
import zipfile
import os

# Define the path to your zip file and the target directory for extraction
zip_file_path = 'WiC_dataset.zip'
extract_dir = 'WiC_dataset'

# Create the target directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Open the zip file in read mode ('r')
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all contents to the specified directory
    zip_ref.extractall(extract_dir)

print(f"Contents of '{zip_file_path}' extracted to '{extract_dir}'")

Contents of 'WiC_dataset.zip' extracted to 'WiC_dataset'


In [11]:
import pandas as pd

train_df = pd.read_csv("./WiC_dataset/train/train.data.txt", sep="\t", header=None)
train_df.columns = ["lemma", "pos", "index1-index2", "sent_1", "sent_2"]

# clean dataframe
train_df["pos"] = train_df["pos"].apply(lambda x: x.lower())
train_df["tok_idx_1"] = train_df["index1-index2"].apply(lambda x: int(x.split("-")[0]))
train_df["tok_idx_2"] = train_df["index1-index2"].apply(lambda x: int(x.split("-")[1]))
train_df.drop("index1-index2", axis=1, inplace=True)

train_df.head()

Unnamed: 0,lemma,pos,sent_1,sent_2,tok_idx_1,tok_idx_2
0,carry,v,You must carry your camping gear .,Sound carries well over water .,2,1
1,go,v,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,2,6
2,break,v,Break an alibi .,The wholesaler broke the container loads into ...,0,2
3,cup,n,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,8,4
4,academy,n,The Academy of Music .,The French Academy .,1,2


# Main

In [7]:
embedding_dict = dict()
for i in [1, 2]:
    embedding_dict.update({
        f"branch_tuple_{i}": list(),
        f"branch_text_{i}": list(),
        f"branch_embedding_{i}": list(),
    })

for i, row in train_df[:200].iterrows():
    
    for j in [1, 2]:
        doc = nlp(row[f"sent_{j}"])
        tok = doc[row[f"tok_idx_{j}"]]

        tok_lefts = [t.text for t in tok.lefts]
        tok_txt = [tok.text]
        tok_rights = [t.text for t in tok.rights]
        reduced_text = " ".join(tok_lefts + tok_txt + tok_rights)
        reduced_branch = (tuple(tok_lefts), tok.text, tuple(tok_rights))

        # store in dataframe
        embedding_dict[f"branch_tuple_{j}"].append(reduced_branch)
        embedding_dict[f"branch_text_{j}"].append(reduced_text)
        # embed and convert numpy array to list for safe JSON/csv storage; you can keep numpy if you prefer
        embedding_dict[f"branch_embedding_{j}"].append(embedder.encode(reduced_text).tolist())

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [12]:
embedding_df = pd.DataFrame.from_dict(embedding_dict)
train_df = pd.concat([train_df, embedding_df], axis=1)
train_df.head()

Unnamed: 0,lemma,pos,sent_1,sent_2,tok_idx_1,tok_idx_2,branch_tuple_1,branch_text_1,branch_embedding_1,branch_tuple_2,branch_text_2,branch_embedding_2
0,carry,v,You must carry your camping gear .,Sound carries well over water .,2,1,"((You, must), carry, (gear, .))",You must carry gear .,"[-0.012893946841359138, 0.01587832346558571, 0...","((Sound,), carries, (over, .))",Sound carries over .,"[0.053058475255966187, -0.03330995887517929, 0..."
1,go,v,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,2,6,"((Messages, must), go, (through, .))",Messages must go through .,"[0.039575908333063126, -0.0436822883784771, 0....","((sofa, will), go, (through,))",sofa will go through,"[0.09874849766492844, -0.051114026457071304, -..."
2,break,v,Break an alibi .,The wholesaler broke the container loads into ...,0,2,"((), Break, (alibi, .))",Break alibi .,"[0.014493840746581554, 0.07792778313159943, -0...","((wholesaler,), broke, (loads, into, .))",wholesaler broke loads into .,"[-0.0806029811501503, -0.0019472605781629682, ..."
3,cup,n,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,8,4,"((a, metal), cup, ())",a metal cup,"[-0.02742213010787964, 0.07205650210380554, 0....","((the, waxen), cups, ())",the waxen cups,"[-0.01649600639939308, 0.02845943346619606, 0...."
4,academy,n,The Academy of Music .,The French Academy .,1,2,"((The,), Academy, (of, .))",The Academy of .,"[-0.07104917615652084, -0.016971737146377563, ...","((The, French), Academy, (.,))",The French Academy .,"[-0.042090386152267456, 0.0012605104129761457,..."
