# Import and Config

In [4]:
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
import tensorflow as tf

# Import DisCoFuzz package classes
from discofuzz.constants import *
from discofuzz import (
    LemmaVectorizer,
    FourierPDF,
    FuzzyFourierSetMixin,
    FourierFuzzifier,
    FuzzyFourierTensorTransformer,
    SpacyDependencyComposer
)

In [5]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPU available: {gpus}")

In [6]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
spacy_model = spacy.load("en_core_web_sm")
lemma_vectorizer = LemmaVectorizer()
fuzzifier = FuzzyFourierTensorTransformer()

# npsd-ot seems to do best for the fuzzy intersection+mean composition model
sim_metric = "npsd-ot"

composers = {
    s: SpacyDependencyComposer(
        strategy=s,
        fuzzifier=fuzzifier
    )
    for s in STRATEGIES
}

gloss_vect_fuzzy_cols = {
    s: f"gloss_fuzzy_{s}"
    for s in STRATEGIES
}

In [7]:
sim = tf.keras.losses.CosineSimilarity(
    axis=-1,
    reduction='sum_over_batch_size',
    name='cosine_similarity'
)
def cosine_similarity(a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
  return -1*sim(a, b).numpy()

# Load Dataset

In [8]:
import wget as wget
wget.download("https://pilehvar.github.io/wic/package/WiC_dataset.zip")

'WiC_dataset (1).zip'

In [9]:
import zipfile
import os

# Define the path to your zip file and the target directory for extraction
zip_file_path = 'WiC_dataset.zip'
extract_dir = 'WiC_dataset'

# Create the target directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Open the zip file in read mode ('r')
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all contents to the specified directory
    zip_ref.extractall(extract_dir)

print(f"Contents of '{zip_file_path}' extracted to '{extract_dir}'")

Contents of 'WiC_dataset.zip' extracted to 'WiC_dataset'


In [10]:
import pandas as pd

train_df = pd.read_csv("./WiC_dataset/train/train.data.txt", sep="\t", header=None)
train_df.columns = ["lemma", "pos", "index1-index2", "sent_1", "sent_2"]

# clean dataframe
train_df["pos"] = train_df["pos"].apply(lambda x: x.lower())
train_df["tok_idx_1"] = train_df["index1-index2"].apply(lambda x: int(x.split("-")[0]))
train_df["tok_idx_2"] = train_df["index1-index2"].apply(lambda x: int(x.split("-")[1]))
train_df.drop("index1-index2", axis=1, inplace=True)

train_df.head()

Unnamed: 0,lemma,pos,sent_1,sent_2,tok_idx_1,tok_idx_2
0,carry,v,You must carry your camping gear .,Sound carries well over water .,2,1
1,go,v,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,2,6
2,break,v,Break an alibi .,The wholesaler broke the container loads into ...,0,2
3,cup,n,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,8,4
4,academy,n,The Academy of Music .,The French Academy .,1,2


# Main

In [11]:
def get_branch_tuple_embedding(branch: tuple|spacy.tokens.Token):
    if isinstance(branch, tuple):
        return tuple([get_branch_tuple_embedding(t) for t in branch])
    elif isinstance(branch, spacy.tokens.Token):
        # lemma_vectorizer returns a numpy array, so convert it to tf.Tensor for fuzzification
        current_tok_tens = lemma_vectorizer(branch.lemma_.lower())
        current_tok_tens = tf.convert_to_tensor(current_tok_tens, dtype=tf.float32)
        return fuzzifier.fuzzify(current_tok_tens)

In [None]:
embedding_dict = dict()
for i in [1, 2]:
    embedding_dict.update({
        f"branch_tuple_embedding_{i}": list(),
        f"branch_text_{i}": list(),
        f"branch_text_embedding_{i}": list(),
    })

for i, row in train_df[:200].iterrows():
    
    for j in [1, 2]:
        doc = spacy_model(row[f"sent_{j}"])
        tok = doc[row[f"tok_idx_{j}"]]

        tok_lefts = [t for t in tok.lefts]
        tok_rights = [t for t in tok.rights if not t.is_punct]
        branch = tok_lefts + [tok] + tok_rights

        branch_tuple_embedding = get_branch_tuple_embedding((
            tuple(tok_lefts),
            (tok, ),
            tuple(tok_rights)
        ))
        branch_text = " ".join([t.text for t in branch])
        branch_text_embedding = embedding_model.encode(branch_text).tolist()

        # store in dataframe
        embedding_dict[f"branch_tuple_embedding_{j}"].append(branch_tuple_embedding)
        embedding_dict[f"branch_text_{j}"].append(branch_text)
        embedding_dict[f"branch_text_embedding_{j}"].append(branch_text_embedding)

In [16]:
embedding_df = pd.DataFrame.from_dict(embedding_dict)
train_df = pd.concat([train_df, embedding_df], axis=1)
train_df.head()

Unnamed: 0,lemma,pos,sent_1,sent_2,tok_idx_1,tok_idx_2,branch_tuple_embedding_1,branch_text_1,branch_text_embedding_1,branch_tuple_embedding_2,branch_text_2,branch_text_embedding_2
0,carry,v,You must carry your camping gear .,Sound carries well over water .,2,1,(((tf.Tensor(\n[0.00338854-2.8324742e-03j 0.00...,You must carry gear,"[-0.029455117881298065, 0.014959510415792465, ...",(((tf.Tensor(\n[0.0043277 -6.1928941e-04j 0.00...,Sound carries over,"[0.03301728144288063, -0.04025080427527428, 0...."
1,go,v,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,2,6,(((tf.Tensor(\n[0.0043698 -1.7961758e-05j 0.00...,Messages must go through,"[0.052632834762334824, -0.03701762855052948, 0...",(((tf.Tensor(\n[0.00379175+2.2262512e-03j 0.00...,sofa will go through,"[0.09874849766492844, -0.051114026457071304, -..."
2,break,v,Break an alibi .,The wholesaler broke the container loads into ...,0,2,"((), ((tf.Tensor(\n[0.00409203-1.5697419e-03j ...",Break alibi,"[-0.0074392096139490604, 0.07352712750434875, ...",(((tf.Tensor(\n[0.00377977-2.2476572e-03j 0.00...,wholesaler broke loads into,"[-0.08407348394393921, -0.015558857470750809, ..."
3,cup,n,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,8,4,(((tf.Tensor(\n[0.00426647-9.6678629e-04j 0.00...,a metal cup,"[-0.02742213010787964, 0.07205650210380554, 0....",(((tf.Tensor(\n[0.00419467-1.2535643e-03j 0.00...,the waxen cups,"[-0.01649600639939308, 0.02845943346619606, 0...."
4,academy,n,The Academy of Music .,The French Academy .,1,2,(((tf.Tensor(\n[0.00419467-1.2535643e-03j 0.00...,The Academy of,"[-0.07165632396936417, -0.004963829647749662, ...",(((tf.Tensor(\n[0.00419467-1.2535643e-03j 0.00...,The French Academy,"[-0.05268881469964981, 0.00687433360144496, -0..."
