In [1]:
import os
import zipfile
import kaggle
from pathlib import Path

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from functools import partial
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def download_dataset(competition="us-patent-phrase-to-phrase-matching", cred_path=None):
    """Download a Kaggle competition archive and extract it next to the notebook.

    Args:
        competition: Competition slug. It is also used as the name of the
            local ``<slug>.zip`` archive and of the extraction directory.
        cred_path: Location of ``kaggle.json``. Defaults to
            ``~/.config/kaggle/kaggle.json`` for the current user instead of a
            path tied to one specific account.

    Returns:
        None. The files are written to ``./<competition>/``.
    """
    if cred_path is None:
        cred_path = Path.home() / ".config" / "kaggle" / "kaggle.json"
    cred_path = Path(cred_path)
    # Tighten permissions only when the file is actually there; credentials
    # may be stored elsewhere, in which case there is nothing to chmod.
    if cred_path.exists():
        cred_path.chmod(0o600)

    path = Path(competition)

    kaggle.api.competition_download_cli(str(path))
    # Context manager guarantees the archive handle is closed after extraction.
    with zipfile.ZipFile(f"{path}.zip") as archive:
        archive.extractall(path)

In [3]:
# Fetch the competition archive and unpack it into the working directory.
download_dataset()

us-patent-phrase-to-phrase-matching.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
# Load train.csv and summarise its object-dtype columns
# (count / unique / top / freq).
df = pd.read_csv("./us-patent-phrase-to-phrase-matching/train.csv")
df.describe(include="object")

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,8d135da0b55b8c88,component composite coating,composition,H01
freq,1,152,24,2186


In [5]:
# Inspect the first rows of the training frame.
df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [6]:
# Inspect the last rows of the training frame.
df.tail()

Unnamed: 0,id,anchor,target,context,score
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.0
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.5
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.5
36471,756ec035e694722b,wood article,wooden material,B44,0.75
36472,8d135da0b55b8c88,wood article,wooden substrate,B44,0.5


In [7]:
# (rows, columns) of the training frame.
df.shape

(36473, 5)

In [8]:
def transform(data):
    """Return a copy of ``data`` with an added ``"input"`` text column.

    The new column concatenates the ``context``, ``target`` and ``anchor``
    columns into one string of the form
    ``"TEXT1: <context>; TEXT2: <target>; ANC1: <anchor>"``.

    Args:
        data: Table-like object (e.g. a pandas DataFrame) providing
            ``"context"``, ``"target"`` and ``"anchor"`` string columns.

    Returns:
        A shallow-independent copy of ``data`` carrying the extra ``"input"``
        column. The object passed in is left unmodified, so re-running the
        calling cell never stacks changes onto an already-displayed frame.
    """
    combined = "TEXT1: " + data["context"] + "; TEXT2: " + data["target"] + "; ANC1: " + data["anchor"]
    # Work on a copy so the caller's frame is not mutated as a side effect.
    result = data.copy()
    result["input"] = combined
    return result

In [9]:
# Add the combined "input" column, then wrap the frame in a `datasets.Dataset`.
df = transform(df)
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [10]:
# Checkpoint name shared by the tokenizer here and the model loaded later.
model_name = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_name)



In [11]:
def tok_func(tokz, x):
    """Apply the tokenizer callable ``tokz`` to the ``"input"`` entry of ``x``.

    ``tokz`` comes first so it can be pre-bound with ``functools.partial``,
    leaving a single-argument function of the record/batch ``x``.
    """
    text = x["input"]
    encoded = tokz(text)
    return encoded

In [12]:
# Pre-bind the tokenizer so the result is a one-argument function usable with `Dataset.map`.
tok_func_partial = partial(tok_func, tokz)

In [13]:
# Tokenize the "input" column in batches; the tokenizer's outputs are added as new columns.
tok_ds = ds.map(tok_func_partial, batched=True)

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Map: 100%|██████████| 36473/36473 [00:01<00:00, 32597.46 examples/s]


In [14]:
# Rename the target column to "labels" (presumably the name the Trainer/model expects — confirm).
tok_ds = tok_ds.rename_columns({"score": "labels"})

In [15]:
# Load test.csv, the rows that predictions are produced for at the end of the notebook.
eval_df = pd.read_csv("us-patent-phrase-to-phrase-matching/test.csv")


In [None]:
# Hold out 25% of the tokenized rows as the "test" split used for validation;
# the fixed seed keeps the split reproducible.
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [None]:
# Apply the same "input" construction and tokenization to the test.csv rows
# as was applied to the training data.
eval_df = transform(eval_df)
eval_ds = Dataset.from_pandas(eval_df).map(tok_func_partial, batched=True)

Map: 100%|██████████| 36/36 [00:00<00:00, 5447.34 examples/s]


In [18]:
from transformers import TrainingArguments, Trainer

In [19]:
# Training hyperparameters.
bs = 128      # per-device train batch size (evaluation uses twice this)
epochs = 4    # number of passes over the training split
lr = 8e-5     # peak learning rate, reached after warmup

# NOTE: the epoch count is controlled solely by `num_train_epochs`.
# `num_epochs` is not a TrainingArguments field and passing it raises
# TypeError, so it has been removed.
args = TrainingArguments(
    "outputs",
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    fp16=True,
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to="wandb"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
def corr_d(pred):
    """Compute the Pearson correlation metric for an evaluation result.

    Args:
        pred: A pair ``(predictions, labels)`` — anything that unpacks into
            exactly those two array-likes.

    Returns:
        ``{"pearson": r}`` where ``r`` is the Pearson correlation coefficient
        between the flattened predictions and the flattened labels.
    """
    predictions, labels = pred
    # Flatten both sides: a column-vector of predictions, shape (N, 1), paired
    # with labels of shape (N,) would otherwise make np.corrcoef fail on
    # mismatched dimensions. For already-1-D inputs this is a no-op.
    flat_predictions = np.ravel(predictions)
    flat_labels = np.ravel(labels)
    return {"pearson": np.corrcoef(flat_predictions, flat_labels)[0][1]}

# A single output label makes the classification head act as a regressor.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)

# Train on the "train" split, evaluate on the held-out "test" split, and
# report the Pearson metric from `corr_d` at each evaluation.
trainer = Trainer(
    model,
    args,
    train_dataset=dds["train"],
    eval_dataset=dds["test"],
    tokenizer=tokz,
    compute_metrics=corr_d,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [24]:
# Run fine-tuning; per-epoch validation loss and Pearson are shown in the output table.
trainer.train()



Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.028079,0.76528
2,No log,0.024313,0.805469
3,No log,0.02344,0.815576
4,No log,0.024376,0.818836




TrainOutput(global_step=216, training_loss=0.031473632212038395, metrics={'train_runtime': 141.0337, 'train_samples_per_second': 775.815, 'train_steps_per_second': 1.532, 'total_flos': 785867506545960.0, 'train_loss': 0.031473632212038395, 'epoch': 4.0})

eva_df = pd.rea

In [47]:
# Predict on the test.csv rows, then clamp into [0, 1] since the raw model
# outputs are not bounded to that interval.
preds = trainer.predict(eval_ds).predictions
preds = np.clip(preds, 0, 1)

In [48]:
# Attach the clipped predictions to the test frame (modifies `eval_df` in place).
eval_df["preds"] = preds

In [49]:
# Display the test rows alongside their predicted scores.
eval_df

Unnamed: 0,id,anchor,target,context,input,preds
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,TEXT1: G02; TEXT2: inorganic photoconductor dr...,0.657263
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,TEXT1: F23; TEXT2: altering gas flow; ANC1: ad...,0.768425
2,36baf228038e314b,lower trunnion,lower locating,B60,TEXT1: B60; TEXT2: lower locating; ANC1: lower...,0.490535
3,1f37ead645e7f0c8,cap component,upper portion,D06,TEXT1: D06; TEXT2: upper portion; ANC1: cap co...,0.375327
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,TEXT1: H04; TEXT2: artificial neural network; ...,0.0
5,474c874d0c07bd21,dry corn,dry corn starch,C12,TEXT1: C12; TEXT2: dry corn starch; ANC1: dry ...,0.562612
6,442c114ed5c4e3c9,tunneling capacitor,capacitor housing,G11,TEXT1: G11; TEXT2: capacitor housing; ANC1: tu...,0.482692
7,b8ae62ea5e1d8bdb,angular contact bearing,contact therapy radiation,B23,TEXT1: B23; TEXT2: contact therapy radiation; ...,0.011487
8,faaddaf8fcba8a3f,produce liquid hydrocarbons,produce a treated stream,C10,TEXT1: C10; TEXT2: produce a treated stream; A...,0.215943
9,ae0262c02566d2ce,diesel fuel tank,diesel fuel tanks,F02,TEXT1: F02; TEXT2: diesel fuel tanks; ANC1: di...,1.0


# NOTE(review): this repeats the download already performed near the top of
# the notebook and nothing after it uses the result — looks like a leftover
# cell; confirm and remove.
download_dataset()