In [1]:
import os
import zipfile
from functools import partial
from pathlib import Path

import kaggle
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def download_dataset():
    """Download and unpack the competition data into the working directory.

    Steps:
      1. Restrict the Kaggle API token file to owner read/write, if the file
         exists.
      2. Download the ``us-patent-phrase-to-phrase-matching`` competition
         archive through the Kaggle API client.
      3. Extract ``us-patent-phrase-to-phrase-matching.zip`` into a directory
         of the same name.

    Returns:
        None. The function is used for its filesystem side effects.
    """
    # Resolve the token location relative to the current user instead of one
    # machine's absolute home directory. KAGGLE_CONFIG_DIR is the override the
    # Kaggle client documents for a non-default config location.
    config_dir = Path(os.environ.get("KAGGLE_CONFIG_DIR", Path.home() / ".kaggle"))
    cred_path = config_dir / "kaggle.json"
    if cred_path.exists():
        # Only tighten permissions when the file is there; an absent file
        # previously aborted the whole download with FileNotFoundError.
        cred_path.chmod(0o600)
    path = Path('us-patent-phrase-to-phrase-matching')

    kaggle.api.competition_download_cli(str(path))
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(f"{path}.zip") as archive:
        archive.extractall(path)
    

In [3]:
# Fetch and unpack the competition files before anything tries to read them.
download_dataset()

us-patent-phrase-to-phrase-matching.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
# Load the training split and summarise its object-dtype (string) columns.
df = pd.read_csv("./us-patent-phrase-to-phrase-matching/train.csv")
df.describe(include="object")

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,8d135da0b55b8c88,component composite coating,composition,H01
freq,1,152,24,2186


In [5]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [6]:
# Preview the last rows of the training frame.
df.tail()

Unnamed: 0,id,anchor,target,context,score
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.0
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.5
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.5
36471,756ec035e694722b,wood article,wooden material,B44,0.75
36472,8d135da0b55b8c88,wood article,wooden substrate,B44,0.5


In [7]:
# (rows, columns) of the training frame.
df.shape

(36473, 5)

In [8]:
def transform(data):
    """Return a copy of ``data`` with the combined ``input`` text column added.

    The new column concatenates three existing columns into one string per row:
    ``"TEXT1: <context>; TEXT2: <target>; ANC1: <anchor>"``.

    The caller's frame is left untouched (previously the column was written
    into the argument in place, silently changing a frame that earlier cells
    had already displayed).

    Args:
        data: DataFrame providing ``context``, ``target`` and ``anchor``
            columns.

    Returns:
        A new DataFrame with every original column plus ``input``.

    Raises:
        KeyError: If one of the three source columns is missing.
    """
    prompt = (
        "TEXT1: " + data["context"]
        + "; TEXT2: " + data["target"]
        + "; ANC1: " + data["anchor"]
    )
    # assign() builds a new frame rather than writing into the argument.
    return data.assign(input=prompt)

In [9]:
# Add the combined "input" text column, then wrap the frame as a Dataset.
df = transform(df)
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [10]:
# Checkpoint name; used for the tokenizer here and for the model loaded later.
model_name = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_name)



In [11]:
def tok_func(tokz, x):
    """Apply the tokenizer ``tokz`` to the ``"input"`` entry of ``x``.

    Args:
        tokz: Callable invoked with the text (or batch of texts).
        x: Mapping that holds the text under the ``"input"`` key.

    Returns:
        Whatever ``tokz`` returns for ``x["input"]``.
    """
    text = x["input"]
    return tokz(text)

In [12]:
# Bind the tokenizer so the mapped callable only needs the batch argument.
tok_func_partial = partial(tok_func, tokz)

In [13]:
# Tokenize the "input" column in batches; tokenizer outputs become new columns.
tok_ds = ds.map(tok_func_partial, batched=True)

Map: 100%|██████████| 36473/36473 [00:02<00:00, 18097.66 examples/s]


In [14]:
# Rename the target column to "labels" (presumably the name the Trainer/model
# looks for — confirm). Guarded so this cell can be re-run safely: once "score"
# has been renamed, an unconditional second rename raises.
if "score" in tok_ds.column_names:
    tok_ds = tok_ds.rename_columns({"score": "labels"})

In [15]:
# Load the competition's test.csv.
eval_df = pd.read_csv("us-patent-phrase-to-phrase-matching/test.csv")


In [16]:
# Hold out a quarter of the tokenized training data; the seed fixes the split.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [17]:
# Give the test frame the same "input" column and tokenization as the training data.
eval_df = transform(eval_df)
eval_ds = Dataset.from_pandas(eval_df)
eval_ds = eval_ds.map(tok_func_partial, batched=True)

Map: 100%|██████████| 36/36 [00:00<00:00, 3619.07 examples/s]


In [18]:
from transformers import TrainingArguments, Trainer

In [21]:
# Tunable hyperparameters, kept together so they are easy to find and change.
bs, epochs, lr = 128, 4, 8e-5

args = TrainingArguments(
    output_dir="outputs",
    # Optimisation schedule.
    num_train_epochs=epochs,
    learning_rate=lr,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batching and precision; the eval batch is twice the training batch.
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2 * bs,
    fp16=False,
    # Evaluation and reporting.
    evaluation_strategy="epoch",
    report_to="none",
)



In [22]:
def corr_d(pred):
    """Compute the Pearson correlation between predictions and labels.

    Passed to ``Trainer`` as ``compute_metrics``. The previous body delegated
    to a ``corr`` helper that is not defined anywhere in this notebook, so the
    first evaluation on a fresh kernel failed with ``NameError``; the
    correlation is now computed here directly.

    Args:
        pred: An iterable that unpacks into exactly ``(predictions, labels)``.
            Presumably the evaluation-prediction object supplied by
            ``Trainer`` — confirm against the installed transformers version.

    Returns:
        ``{"pearson": r}`` where ``r`` is the Pearson coefficient as a float.
    """
    predictions, labels = pred
    # Flatten both sides so column-shaped (N, 1) model outputs line up with
    # 1-D labels; np.corrcoef would otherwise treat each row as a variable.
    predictions = np.asarray(predictions, dtype=float).reshape(-1)
    labels = np.asarray(labels, dtype=float).reshape(-1)
    return {"pearson": float(np.corrcoef(predictions, labels)[0, 1])}

# Load the checkpoint with a single-output head (num_labels=1).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)
# Train on the "train" split, evaluate on the held-out "test" split with corr_d.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds["train"],
    eval_dataset=dds["test"],
    tokenizer=tokz,
    compute_metrics=corr_d,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [23]:
# Run fine-tuning with the arguments configured above.
trainer.train()

  1%|▏         | 12/856 [01:54<2:09:04,  9.18s/it]

KeyboardInterrupt: 

# FIXME(review): this was an unfinished scratch statement (`eva_df = pd.rea`).
# `pd.rea` is not a pandas attribute, so the cell raised AttributeError under
# Restart & Run All. The test split is already loaded into `eval_df` earlier in
# the notebook; delete this cell unless a second load was really intended.

In [31]:
# Inspect one tokenized example (row index 1).
tok_ds[1]

{'id': '7b9652b17b68b7a4',
 'anchor': 'abatement',
 'target': 'act of abating',
 'context': 'A47',
 'score': 0.75,
 'input': 'TEXT1: A47; TEXT2: act of abating; ANC1: abatement',
 'input_ids': [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  1727,
  265,
  266,
  17441,
  510,
  346,
  23702,
  435,
  294,
  47284,
  2],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

# NOTE(review): repeats the download call made near the top of the notebook;
# presumably a leftover scratch cell — confirm and remove.
download_dataset()