In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv
/kaggle/input/cpc-codes/titles.csv


In [2]:
# Put a transformers source tree under ../input on the import path before importing it.
# NOTE(review): the file listing printed by the first cell shows no files under an
# input "transformers" directory, so this path entry may have no effect and the
# installed package may be what gets imported — confirm.
import sys
sys.path.append("../input/transformers/src")
import transformers

# Report which transformers version was actually picked up by the import above.
print(f"Transformers version: {transformers.__version__}")

Transformers version: 4.18.0


In [3]:
pip install transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Turn off the Weights & Biases logging integration for the Trainer runs below.
# NOTE(review): the training output reports this environment variable as deprecated
# in favour of the `report_to` training argument.
os.environ["WANDB_DISABLED"] = "true"

In [5]:
class CFG:
    """Run configuration: data/model locations and fine-tuning hyperparameters."""

    # Locations: competition data directory and the pretrained checkpoint name.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    model_path = 'microsoft/deberta-v3-base'

    # Cross-validation: number of folds.
    num_fold = 5

    # Optimisation settings forwarded to TrainingArguments.
    epochs = 5
    batch_size = 16
    learning_rate = 2e-5
    weight_decay = 0.01

In [6]:
# Load the training pairs and the CPC code titles.
train_raw = pd.read_csv(f"{CFG.input_path}train.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')

# Attach the title row matching each training row's context code. validate= makes
# pandas raise if a code occurs more than once in `titles`, which would otherwise
# silently duplicate training rows.
train_df = train_raw.merge(titles, left_on='context', right_on='code', validate='many_to_one')

# The default inner join drops rows whose context has no match; fail loudly instead
# of training on a silently reduced dataset.
assert len(train_df) == len(train_raw), (
    f"{len(train_raw) - len(train_df)} training rows have no matching CPC title"
)

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits, num_bins=5, seed=42):
    """Assign a stratified cross-validation fold id to every row of ``data``.

    The continuous ``score`` column is cut into ``num_bins`` equal-width bins and
    the bin labels (not the raw scores) are used as the stratification target.

    Args:
        data: DataFrame containing a numeric ``score`` column.
        num_splits: Number of folds to create.
        num_bins: Number of score bins used for stratification (default 5).
        seed: ``random_state`` of the shuffled split (default 42).

    Returns:
        A copy of ``data`` with an integer ``fold`` column holding values in
        ``[0, num_splits)``. The frame passed in is not modified.
    """
    # Work on a copy so the caller's frame does not pick up helper columns.
    folded = data.copy()

    # Bin the targets; kept as a standalone Series instead of a temporary column.
    score_bins = pd.cut(folded["score"], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # split() yields positional row indices, so fill a positional array rather than
    # using label-based .loc, which is only correct for a default RangeIndex.
    fold_ids = np.full(len(folded), -1, dtype=np.int64)
    for fold, (_, valid_pos) in enumerate(kf.split(X=folded, y=score_bins.values)):
        fold_ids[valid_pos] = fold

    folded["fold"] = fold_ids
    return folded

In [7]:
# Model input text is "<title> <anchor>"; fold ids are attached afterwards.
train_df = create_folds(
    train_df.assign(input=train_df['title'] + ' ' + train_df['anchor']),
    CFG.num_fold,
)

In [8]:
# Load the tokenizer for the checkpoint named in CFG.model_path. It is read as a
# notebook-level global by TrainDataset.__getitem__ and is also passed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
class TrainDataset(Dataset):
    """Dataset yielding one tokenized (input, target) text pair plus its score.

    Tokenization happens on access, using the notebook-level ``tokenizer``.
    """

    def __init__(self, df):
        # Both text columns are coerced to str arrays; scores keep their stored dtype.
        self.inputs = df['input'].values.astype(str)
        self.targets = df['target'].values.astype(str)
        self.label = df['score'].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        # Encode the two texts as a sentence pair.
        encoded = tokenizer(self.inputs[item], self.targets[item])
        score = self.label[item]
        # Merge the tokenizer fields with the regression target as float32.
        return {**encoded, 'label': score.astype(np.float32)}

In [10]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` is reshaped
            to a 1-D array with one value per example.

    Returns:
        A dict with a single ``'pearson'`` entry.
    """
    raw_preds, labels = eval_pred
    flat_preds = raw_preds.reshape(len(raw_preds))
    # Off-diagonal entry of the 2x2 correlation matrix is the coefficient itself.
    correlation = np.corrcoef(flat_preds, labels)
    return {'pearson': correlation[0][1]}

In [11]:
# Out-of-fold (OOF) training: for every fold, fine-tune a fresh model on the other
# folds, save it, and predict the held-out fold.
checkpoint_dir = "/tmp/uspppm"
oof_frames = []  # one validation frame (with a 'preds' column) per fold
for fold in range(CFG.num_fold):

    tr_data = train_df[train_df['fold']!=fold].reset_index(drop=True)
    va_data = train_df[train_df['fold']==fold].reset_index(drop=True)
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)

    args = TrainingArguments(
        output_dir=checkpoint_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size,
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        metric_for_best_model="pearson",
        load_best_model_at_end=True,
        # Explicitly disable logging integrations instead of relying on the
        # deprecated WANDB_DISABLED environment variable.
        report_to="none",
    )

    # Seed before the model is built so the freshly initialised regression head is
    # reproducible; the Trainer only applies the seed once it is constructed.
    transformers.set_seed(args.seed)
    model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # Persist the final model first, then drop the per-epoch checkpoints so the
    # next fold starts from an empty checkpoint directory.
    trainer.save_model(f"uspppm_{fold}")
    shutil.rmtree(checkpoint_dir)

    outputs = trainer.predict(va_dataset)
    predictions = outputs.predictions.reshape(-1)
    va_data['preds'] = predictions
    oof_frames.append(va_data)

# Concatenate once instead of growing a frame inside the loop; the per-fold index
# of each validation frame is kept as-is.
oof_df = pd.concat(oof_frames) if oof_frames else pd.DataFrame()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Epoch,Training Loss,Validation Loss,Pearson
1,0.0261,0.025272,0.822891
2,0.0184,0.020138,0.842127
3,0.0144,0.019757,0.847506
4,0.0114,0.021683,0.852718
5,0.0094,0.020005,0.856185


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-1824
Configuration saved in /tmp/uspppm/checkpoint-1824/config.json
Model weights saved in /tmp/uspppm/checkpoint-1824/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-1824/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-1824/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-1824/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-3648
Configuration saved in /tmp/uspppm/checkpoint-3648/config.json
Model weights saved in /tmp/uspppm/checkpoint-3648/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-3648/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-3648/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-3648/added_tokens.json
***** Running 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e6f9db57345f0f60c9f837fa97bcb27b1ed31e99feb33d732d7d8c80cb8f8459.de97182a9f32a68819030ba8f3f6ff2ba47276be3864425925523202f54cc79c
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LAB

Epoch,Training Loss,Validation Loss,Pearson
1,0.027,0.022401,0.825186
2,0.0189,0.024039,0.839431
3,0.014,0.022751,0.845142
4,0.0115,0.023379,0.847263
5,0.0091,0.021488,0.849624


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-1824
Configuration saved in /tmp/uspppm/checkpoint-1824/config.json
Model weights saved in /tmp/uspppm/checkpoint-1824/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-1824/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-1824/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-1824/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-3648
Configuration saved in /tmp/uspppm/checkpoint-3648/config.json
Model weights saved in /tmp/uspppm/checkpoint-3648/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-3648/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-3648/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-3648/added_tokens.json
***** Running 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e6f9db57345f0f60c9f837fa97bcb27b1ed31e99feb33d732d7d8c80cb8f8459.de97182a9f32a68819030ba8f3f6ff2ba47276be3864425925523202f54cc79c
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LAB

Epoch,Training Loss,Validation Loss,Pearson
1,0.0275,0.026732,0.822289
2,0.0186,0.021876,0.845285
3,0.014,0.019981,0.851279
4,0.0111,0.019349,0.855336
5,0.0086,0.019769,0.85476


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-1824
Configuration saved in /tmp/uspppm/checkpoint-1824/config.json
Model weights saved in /tmp/uspppm/checkpoint-1824/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-1824/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-1824/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-1824/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-3648
Configuration saved in /tmp/uspppm/checkpoint-3648/config.json
Model weights saved in /tmp/uspppm/checkpoint-3648/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-3648/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-3648/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-3648/added_tokens.json
***** Running 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e6f9db57345f0f60c9f837fa97bcb27b1ed31e99feb33d732d7d8c80cb8f8459.de97182a9f32a68819030ba8f3f6ff2ba47276be3864425925523202f54cc79c
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LAB

Epoch,Training Loss,Validation Loss,Pearson
1,0.027,0.025523,0.826377
2,0.0188,0.020682,0.842683
3,0.0143,0.022045,0.855853
4,0.0108,0.018537,0.857531
5,0.0092,0.019367,0.85854


***** Running Evaluation *****
  Num examples = 7294
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-1824
Configuration saved in /tmp/uspppm/checkpoint-1824/config.json
Model weights saved in /tmp/uspppm/checkpoint-1824/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-1824/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-1824/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-1824/added_tokens.json
***** Running Evaluation *****
  Num examples = 7294
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-3648
Configuration saved in /tmp/uspppm/checkpoint-3648/config.json
Model weights saved in /tmp/uspppm/checkpoint-3648/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-3648/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-3648/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-3648/added_tokens.json
***** Running 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e6f9db57345f0f60c9f837fa97bcb27b1ed31e99feb33d732d7d8c80cb8f8459.de97182a9f32a68819030ba8f3f6ff2ba47276be3864425925523202f54cc79c
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LAB

Epoch,Training Loss,Validation Loss,Pearson
1,0.0264,0.026488,0.818809
2,0.0177,0.021986,0.837338
3,0.0142,0.021073,0.846394
4,0.0107,0.020533,0.851212
5,0.0091,0.019474,0.854555


***** Running Evaluation *****
  Num examples = 7294
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-1824
Configuration saved in /tmp/uspppm/checkpoint-1824/config.json
Model weights saved in /tmp/uspppm/checkpoint-1824/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-1824/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-1824/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-1824/added_tokens.json
***** Running Evaluation *****
  Num examples = 7294
  Batch size = 16
Saving model checkpoint to /tmp/uspppm/checkpoint-3648
Configuration saved in /tmp/uspppm/checkpoint-3648/config.json
Model weights saved in /tmp/uspppm/checkpoint-3648/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-3648/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-3648/special_tokens_map.json
added tokens file saved in /tmp/uspppm/checkpoint-3648/added_tokens.json
***** Running 

In [12]:
# Pearson correlation over all out-of-fold predictions combined.
oof_preds, oof_scores = (oof_df[column].values for column in ('preds', 'score'))
compute_metrics((oof_preds, oof_scores))

{'pearson': 0.8543242077660633}

In [13]:
# Persist the out-of-fold frame (inputs, fold ids and predictions) to the working directory.
# NOTE(review): the index is written too; each fold's frame was reset before being
# concatenated, so the index values repeat across folds.
oof_df.to_csv('oof_df.csv')