## Analysis and using models from three notebooks

**1.** Deberta v3 large (0.8392)
> [Inference BERT for usPatents](https://www.kaggle.com/code/leehann/inference-bert-for-uspatents)

**2.** Deberta v3 large (0.8338)
> [PPPM / Deberta-v3-large baseline [inference]](https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-inference)

**3.** Roberta-large (0.8143)
> [PatentPhrase RoBERTa Inference](https://www.kaggle.com/code/santhoshkumarv/patentphrase-roberta-inference-lb-0-814)

#### Please upvote the original notebooks!

## UPD: I have an error in my code (Version 1)!

Method merge in model 1 shuffled the dataframe.

```
test = test.merge(titles, left_on='context', right_on='code')
```

So I reseted index, merged, sorted and drop index.

```
test.reset_index(inplace=True)
test = test.merge(titles, left_on='context', right_on='code')
test.sort_values(by='index', inplace=True)
test.drop(columns='index', inplace=True)
```

# 1. Import & Def & Set & Load

In [1]:
!pip uninstall -q -y transformers

[0m

In [2]:
import sys
sys.path.append("../input/torch-components-library/torch-components-main")
sys.path.append("../input/transformers/src")

In [3]:
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch_components import Configuration, Timer, Averager
from torch_components.utils import seed_everything, get_batch, load_checkpoint
from torch.cuda.amp import GradScaler, autocast
from sklearn.model_selection import StratifiedKFold
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import numpy as np
import warnings
import random
import os
import shutil
import gc


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEBUG = False

os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["EXPERIMENT_NAME"] = "none"

        
warnings.simplefilter("ignore")

In [4]:
pathes = Configuration(train="../input/us-patent-phrase-to-phrase-matching/train.csv", 
                       test="../input/us-patent-phrase-to-phrase-matching/test.csv",
                       sample_submission="../input/us-patent-phrase-to-phrase-matching/sample_submission.csv",
                       cpc_codes="../input/cpc-codes/titles.csv")
Configuration
config = Configuration(seed=42,
                       max_length=72,
                       batch_size=24,
                       num_workers=4,
                       pin_memory=True,
                       folds=4,  
                       verbose=250,
                       device=DEVICE,
                       amp=True, 
                       input_directory="../input/deberta-v3-small-anchor-change",
                       debug=True)

seed_everything(config.seed)

42

In [5]:
def create_submission(ids, predictions, path="submission.csv"):
    submission = pd.DataFrame({
        "id": ids,
        "score": predictions,
    })
    
    submission.to_csv(path, index=False)
    return submission

def prediction_loop(loader, 
                    model, 
                    device="cpu", 
                    amp=False, 
                    verbose=1, 
                    time_format="{hours}:{minutes}:{seconds}", 
                    logger="print"):
    
    if device is not None:
        model.to(device)
    
    model.eval()
    outputs = []
    timer = Timer(time_format)
    steps = len(loader)
    
    if logger == "tqdm":
        loader = tqdm(iterable=loader, 
                      total=len(loader),
                      colour="#000",
                      bar_format="{l_bar} {bar} {n_fmt}/{total_fmt} - remain: {remaining}{postfix}")
            
        loader.set_description_str("[Prediction]")
    
    for step, batch in enumerate(loader, 1):
        with torch.no_grad():
            with autocast(enabled=amp):
                batch_outputs = prediction_step(batch=batch, model=model, device=device)
                
            outputs.extend(batch_outputs.to("cpu").numpy())
            
            if logger == "print":
                if step % verbose == 0 or step == steps:
                    elapsed, remain = timer(step/steps)

                    print(f"[Prediction] "
                          f"{step}/{steps} - "
                          f"remain: {remain}")
            
    outputs = torch.tensor(outputs)
    return outputs
def prediction_step(batch, model, device="cpu"):
    input_ids, attention_mask = batch
    
    input_ids = input_ids.to(device).long()
    attention_mask = attention_mask.to(device).long()
    
    outputs = model(input_ids, attention_mask)
    
    return outputs.sigmoid().squeeze()

In [6]:
cpc_codes = pd.read_csv(pathes.cpc_codes)

path = pathes.train if DEBUG else pathes.test 
test = pd.read_csv(path)
test = test.merge(cpc_codes, left_on="context", right_on="code")
test_ids = test["id"].values

sample_submission = pd.read_csv(pathes.sample_submission)

if config.debug:
    display(test)

Unnamed: 0,id,anchor,target,context,code,title,section,class,subclass,group,main_group
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G02,OPTICS,G,2.0,,,
1,5203a36c501f1b7c,generate in layer,generate by layer,G02,G02,OPTICS,G,2.0,,,
2,7aa5908a77a7ec24,el display,illumination,G02,G02,OPTICS,G,2.0,,,
3,09e418c93a776564,adjust gas flow,altering gas flow,F23,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,F,23.0,,,
4,36baf228038e314b,lower trunnion,lower locating,B60,B60,VEHICLES IN GENERAL,B,60.0,,,
5,b892011ab2e2cabc,carry by platform,carry on platform,B60,B60,VEHICLES IN GENERAL,B,60.0,,,
6,1f37ead645e7f0c8,cap component,upper portion,D06,D06,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...,D,6.0,,,
7,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,H04,ELECTRIC COMMUNICATION TECHNIQUE,H,4.0,,,
8,16ae4b99d3601e60,transmit to platform,direct receiving,H04,H04,ELECTRIC COMMUNICATION TECHNIQUE,H,4.0,,,
9,474c874d0c07bd21,dry corn,dry corn starch,C12,C12,BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MI...,C,12.0,,,


In [7]:
cpc_texts = torch.load("../input/folds-dump-the-two-paths-fix/cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']
test['text'] = test['text'].apply(str.lower)
test.head()

Unnamed: 0,id,anchor,target,context,code,title,section,class,subclass,group,main_group,context_text,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,opc drum[sep]inorganic photoconductor drum[sep...
1,5203a36c501f1b7c,generate in layer,generate by layer,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,generate in layer[sep]generate by layer[sep]ph...
2,7aa5908a77a7ec24,el display,illumination,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,el display[sep]illumination[sep]physics. optics
3,09e418c93a776564,adjust gas flow,altering gas flow,F23,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,F,23.0,,,,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...,adjust gas flow[sep]altering gas flow[sep]mech...
4,36baf228038e314b,lower trunnion,lower locating,B60,B60,VEHICLES IN GENERAL,B,60.0,,,,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,lower trunnion[sep]lower locating[sep]performi...


In [8]:
tokenizer_path = os.path.join(config.input_directory, "tokenizer/")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [9]:
class DynamicPadding:
    def __init__(self, tokenizer, max_length=None, padding=True, pad_to_multiple_of=None, return_tensors="pt"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_tensors = return_tensors
    
    def __call__(self, tokenized):
        max_length = max(len(_["input_ids"]) for _ in tokenized)
        max_length = min(max_length, self.max_length) if self.max_length is not None else max_length
                
        padded = self.tokenizer.pad(encoded_inputs=tokenized,
                                    max_length=max_length,
                                    padding=self.padding, 
                                    pad_to_multiple_of=self.pad_to_multiple_of, 
                                    return_tensors=self.return_tensors)
        
        return padded
    
    
    
class Collator:
    def __init__(self, return_targets=True, **kwargs):
        self.dynamic_padding = DynamicPadding(**kwargs)
        self.return_targets = return_targets
    
    def __call__(self, batch):
        all_tokenized, all_targets = [], []
        for sample in batch:
            if self.return_targets:
                tokenized, target = sample
                all_targets.append(target)
            else:
                tokenized = sample
                
            all_tokenized.append(tokenized)
        
        tokenized = self.dynamic_padding(all_tokenized)
        
        input_ids = torch.tensor(tokenized.input_ids)
        attention_mask = torch.tensor(tokenized.attention_mask)
        
        if self.return_targets:
            all_targets = torch.tensor(all_targets)
        
            return input_ids, attention_mask, all_targets
        
        return input_ids, attention_mask

In [10]:
class Dataset:
    def __init__(self, texts, pair_texts, tokenizer, contexts=None, sep=None, targets=None, max_length=128):
        self.texts = texts
        self.pair_texts = pair_texts
        self.contexts = contexts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sep = sep if sep is not None else self.tokenizer.sep_token
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts[index].lower()
        pair_text = self.pair_texts[index].lower()
        
        if self.contexts is not None:
            context = self.contexts[index].lower()
            text = text + self.sep + context
        
        tokenized = self.tokenizer(text=text, 
                                   text_pair=pair_text, 
                                   add_special_tokens=True,
                                   #max_length=self.max_length,
                                   #padding="max_length",
                                   truncation=True,
                                   return_attention_mask=True,
                                   return_token_type_ids=False,
                                   return_offsets_mapping=False)
        
        
        if self.targets is not None:
            target = self.targets[index]
            
            return tokenized, target
            
        return tokenized


In [11]:
collator = Collator(return_targets=False, tokenizer=tokenizer, max_length=config.max_length)

test_dataset = Dataset(texts=test["text"].values, 
                       pair_texts=test["target"].values,
                       contexts=test["title"].values,
                       max_length=config.max_length,
                       sep=tokenizer.sep_token,
                       tokenizer=tokenizer)
    
test_loader = DataLoader(dataset=test_dataset, 
                         batch_size=config.batch_size*2, 
                         num_workers=config.num_workers,
                         pin_memory=config.pin_memory,
                         collate_fn=collator,
                         shuffle=False, 
                         drop_last=False)

print(f"Test Samples: {len(test_dataset)}")


Test Samples: 36


In [12]:
class Model(nn.Module):
    def __init__(self, model_path="../input/deberta-small-real", config_path=None, config_updates={}, reinitialization_layers=0):
        super(Model, self).__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(model_path)
        else:
            self.config = AutoConfig.from_pretrained(config_path)
        
        self.config.output_hidden_states = True
        self.config.update(config_updates)
        
        if config_path is None:
            self.model = AutoModel.from_pretrained(model_path, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
                
                
        self.reinit_layers(n=reinitialization_layers, layers=self.model.encoder.layer, std=self.config.initializer_range)

        self.head = nn.Linear(in_features=self.config.hidden_size, out_features=1)
        self.init_weights(self.head, std=self.config.initializer_range)
    
    
    def reinit_layers(self, layers, n=0, std=0.02):
        if n > 0:
            for layer in layers[-n:]:
                for name, module in layer.named_modules():
                    self.init_weights(module, std=std)
            
            print(f"Reinitializated last {n} layers.")
                
    
    def init_weights(self, module, std=0.02):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    
    def forward(self, input_ids, attention_mask=None):
        transformer_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        features = transformer_outputs.hidden_states[-1]
        features = features[:, 0, :]
        outputs = self.head(features)
        return outputs


In [13]:

oof_predictions = []
for fold in range(1, config.folds + 1):
    print(f"Fold [{fold}/{config.folds}]")
    
    fold_directory = os.path.join(config.input_directory, f"fold_{fold}/")
    model_config_path = os.path.join(fold_directory, "model_config.json")
    model_path = os.path.join(fold_directory, "model.pth")
    checkpoints_directory = os.path.join(fold_directory, "checkpoints/")
    checkpoint_path = os.path.join(checkpoints_directory, "checkpoint.pth")
    
    model = Model(config_path=model_config_path)
    
    fold_checkpoint = load_checkpoint(path=checkpoint_path, 
                                      model=model, 
                                      strict=True, 
                                      ignore_warnings=True)
    
    
    print(f"Loaded checkpoint from '{checkpoint_path}'.")
    
    fold_predictions = prediction_loop(loader=test_loader, 
                                       model=model, 
                                       amp=config.amp, 
                                       device=config.device)
    
    oof_predictions.append(fold_predictions.numpy())
    
    del model, fold_checkpoint, fold_predictions
    torch.cuda.empty_cache()
    gc.collect()
    
    print(end="\n"*3)
    
oof_predictions = np.array(oof_predictions)


Fold [1/4]
Loaded checkpoint from '../input/deberta-v3-small-anchor-change/fold_1/checkpoints/checkpoint.pth'.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Prediction] 1/1 - remain: 0:0:0



Fold [2/4]
Loaded checkpoint from '../input/deberta-v3-small-anchor-change/fold_2/checkpoints/checkpoint.pth'.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Prediction] 1/1 - remain: 0:0:0



Fold [3/4]
Loaded checkpoint from '../input/deberta-v3-small-anchor-change/fold_3/checkpoints/checkpoint.pth'.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Prediction] 1/1 - remain: 0:0:0



Fold [4/4]
Loaded checkpoint from '../input/deberta-v3-small-anchor-change/fold_4/checkpoints/checkpoint.pth'.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Prediction] 1/1 - remain: 0:0:0





In [14]:
from sklearn.preprocessing import MinMaxScaler

In [15]:
min_max_scaler = MinMaxScaler()

In [16]:
small_predictions = np.mean(oof_predictions, axis=0)

small_preds=pd.concat([pd.DataFrame(test_ids),pd.DataFrame(small_predictions)],keys=["ids","score"],axis=1)


In [17]:
config = Configuration(seed=42,
                       max_length=72,
                       batch_size=24,
                       num_workers=4,
                       pin_memory=True,
                       folds=4,  
                       verbose=250,
                       device=DEVICE,
                       amp=True, 
                       input_directory="../input/largeanchor-change",
                       debug=True)

seed_everything(config.seed)

42

In [18]:
cpc_codes = pd.read_csv(pathes.cpc_codes)

path = pathes.train if DEBUG else pathes.test 
test = pd.read_csv(path)
test = test.merge(cpc_codes, left_on="context", right_on="code")
test_ids = test["id"].values

sample_submission = pd.read_csv(pathes.sample_submission)

if config.debug:
    display(test.head())

Unnamed: 0,id,anchor,target,context,code,title,section,class,subclass,group,main_group
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G02,OPTICS,G,2.0,,,
1,5203a36c501f1b7c,generate in layer,generate by layer,G02,G02,OPTICS,G,2.0,,,
2,7aa5908a77a7ec24,el display,illumination,G02,G02,OPTICS,G,2.0,,,
3,09e418c93a776564,adjust gas flow,altering gas flow,F23,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,F,23.0,,,
4,36baf228038e314b,lower trunnion,lower locating,B60,B60,VEHICLES IN GENERAL,B,60.0,,,


In [19]:
cpc_texts = torch.load("../input/folds-dump-the-two-paths-fix/cpc_texts.pth")
test['context_text'] = test['context'].map(cpc_texts)
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']
test['text'] = test['text'].apply(str.lower)
test.head()

Unnamed: 0,id,anchor,target,context,code,title,section,class,subclass,group,main_group,context_text,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,opc drum[sep]inorganic photoconductor drum[sep...
1,5203a36c501f1b7c,generate in layer,generate by layer,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,generate in layer[sep]generate by layer[sep]ph...
2,7aa5908a77a7ec24,el display,illumination,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,el display[sep]illumination[sep]physics. optics
3,09e418c93a776564,adjust gas flow,altering gas flow,F23,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,F,23.0,,,,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...,adjust gas flow[sep]altering gas flow[sep]mech...
4,36baf228038e314b,lower trunnion,lower locating,B60,B60,VEHICLES IN GENERAL,B,60.0,,,,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,lower trunnion[sep]lower locating[sep]performi...


In [20]:
tokenizer_path = os.path.join(config.input_directory, "tokenizer/")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [21]:
collator = Collator(return_targets=False, tokenizer=tokenizer, max_length=config.max_length)

test_dataset = Dataset(texts=test["text"].values, 
                       pair_texts=test["target"].values,
                       contexts=test["title"].values,
                       max_length=config.max_length,
                       sep=tokenizer.sep_token,
                       tokenizer=tokenizer)
    
test_loader = DataLoader(dataset=test_dataset, 
                         batch_size=config.batch_size*2, 
                         num_workers=config.num_workers,
                         pin_memory=config.pin_memory,
                         collate_fn=collator,
                         shuffle=False, 
                         drop_last=False)

print(f"Test Samples: {len(test_dataset)}")

Test Samples: 36


In [22]:
oof_predictions = []
for fold in range(1, config.folds + 1):
    print(f"Fold [{fold}/{config.folds}]")
    
    fold_directory = os.path.join(config.input_directory, f"fold_{fold}/")
    model_config_path = os.path.join(fold_directory, "model_config.json")
    model_path = os.path.join(fold_directory, "model.pth")
    checkpoints_directory = os.path.join(fold_directory, "checkpoints/")
    checkpoint_path = os.path.join(checkpoints_directory, "checkpoint.pth")
    
    model = Model(config_path=model_config_path)
    
    fold_checkpoint = load_checkpoint(path=checkpoint_path, 
                                      model=model, 
                                      strict=True, 
                                      ignore_warnings=True)
    
    
    print(f"Loaded checkpoint from '{checkpoint_path}'.")
    
    fold_predictions = prediction_loop(loader=test_loader, 
                                       model=model, 
                                       amp=config.amp, 
                                       device=config.device)
    
    oof_predictions.append(fold_predictions.numpy())
    
    del model, fold_checkpoint, fold_predictions
    torch.cuda.empty_cache()
    gc.collect()
    
    print(end="\n"*3)
    
oof_predictions = np.array(oof_predictions)


Fold [1/4]
Loaded checkpoint from '../input/largeanchor-change/fold_1/checkpoints/checkpoint.pth'.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Prediction] 1/1 - remain: 0:0:0



Fold [2/4]
Loaded checkpoint from '../input/largeanchor-change/fold_2/checkpoints/checkpoint.pth'.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Prediction] 1/1 - remain: 0:0:0



Fold [3/4]
Loaded checkpoint from '../input/largeanchor-change/fold_3/checkpoints/checkpoint.pth'.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Prediction] 1/1 - remain: 0:0:0



Fold [4/4]
Loaded checkpoint from '../input/largeanchor-change/fold_4/checkpoints/checkpoint.pth'.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Prediction] 1/1 - remain: 0:0:0





In [23]:
large_predictions = np.mean(oof_predictions, axis=0)

large_preds=pd.concat([pd.DataFrame(test_ids),pd.DataFrame(large_predictions)],keys=["ids","score"],axis=1)

In [24]:
import os
import gc
import random

import numpy as np
import pandas as pd

import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

from dataclasses import dataclass

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel

import warnings 
warnings.filterwarnings('ignore')

In [25]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True    
    torch.backends.cudnn.benchmark = False

    
def inference_fn(test_loader, model, device, is_sigmoid=True):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
            
        with torch.no_grad():
            output = model(inputs)
        
        if is_sigmoid == True:
            preds.append(output.sigmoid().to('cpu').numpy())
        else:
            preds.append(output.to('cpu').numpy())

    return np.concatenate(preds)    
    

def upd_outputs(data, is_trim=False, is_minmax=False, is_reshape=False):
    min_max_scaler = MinMaxScaler()
    
    if is_trim == True:
        data = np.where(data <=0, 0, data)
        data = np.where(data >=1, 1, data)

    if is_minmax ==True:
        data = min_max_scaler.fit_transform(data)
    
    if is_reshape == True:
        data = data.reshape(-1)
        
    return data


In [26]:
pd.set_option('display.precision', 4)
cm = sns.light_palette('green', as_cmap=True)
props_param = "color:white; font-weight:bold; background-color:green;"

CUSTOM_SEED = 42
CUSTOM_BATCH = 24
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [27]:
competition_dir = "../input/us-patent-phrase-to-phrase-matching/"

submission = pd.read_csv(competition_dir+'sample_submission.csv')
test_origin = pd.read_csv(competition_dir+'test.csv')
test_origin.head()

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04


# 2. Extract predictions

## 2.1 Deberta v3 large - 1

In [28]:
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           max_length=cfg.max_len,
                           padding="max_length",
                           truncation=True)
    
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
        
    return inputs

class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg        
        self.text = df['text'].values
        
    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.text[item])
        
        return inputs
   
    
class CustomModel(nn.Module):
    def __init__(self, model_path):
        super().__init__()
        
        config = AutoConfig.from_pretrained(model_path)
        config.num_labels = 1
        self.base = AutoModelForSequenceClassification.from_config(config=config)
        dim = config.hidden_size
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,1)
        
    def forward(self, inputs):
        output = self.base(**inputs)

        return output[0]

In [29]:
seed_everything(CUSTOM_SEED)

In [30]:
class CFG:
    model_path='../input/deberta-v3-large/deberta-v3-large'
    batch_size=CUSTOM_BATCH
    num_workers=2
    max_len=130
    trn_fold=[0, 1, 2, 3]

CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

context_mapping = torch.load("../input/folds-dump-the-two-paths-fix/cpc_texts.pth")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
test = test_origin.copy()
titles = pd.read_csv('../input/cpc-codes/titles.csv')

test.reset_index(inplace=True)
test = test.merge(titles, left_on='context', right_on='code')
test.sort_values(by='index', inplace=True)
test.drop(columns='index', inplace=True)

test['context_text'] = test['context'].map(context_mapping)
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']
test['text'] = test['text'].apply(str.lower)

test.head()

Unnamed: 0,id,anchor,target,context,code,title,section,class,subclass,group,main_group,context_text,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G02,OPTICS,G,2.0,,,,PHYSICS. OPTICS,opc drum[sep]inorganic photoconductor drum[sep...
3,09e418c93a776564,adjust gas flow,altering gas flow,F23,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,F,23.0,,,,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...,adjust gas flow[sep]altering gas flow[sep]mech...
4,36baf228038e314b,lower trunnion,lower locating,B60,B60,VEHICLES IN GENERAL,B,60.0,,,,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,lower trunnion[sep]lower locating[sep]performi...
6,1f37ead645e7f0c8,cap component,upper portion,D06,D06,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...,D,6.0,,,,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...,cap component[sep]upper portion[sep]textiles; ...
7,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,H04,ELECTRIC COMMUNICATION TECHNIQUE,H,4.0,,,,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE,neural stimulation[sep]artificial neural netwo...


In [32]:
deberta_predicts_1 = []

test_dataset = TestDataset(CFG, test)
test_dataloader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size, shuffle=False,
                             num_workers=CFG.num_workers,
                             pin_memory=True, drop_last=False)

deberta_simple_path = "../input/us-patent-deberta-simple/microsoft_deberta-v3-large"

for fold in CFG.trn_fold:
    fold_path = f"{deberta_simple_path}_best{fold}.pth"
    
    model = CustomModel(CFG.model_path)    
    state = torch.load(fold_path, map_location=torch.device('cpu'))  # DEVICE
    model.load_state_dict(state['model'])
    
    prediction = inference_fn(test_dataloader, model, DEVICE, is_sigmoid=False)
    
    deberta_predicts_1.append(prediction)
    
    del model, state, prediction
    torch.cuda.empty_cache()
    gc.collect()

100%|██████████| 2/2 [00:00<00:00,  2.12it/s]
100%|██████████| 2/2 [00:01<00:00,  1.98it/s]
100%|██████████| 2/2 [00:00<00:00,  2.08it/s]
100%|██████████| 2/2 [00:01<00:00,  1.74it/s]


In [33]:
# -------------- inference_fn([...], is_sigmoid=False)
deberta_predicts_1 = [upd_outputs(x, is_minmax=True, is_reshape=True) for x in deberta_predicts_1]
deberta_predicts_1 = pd.DataFrame(deberta_predicts_1).T

deberta_predicts_1.head(10).style.background_gradient(cmap=cm, axis=1)

Unnamed: 0,0,1,2,3
0,0.551,0.2487,0.239,0.267
1,0.5259,0.76,0.4775,0.6383
2,0.5402,0.4998,0.5037,0.5131
3,0.2696,0.4241,0.2227,0.2377
4,0.4514,0.0021,0.0007,0.0009
5,0.549,0.5047,0.4754,0.4692
6,0.564,0.5154,0.2642,0.4961
7,0.0071,0.0001,0.0067,0.0004
8,0.2316,0.2633,0.2773,0.243
9,1.0,0.9948,1.0,1.0


In [34]:
del test, test_dataset
gc.collect()

46

## 2.2 Deberta v3 large - 2

In [35]:
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
        
    return inputs


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        return inputs

    
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
            
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        
        return output

In [36]:
seed_everything(CUSTOM_SEED)

In [37]:
class CFG:
    num_workers=2
    path="../input/pppm-deberta-v3-large-baseline-w-w-b-train/"
    config_path=path+'config.pth'
    model="microsoft/deberta-v3-large"
    batch_size=CUSTOM_BATCH
    fc_dropout=0.2
    target_size=1
    max_len=133
    trn_fold=[0, 1, 2, 3]
    
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

context_mapping = torch.load(CFG.path+"cpc_texts.pth")

In [38]:
test = test_origin.copy()

test['context_text'] = test['context'].map(context_mapping)
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']

test.head()

Unnamed: 0,id,anchor,target,context,context_text,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,PHYSICS. OPTICS,opc drum[SEP]inorganic photoconductor drum[SEP...
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...,adjust gas flow[SEP]altering gas flow[SEP]MECH...
2,36baf228038e314b,lower trunnion,lower locating,B60,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,lower trunnion[SEP]lower locating[SEP]PERFORMI...
3,1f37ead645e7f0c8,cap component,upper portion,D06,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...,cap component[SEP]upper portion[SEP]TEXTILES; ...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE,neural stimulation[SEP]artificial neural netwo...


In [39]:
deberta_predicts_2 = []

test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

folds_path = CFG.path + f"{CFG.model.replace('/', '-')}"

for fold in CFG.trn_fold:
    fold_path = f"{folds_path}_fold{fold}_best.pth"
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(fold_path, map_location=torch.device('cpu'))  # DEVICE
    model.load_state_dict(state['model'])
    
    prediction = inference_fn(test_loader, model, DEVICE)
    deberta_predicts_2.append(prediction)
    
    del model, state, prediction
    torch.cuda.empty_cache()
    gc.collect()

100%|██████████| 2/2 [00:01<00:00,  1.99it/s]
100%|██████████| 2/2 [00:01<00:00,  1.95it/s]
100%|██████████| 2/2 [00:00<00:00,  2.06it/s]
100%|██████████| 2/2 [00:01<00:00,  1.96it/s]


In [40]:
deberta_predicts_2 = [upd_outputs(x, is_reshape=True) for x in deberta_predicts_2]
deberta_predicts_2 = pd.DataFrame(deberta_predicts_2).T

deberta_predicts_2.head(10).style.background_gradient(cmap=cm, axis=1)

Unnamed: 0,0,1,2,3
0,0.5574,0.4311,0.68,0.5707
1,0.7588,0.7719,0.7123,0.6524
2,0.5809,0.4549,0.4873,0.5167
3,0.2177,0.2776,0.2532,0.2429
4,0.0006,0.0003,0.0003,0.0006
5,0.5011,0.4691,0.5171,0.443
6,0.4501,0.3675,0.5017,0.5144
7,0.0001,0.0002,0.0002,0.0001
8,0.3764,0.3466,0.2944,0.2838
9,0.9994,0.999,0.9991,0.9986


In [41]:
del test, test_dataset
gc.collect()

47

## 2.3. Roberta-large

In [42]:
def prepare_input(cfg, text, target):
    inputs = cfg.tokenizer(text, target,
                           padding="max_length",
                           max_length=cfg.max_len,
                           truncation=True)

    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
        
    return inputs


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.target = df['target'].values
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        target = self.target[item]
        
        inputs = prepare_input(self.cfg, text, target)
        
        return inputs

    
class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7

        config = AutoConfig.from_pretrained(CFG.config_path)

        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": hidden_dropout_prob,
                       "layer_norm_eps": layer_norm_eps,
                       "add_pooling_layer": False})
        
        self.transformer = AutoModel.from_pretrained(CFG.config_path, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(config.hidden_size, CFG.num_targets)
        
    def forward(self, inputs):
        transformer_out = self.transformer(**inputs)
        last_hidden_states = transformer_out[0]
        last_hidden_states = self.dropout(torch.mean(last_hidden_states, 1))
        logits1 = self.output(self.dropout1(last_hidden_states))
        logits2 = self.output(self.dropout2(last_hidden_states))
        logits3 = self.output(self.dropout3(last_hidden_states))
        logits4 = self.output(self.dropout4(last_hidden_states))
        logits5 = self.output(self.dropout5(last_hidden_states))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        
        return logits

In [43]:
seed_everything(CUSTOM_SEED)

In [44]:
@dataclass(frozen=True)
class CFG:
    num_workers=2
    config_path='../input/robertalarge'
    model_path='../input/phrase-matching-roberta-training-pytorch-wandb'
    model_name='roberta-large'
    batch_size=CUSTOM_BATCH
    max_len=128
    num_targets=1
    trn_fold=[0, 1, 2, 3, 4]
    tokenizer=AutoTokenizer.from_pretrained('../input/robertalarge')

context_mapping = {
        "A": "Human Necessities",
        "B": "Operations and Transport",
        "C": "Chemistry and Metallurgy",
        "D": "Textiles",
        "E": "Fixed Constructions",
        "F": "Mechanical Engineering",
        "G": "Physics",
        "H": "Electricity",
        "Y": "Emerging Cross-Sectional Technologies",
}

In [45]:
test = test_origin.copy()

test['context_text'] = test['context'].str.slice(stop=1).map(context_mapping)
test['text'] = test['context_text'] + ' ' + test['anchor']

In [46]:
test.head()

Unnamed: 0,id,anchor,target,context,context_text,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,Physics,Physics opc drum
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,Mechanical Engineering,Mechanical Engineering adjust gas flow
2,36baf228038e314b,lower trunnion,lower locating,B60,Operations and Transport,Operations and Transport lower trunnion
3,1f37ead645e7f0c8,cap component,upper portion,D06,Textiles,Textiles cap component
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,Electricity,Electricity neural stimulation


In [47]:
roberta_predicts = []

test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

folds_path = CFG.model_path + f"/{CFG.model_name.replace('-','_')}"

for fold in CFG.trn_fold:
    fold_path = f"{folds_path}_patent_model_{fold}.pth"
    
    model = CustomModel()
    state = torch.load(fold_path, map_location=torch.device('cpu'))  # DEVICE
    model.load_state_dict(state)

    prediction = inference_fn(test_loader, model, DEVICE)
    roberta_predicts.append(prediction)
    
    del model, state, prediction
    torch.cuda.empty_cache()    
    gc.collect()

Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2/2 [00:00<00:00,  2.51it/s]
Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.de

In [48]:
roberta_predicts = [upd_outputs(x, is_reshape=True) for x in roberta_predicts]
roberta_predicts = pd.DataFrame(roberta_predicts).T

roberta_predicts.head(10).style.background_gradient(cmap=cm, axis=1)

Unnamed: 0,0,1,2,3,4
0,0.5383,0.5301,0.5678,0.5301,0.7397
1,0.763,0.8501,0.7076,0.7278,0.6489
2,0.4582,0.1184,0.4592,0.4341,0.5646
3,0.283,0.2947,0.3029,0.2711,0.2829
4,0.121,0.0044,0.0006,0.001,0.006
5,0.5122,0.4771,0.4879,0.535,0.461
6,0.4674,0.4594,0.497,0.476,0.5252
7,0.0003,0.0008,0.0002,0.0005,0.0004
8,0.2505,0.361,0.2793,0.2819,0.2967
9,0.9975,0.9969,0.9971,0.9987,0.9973


In [49]:
del test, test_dataset
gc.collect()

90

# 3. Comparison / Ensemble

In [50]:
deberta_predicts_1

Unnamed: 0,0,1,2,3
0,0.551,0.2487,0.239,0.267
1,0.5259,0.76,0.4775,0.6383
2,0.5402,0.4998,0.5037,0.5131
3,0.2696,0.4241,0.2227,0.2377
4,0.4514,0.0021,0.0007,0.0009
5,0.549,0.5047,0.4754,0.4692
6,0.564,0.5154,0.2642,0.4961
7,0.0071,0.0001,0.0067,0.0004
8,0.2316,0.2633,0.2773,0.243
9,1.0,0.9948,1.0,1.0


In [51]:
all_mean = pd.DataFrame({
    'deberta 1': deberta_predicts_1.mean(axis=1),
    'deberta 2': deberta_predicts_2.mean(axis=1),
    'roberta': roberta_predicts.mean(axis=1),
    'deberta small': small_preds.mean(axis=1),
    'deberta large': large_preds.mean(axis=1)
})

all_mean.head(10) \
    .assign(mean=lambda x: x.mean(axis=1)) \
        .style.highlight_max(axis=1, props=props_param)

Unnamed: 0,deberta 1,deberta 2,roberta,deberta small,deberta large,mean
0,0.3264,0.5598,0.5812,0.5151,0.3562,0.4678
1,0.6004,0.7238,0.7395,0.7949,0.8633,0.7444
2,0.5142,0.51,0.4069,0.3936,0.3245,0.4298
3,0.2885,0.2478,0.2869,0.7461,0.7246,0.4588
4,0.1138,0.0005,0.0266,0.5405,0.4692,0.2301
5,0.4996,0.4826,0.4946,0.6357,0.6074,0.544
6,0.4599,0.4584,0.485,0.2666,0.2399,0.382
7,0.0036,0.0001,0.0004,0.0048,0.0022,0.0022
8,0.2538,0.3253,0.2939,0.2205,0.2261,0.2639
9,0.9987,0.999,0.9975,0.5581,0.5024,0.8112


In [52]:
# === N1 ===
# weights_ = [0.33, 0.33, 0.33]
# final_predictions = all_mean.mul(weights_).sum(axis=1)

# === N2 ===
# final_predictions = all_mean.median(axis=1)
final_predictions = all_mean.mean(axis=1)

# === N3 ===
# final_predictions = all_predictions.mean(axis=1)

# === N4 ===
# combs = pd.DataFrame({
#     'deberta_1': deberta_predicts_1.mean(axis=1),
#     'deb_2+rob': (deberta_predicts_2.mean(axis=1) * 0.666) \
#                     + (roberta_predicts.mean(axis=1) * 0.333)
# })
# display(combs.head())
# final_predictions = combs.median(axis=1)
# final_predictions = combs.mean(axis=1)

final_predictions.head()

0    0.4678
1    0.7444
2    0.4298
3    0.4588
4    0.2301
dtype: float32

# 4. Submission

In [53]:
submission = pd.DataFrame({
    'id': test_origin['id'],
    'score': final_predictions,
})

submission.head(14)

Unnamed: 0,id,score
0,4112d61851461f60,0.4678
1,09e418c93a776564,0.7444
2,36baf228038e314b,0.4298
3,1f37ead645e7f0c8,0.4588
4,71a5b6ad068d531f,0.2301
5,474c874d0c07bd21,0.544
6,442c114ed5c4e3c9,0.382
7,b8ae62ea5e1d8bdb,0.0022
8,faaddaf8fcba8a3f,0.2639
9,ae0262c02566d2ce,0.8112


In [54]:
# ===================  Baseline
# 0  4112d61851461f60  0.56127
# 1  09e418c93a776564  0.72176
# 2  36baf228038e314b  0.47086
# 3  1f37ead645e7f0c8  0.25826
# 4  71a5b6ad068d531f  0.00908
# 5  474c874d0c07bd21  0.48173

In [55]:
submission.to_csv('submission.csv', index=False)