# Libraries

In [None]:
# Remove the preinstalled `transformers` package (quietly, without prompting).
# NOTE(review): presumably this is so that the later `import transformers`
# resolves to "../input/transformers/src", which is appended to sys.path in
# the next cell — confirm.
!pip uninstall -q -y transformers

In [None]:
import sys
sys.path.append("../input/torch-components-library/torch-components-main")
sys.path.append("../input/transformers/src")

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch_components import Configuration, Timer, Averager
from torch_components.utils import seed_everything, get_batch, load_checkpoint
from torch.cuda.amp import GradScaler, autocast
from sklearn.model_selection import StratifiedKFold
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import numpy as np
import warnings
import random
import os
import shutil
import gc


# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Module-level debug switch (selects which CSV is loaded as `test` later on).
DEBUG = False

# Process-wide environment settings.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "true",
    "EXPERIMENT_NAME": "none",
})

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
# Input file locations, grouped in one `Configuration` object and read by
# attribute in later cells (`pathes.train`, `pathes.test`,
# `pathes.sample_submission`, `pathes.cpc_codes`).
pathes = Configuration(train="../input/us-patent-phrase-to-phrase-matching/train.csv", 
                       test="../input/us-patent-phrase-to-phrase-matching/test.csv",
                       sample_submission="../input/us-patent-phrase-to-phrase-matching/sample_submission.csv",
                       cpc_codes="../input/cpc-codes/titles.csv")

# Configuration

In [None]:
# Inference settings, as they are used further down in this file:
#   seed            - passed to `seed_everything` right below.
#   max_length      - forwarded to `Collator` and `Dataset`.
#   batch_size      - the test loader uses twice this value.
#   num_workers,
#   pin_memory      - forwarded to the test `DataLoader`.
#   folds           - number of `fold_<k>/` checkpoints that get ensembled.
#   verbose         - not read anywhere in the visible code.
#   device, amp     - forwarded to `prediction_loop`.
#   input_directory - root of the tokenizer and per-fold checkpoint folders.
#   debug           - gates `display(test)`; note this is independent of the
#                     module-level `DEBUG` flag, which picks the input CSV.
config = Configuration(seed=42,
                       max_length=72,
                       batch_size=24,
                       num_workers=4,
                       pin_memory=True,
                       folds=4,  
                       verbose=250,
                       device=DEVICE,
                       amp=True, 
                       input_directory="../input/deberta-large-anchor-change",
                       debug=True)

# Seed the RNGs through the torch_components helper.
seed_everything(config.seed)

# Utilities

In [None]:
def create_submission(ids, predictions, path="submission.csv"):
    """Build the two-column submission table, save it as CSV and return it.

    Args:
        ids: Values for the ``id`` column.
        predictions: Values for the ``score`` column, aligned with ``ids``.
        path: Destination of the CSV file (written without the index).

    Returns:
        The ``pandas.DataFrame`` that was written.
    """
    columns = {"id": ids, "score": predictions}
    frame = pd.DataFrame(columns)
    frame.to_csv(path, index=False)
    return frame

def prediction_loop(loader, 
                    model, 
                    device="cpu", 
                    amp=False, 
                    verbose=1, 
                    time_format="{hours}:{minutes}:{seconds}", 
                    logger="print"):
    """Run ``model`` over every batch of ``loader`` and collect its outputs.

    Args:
        loader: Sized iterable of batches; each batch is handed unchanged to
            ``prediction_step``.
        model: Module to evaluate. It is moved to ``device`` (unless ``device``
            is None) and switched to eval mode.
        device: Target device, or None to leave the model where it is.
        amp: Whether the forward pass runs under ``autocast``.
        verbose: With ``logger="print"``, progress is printed every ``verbose``
            steps and on the last step. A falsy value prints the last step only.
        time_format: Format string handed to ``Timer``.
        logger: ``"print"`` for periodic prints, ``"tqdm"`` for a progress bar;
            any other value reports nothing.

    Returns:
        A ``torch.Tensor`` built from all per-sample outputs, in loader order.
    """
    if device is not None:
        model.to(device)

    model.eval()
    outputs = []
    timer = Timer(time_format)
    steps = len(loader)

    if logger == "tqdm":
        # tqdm is never imported at module level in this file, so this branch
        # used to fail with NameError. Import it lazily, only when requested
        # (tqdm is a hard requirement of transformers, so it is installed
        # wherever the rest of this file can run).
        from tqdm import tqdm

        loader = tqdm(iterable=loader, 
                      total=steps,
                      colour="#000",
                      bar_format="{l_bar} {bar} {n_fmt}/{total_fmt} - remain: {remaining}{postfix}")

        loader.set_description_str("[Prediction]")

    with torch.no_grad():
        for step, batch in enumerate(loader, 1):
            with autocast(enabled=amp):
                batch_outputs = prediction_step(batch=batch, model=model, device=device)

            batch_outputs = batch_outputs.to("cpu")
            if batch_outputs.dim() == 0:
                # A step that squeezes a single-sample batch yields a scalar,
                # which cannot be iterated by `extend`; restore the batch axis.
                batch_outputs = batch_outputs.unsqueeze(0)

            outputs.extend(batch_outputs.numpy())

            if logger == "print":
                # `verbose and ...` keeps verbose=0 from dividing by zero.
                if (verbose and step % verbose == 0) or step == steps:
                    _, remain = timer(step/steps)

                    print(f"[Prediction] "
                          f"{step}/{steps} - "
                          f"remain: {remain}")

    outputs = torch.tensor(outputs)
    return outputs

In [None]:
def prediction_step(batch, model, device="cpu"):
    """Compute sigmoid scores for one batch.

    Args:
        batch: ``(input_ids, attention_mask)`` pair of tensors.
        model: Callable invoked as ``model(input_ids, attention_mask)``.
        device: Device both tensors are moved to before the forward pass.

    Returns:
        The sigmoid of the model output with the trailing size-1 dimension
        removed. For a ``(batch, 1)`` model output this is always a 1-D tensor
        of length ``batch``.
    """
    input_ids, attention_mask = batch

    input_ids = input_ids.to(device).long()
    attention_mask = attention_mask.to(device).long()

    outputs = model(input_ids, attention_mask)

    # Squeeze only the last axis: a bare `.squeeze()` would also drop the batch
    # axis when the batch holds a single sample, returning a 0-d tensor that
    # the caller cannot iterate over.
    return outputs.sigmoid().squeeze(-1)

# Loading dataset

In [None]:
# Print the training CSV as a DataFrame (inspection only; the frame is not kept).
print(pd.read_csv(pathes.train))

In [None]:
# CPC code table; its columns are attached to every test row below.
cpc_codes = pd.read_csv(pathes.cpc_codes)

# The module-level DEBUG flag swaps the training CSV in for the test CSV.
if DEBUG:
    path = pathes.train
else:
    path = pathes.test

# Join on context == code. No `how` is given, so pandas' default join type
# applies. NOTE(review): with the default (inner) join, rows whose context has
# no match in `cpc_codes` are dropped — confirm every context is covered.
test = pd.read_csv(path).merge(cpc_codes, left_on="context", right_on="code")

# Ids are captured after the merge so they stay aligned with the predictions.
test_ids = test["id"].values

sample_submission = pd.read_csv(pathes.sample_submission)

if config.debug:
    display(test)

In [None]:
# Mapping applied to the `context` column to obtain a context description.
# NOTE(review): torch.load unpickles the file — only load trusted files.
cpc_texts = torch.load("../input/foldsdump/cpc_texts.pth")
test["context_text"] = test["context"].map(cpc_texts)

# Lower-cased "anchor[SEP]target[SEP]context_text" string for every row.
test["text"] = (
    test["anchor"] + "[SEP]" + test["target"] + "[SEP]" + test["context_text"]
).apply(str.lower)

In [None]:
# Bare expression: shows the prepared frame as the notebook cell's output.
test

# Tokenizer

In [None]:
# The tokenizer files live in the "tokenizer/" sub-folder of the model directory.
tokenizer_path = os.path.join(config.input_directory, "tokenizer/")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Dataset

In [None]:
class DynamicPadding:
    """Pad a list of tokenized samples per batch through ``tokenizer.pad``.

    The length handed to ``tokenizer.pad`` is the longest ``input_ids`` in the
    batch, capped at ``max_length`` when one is configured.
    """

    def __init__(self, tokenizer, max_length=None, padding=True, pad_to_multiple_of=None, return_tensors="pt"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_tensors = return_tensors

    def __call__(self, tokenized):
        """Return ``tokenizer.pad(...)`` for the given list of encoded samples."""
        longest = max(map(len, (sample["input_ids"] for sample in tokenized)))
        if self.max_length is not None:
            longest = min(longest, self.max_length)

        # NOTE(review): with padding=True the Hugging Face tokenizer pads to the
        # longest sequence and may not use `max_length` at all — confirm.
        return self.tokenizer.pad(
            encoded_inputs=tokenized,
            max_length=longest,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
    
    
    
class Collator:
    """Collate function that pads tokenized samples into batch tensors.

    Args:
        return_targets: When True every sample is a ``(tokenized, target)``
            pair and the targets are returned as a third tensor; when False
            every sample is just the tokenized item.
        **kwargs: Forwarded to ``DynamicPadding`` (tokenizer, max_length, ...).
    """

    def __init__(self, return_targets=True, **kwargs):
        self.dynamic_padding = DynamicPadding(**kwargs)
        self.return_targets = return_targets

    def __call__(self, batch):
        """Return ``(input_ids, attention_mask[, targets])`` for ``batch``."""
        all_tokenized, all_targets = [], []
        for sample in batch:
            if self.return_targets:
                tokenized, target = sample
                all_targets.append(target)
            else:
                tokenized = sample

            all_tokenized.append(tokenized)

        tokenized = self.dynamic_padding(all_tokenized)

        # The padder may already return tensors (return_tensors="pt" is its
        # default). `torch.tensor` on a tensor makes a needless copy and emits
        # a "copy construct" UserWarning; `as_tensor` reuses it and still
        # converts plain lists or arrays.
        input_ids = torch.as_tensor(tokenized.input_ids)
        attention_mask = torch.as_tensor(tokenized.attention_mask)

        if self.return_targets:
            all_targets = torch.tensor(all_targets)

            return input_ids, attention_mask, all_targets

        return input_ids, attention_mask

In [None]:
# Dead snippet: a bare string literal holding a copy of `Dataset.__init__`.
# Nothing in it is executed; the cell only evaluates the string.
"""
        def __init__(self, texts, pair_texts, tokenizer, contexts=None, sep=None, targets=None, max_length=128):
        self.texts = texts
        self.pair_texts = pair_texts
        self.contexts = contexts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sep = sep if sep is not None else self.tokenizer.sep_token"""

In [None]:
class Dataset:
    """Index-based dataset that tokenizes a (text, pair text) sample on access.

    Args:
        texts: Indexable collection of first-segment strings.
        pair_texts: Indexable collection of second-segment strings.
        tokenizer: Callable tokenizer; its ``sep_token`` is the default ``sep``.
        contexts: Optional strings appended to the first segment after ``sep``.
        sep: Separator placed between text and context.
        targets: Optional labels; when given, items are ``(tokenized, target)``.
        max_length: Stored on the instance only; it is not passed to the
            tokenizer call.
    """

    def __init__(self, texts, pair_texts, tokenizer, contexts=None, sep=None, targets=None, max_length=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.pair_texts = pair_texts
        self.contexts = contexts
        self.targets = targets
        self.max_length = max_length

        if sep is None:
            sep = self.tokenizer.sep_token
        self.sep = sep

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize sample ``index``; also return its target when targets exist."""
        first = self.texts[index].lower()
        second = self.pair_texts[index].lower()

        if self.contexts is not None:
            extra = self.contexts[index].lower()
            first = first + self.sep + extra

        # No max_length / padding here: padding is left to the collate step.
        options = dict(
            add_special_tokens=True,
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_offsets_mapping=False,
        )
        encoded = self.tokenizer(text=first, text_pair=second, **options)

        if self.targets is None:
            return encoded

        return encoded, self.targets[index]

In [None]:
# Dead snippet: an alternative `TestDataset` class kept inside a bare string
# literal. It is never defined or executed; the cell only evaluates the string.
"""class TestDataset(Dataset):
    def __init__(self, df, tokenizer, max_input_length):
        self.text = df['text'].values.astype(str)
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        inputs = self.text[item]
        
        inputs = self.tokenizer(inputs,
                    max_length=self.max_input_length,
                    padding='max_length',
                    truncation=True )
        return torch.as_tensor(inputs['input_ids'], dtype=torch.long),\
               torch.as_tensor(inputs['token_type_ids'], dtype=torch.long),\
               torch.as_tensor(inputs['attention_mask'], dtype=torch.long)"""

In [None]:
# Bare expression: shows the frame again as the notebook cell's output.
test

In [None]:
# Inference has no labels, so the collator returns only
# (input_ids, attention_mask).
collator = Collator(return_targets=False, tokenizer=tokenizer, max_length=config.max_length)

# First segment: the pre-built, lower-cased `text` column with `title`
# appended after the separator; second segment: `target`.
# NOTE(review): `title` is presumably a column brought in by the merge with
# the CPC code table — confirm.
test_dataset = Dataset(texts=test["text"].values, 
                       pair_texts=test["target"].values,
                       contexts=test["title"].values,
                       max_length=config.max_length,
                       sep=tokenizer.sep_token,
                       tokenizer=tokenizer)
    
# Batches are twice `config.batch_size`. shuffle=False and drop_last=False
# keep all rows in order, so predictions stay aligned with `test_ids`.
test_loader = DataLoader(dataset=test_dataset, 
                         batch_size=config.batch_size*2, 
                         num_workers=config.num_workers,
                         pin_memory=config.pin_memory,
                         collate_fn=collator,
                         shuffle=False, 
                         drop_last=False)

print(f"Test Samples: {len(test_dataset)}")

# Model

In [None]:
class Model(nn.Module):
    """Transformer backbone with a single-output linear head on the first token.

    Args:
        model_path: Directory or name used to load the config and pretrained
            weights when ``config_path`` is None.
        config_path: Optional path to a saved config. When given, the backbone
            is built from that config only (``AutoModel.from_config``), with
            no weights loaded here.
        config_updates: Optional mapping applied on top of the loaded config.
        reinitialization_layers: Number of final encoder layers whose weights
            are re-initialized (0 disables this).
    """

    def __init__(self, model_path="../input/deberta-large-anchor-change", config_path=None, config_updates=None, reinitialization_layers=0):
        super().__init__()
        config_source = model_path if config_path is None else config_path
        self.config = AutoConfig.from_pretrained(config_source)

        self.config.output_hidden_states = True
        # `None` replaces the former mutable `{}` default argument.
        self.config.update({} if config_updates is None else config_updates)

        if config_path is None:
            self.model = AutoModel.from_pretrained(model_path, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        # NOTE(review): assumes the backbone exposes `encoder.layer` — confirm
        # for every architecture this class is used with.
        self.reinit_layers(n=reinitialization_layers, layers=self.model.encoder.layer, std=self.config.initializer_range)

        self.head = nn.Linear(in_features=self.config.hidden_size, out_features=1)
        self.init_weights(self.head, std=self.config.initializer_range)

    def reinit_layers(self, layers, n=0, std=0.02):
        """Re-initialize every sub-module of the last ``n`` entries of ``layers``."""
        if n > 0:
            for layer in layers[-n:]:
                for module in layer.modules():
                    self.init_weights(module, std=std)

            print(f"Reinitializated last {n} layers.")

    def init_weights(self, module, std=0.02):
        """Reset ``module`` in place if it is a Linear, Embedding or LayerNorm.

        Linear and Embedding weights are drawn from N(0, std); Linear biases
        and the Embedding padding row are zeroed; LayerNorm is reset to weight
        1 and bias 0. Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask=None):
        """Return the head output computed from the first token's last hidden state."""
        transformer_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        features = transformer_outputs.hidden_states[-1]
        # Keep only position 0 of every sequence as the pooled representation.
        features = features[:, 0, :]
        outputs = self.head(features)
        return outputs

# Inference

In [None]:
# Predict the test set once per fold checkpoint and stack the results.
# Despite its name, `oof_predictions` collects one array of *test* predictions
# per fold; they all come from `test_loader`.
oof_predictions = []
for fold in range(1, config.folds+1):
    print(f"Fold [{fold}/{config.folds}]")
    
    # Per-fold layout: <input_directory>/fold_<k>/{model_config.json, model.pth, checkpoints/checkpoint.pth}
    fold_directory = os.path.join(config.input_directory, f"fold_{fold}/")
    model_config_path = os.path.join(fold_directory, "model_config.json")
    # `model_path` is computed but not used in this loop.
    model_path = os.path.join(fold_directory, "model.pth")
    checkpoints_directory = os.path.join(fold_directory, "checkpoints/")
    checkpoint_path = os.path.join(checkpoints_directory, "checkpoint.pth")
    
    # Passing `config_path` builds the backbone from the saved config only.
    model = Model(config_path=model_config_path)
    
    # NOTE(review): `load_checkpoint` comes from torch_components; presumably
    # it loads the saved weights into `model` in place — confirm.
    fold_checkpoint = load_checkpoint(path=checkpoint_path, 
                                      model=model, 
                                      strict=True, 
                                      ignore_warnings=True)
    
    
    print(f"Loaded checkpoint from '{checkpoint_path}'.")
    
    fold_predictions = prediction_loop(loader=test_loader, 
                                       model=model, 
                                       amp=config.amp, 
                                       device=config.device)
    
    oof_predictions.append(fold_predictions.numpy())
    
    # Drop this fold's objects before the next model is built.
    del model, fold_checkpoint, fold_predictions
    torch.cuda.empty_cache()
    gc.collect()
    
    print(end="\n"*3)
    
# Stack the per-fold arrays; the first axis indexes folds.
oof_predictions = np.array(oof_predictions)

In [None]:
# Ensemble: average the stacked predictions over the fold axis (axis 0).
test_predictions = np.mean(oof_predictions, axis=0)

In [None]:
# Write "submission.csv" from the post-merge ids and the fold-averaged scores.
create_submission(ids=test_ids, predictions=test_predictions, path="submission.csv")