## Start

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon May 16 16:48:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os

os.chdir('/content/drive/My Drive/uspppm/notebook')
path = os.getcwd()
print(path)

/content/drive/My Drive/uspppm/notebook


In [5]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install bitsandbytes-cuda112==0.26.0

Collecting transformers
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 14.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 88.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 76.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0

In [6]:
import pandas as pd
import numpy as np
import os

import shutil
import gc
gc.enable()

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


#from tqdm import tqdm
# tqdm doesn't work well in colab.
# This is the solution:
# https://stackoverflow.com/questions/41707229/tqdm-printing-to-newline
import tqdm.notebook as tq
#for i in tq.tqdm(...):


import string

from sklearn import model_selection
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup




# Don't Show Warning Messages
import warnings
warnings.filterwarnings('ignore')

os.environ["WANDB_DISABLED"] = "true"


print(torch.__version__)
#print(torchvision.__version__)

1.11.0+cu113


In [7]:
# Set the seed values

import random

seed = 1024

random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [8]:
os.listdir('../input/')

['deberta-v2-xlarge',
 'us-patent-phrase-to-phrase-matching',
 'deberta-v3-large',
 'allenai-longformer-large-4096',
 'cpc-codes']

In [9]:
base_path = '../input/us-patent-phrase-to-phrase-matching/'


## Config

In [10]:
# The model is stored in a Kaggle dataset.
# The internet connection in this notebook is off.
#MODEL_PATH = '../input/deberta-v3-large/'
#MODEL_PATH = '../input/deberta-v2-xlarge/'
#MODEL_PATH = 'funnel-transformer/xlarge'

# Set the max token length.
# Determine this by looking at max token lengths 
# in the train set. Process is shown below.
#MAX_LEN = 64
MAX_LEN = 512

NUM_EPOCHS = 5

NUM_FOLDS = 5

# Specify which folds should be used in training.
# This is helpful when you have to train the folds in 
# separate notebooks.
START_FOLD = 0
STOP_FOLD = 5 # this number is not included

NUM_CLASSES = 5 # [0, 1, 2, 3, 4]
NUM_CORES = os.cpu_count()

# When training with multiple GPUs, if the number
# of workers (CPU cores) is set too high that can slow down training.
# Not applicable on Kaggle because there's only one GPU.
if torch.cuda.device_count() > 1:
    NUM_CORES = 4

NUM_CORES

8

## Check the device

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

if torch.cuda.is_available():
    print('Num GPUs:', torch.cuda.device_count())
    print('GPU Type:', torch.cuda.get_device_name(0))

cuda:0
Num GPUs: 1
GPU Type: Tesla V100-SXM2-16GB


## Load the data

In [12]:
# Test data

#path = base_path + 'test.csv'
#path = base_path + 'train.csv'
path = base_path + 'aug_data.csv'

df_test = pd.read_csv(path)

print(df_test.shape)

df_test.head()

(1216611, 4)


Unnamed: 0,id,anchor,target,context
0,,abatement,adhering mount,A47
1,,abatement,adhesive,A47
2,,abatement,adhesive attachment,A47
3,,abatement,adhesive bases,A47
4,,abatement,adhesive forces,A47


## Add the context meanings

Here we will add the context meanings to the train and test data. We will create a new column call 'title'.

In [13]:
# Ref: https://en.wikipedia.org/wiki/Cooperative_Patent_Classification

# The letters (keys) in this dictionary are the
# first letters of the context. Refer to the column called 'context'.

context_mapping_dict = {
                        "A": "Human Necessities",
                        "B": "Operations and Transport",
                        "C": "Chemistry and Metallurgy",
                        "D": "Textiles",
                        "E": "Fixed Constructions",
                        "F": "Mechanical Engineering",
                        "G": "Physics",
                        "H": "Electricity",
                        "Y": "Emerging Cross-Sectional Technologies",
                        }

In [14]:
def map_context(x):
    
    # get the first letter
    letter = x[0]
    
    # extract the meaning from the dictionary
    meaning = context_mapping_dict[letter]
    
    return meaning

In [15]:
# Test data.
# Create a new column.
df_test['title'] = df_test['context'].apply(map_context)

df_test.head()

Unnamed: 0,id,anchor,target,context,title
0,,abatement,adhering mount,A47,Human Necessities
1,,abatement,adhesive,A47,Human Necessities
2,,abatement,adhesive attachment,A47,Human Necessities
3,,abatement,adhesive bases,A47,Human Necessities
4,,abatement,adhesive forces,A47,Human Necessities


## Create the label column

In [16]:
def create_label(x):
    
    if x == 0:
        return 0

    if x == 0.25:
        return 1
    
    if x == 0.5:
        return 2

    if x == 0.75:
        return 3

    if x == 1.0:
        return 4

# Note: This column must be called 'labels'. The Hugging Face trainer
# automatically detects the column that contains the training labels.
#df_data['labels'] = df_data['score'].apply(create_label)

# Create a dummy label column so that the dataloader works on the test set.
df_test['labels'] = 0

print(df_test.shape)

df_test.head()

(1216611, 6)


Unnamed: 0,id,anchor,target,context,title,labels
0,,abatement,adhering mount,A47,Human Necessities,0
1,,abatement,adhesive,A47,Human Necessities,0
2,,abatement,adhesive attachment,A47,Human Necessities,0
3,,abatement,adhesive bases,A47,Human Necessities,0
4,,abatement,adhesive forces,A47,Human Necessities,0


## Combine the anchor and target

In [17]:
df_test['combined_sentence'] = df_test['anchor'] + ' vs ' + df_test['target']

df_test.head()

Unnamed: 0,id,anchor,target,context,title,labels,combined_sentence
0,,abatement,adhering mount,A47,Human Necessities,0,abatement vs adhering mount
1,,abatement,adhesive,A47,Human Necessities,0,abatement vs adhesive
2,,abatement,adhesive attachment,A47,Human Necessities,0,abatement vs adhesive attachment
3,,abatement,adhesive bases,A47,Human Necessities,0,abatement vs adhesive bases
4,,abatement,adhesive forces,A47,Human Necessities,0,abatement vs adhesive forces


## Set up the tokenize function and the metric function

In [18]:
def tokenize_data_fn(hf_dataset):
    
    """
    This function will tokenize all text in a specified column.
    We use it in the same way that we use 'apply' in Pandas.
    
    """
    
    tokenized_examples = tokenizer(
                            hf_dataset['combined_sentence'], # sentence1
                            hf_dataset['title'], # sentence2 - context
                            truncation="only_second", # only truncate sentence2
                            max_length=MAX_LEN,
                            padding="max_length",
                            )
    
    return tokenized_examples




def compute_metrics(eval_pred):
    
    # Declare as global so we can calculate the cv score for all folds and 
    # then print it when training is complete.
    global corr
    
    """    
    This function is used to calculate the metric during training.
    We will save the best model based on this metric.
    
    """
    
    from scipy.stats import pearsonr
    
    score_list = []
    
    logits, labels = eval_pred
    
    # logits shape: (num_rows, num_cols)
    # labels shape: (num_rows,)
    
    # take the argmax
    preds = np.argmax(logits, axis=1)
    
    # Calculate the correlation.
    # preds and labels should have the same length.
    # corr is a scalar.
    corr, _ = pearsonr(preds, labels)
    
    print(f'Pearson: {corr}')
    
    return {
            'pearson': corr
            }
    


## Inference

In [19]:
# Create a list of fold model paths

model_0 = 'v3_clf_model_2'

#MODEL_LIST = [model_0, model_1, model_2, model_3, model_4]
MODEL_LIST = [model_0]

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from datasets import Dataset

# Make predictions using all fold models

raw_predictions_list = []

# Make a prediction using each fold model
for i, model_path in enumerate(MODEL_LIST):

    training_args = TrainingArguments(
        #do_train=True,
        output_dir=f"/tmp/uspppm",
        #evaluation_strategy="epoch",
        #save_strategy="epoch",
        #learning_rate=CFG.learning_rate,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        #num_train_epochs=CFG.epochs,
        #weight_decay=CFG.weight_decay,
        #metric_for_best_model="pearson",
        #load_best_model_at_end=True,
        fp16=True,
        #gradient_accumulation_steps=4,
        #gradient_checkpointing=True,
        #seed = 542,
    )


    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                              num_labels=NUM_CLASSES)

    trainer = Trainer(
            model,
            #training_args,
            tokenizer=tokenizer,
        )
    
    # Create the test dataset
    test_dataset = Dataset.from_pandas(df_test)

    test_features = test_dataset.map(
                    tokenize_data_fn,
                    batched=True,
                    remove_columns=test_dataset.column_names
                    )

    # Make a prediction for one model
    raw_predictions = trainer.predict(test_features)
    #test_loader = DataLoader(test_features, batch_size=64, shuffle=False)
    #raw_predictions = trainer.prediction_loop(test_loader, description="prediction")

    # Save the predictions from each fold in a list
    raw_predictions_list.append(raw_predictions)


print(len(raw_predictions_list))

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  0%|          | 0/1217 [00:00<?, ?ba/s]

***** Running Prediction *****
  Num examples = 1216611
  Batch size = 8


1


In [21]:
# Average the predictions for all folds

for i, raw_preds in enumerate(raw_predictions_list):
    
    np_preds_logits = raw_preds.predictions

    if i == 0:     
        fin_logits = np_preds_logits
        
    else:
        fin_logits = fin_logits + np_preds_logits

        
# Average the predictions
avg_logits = fin_logits/len(MODEL_LIST)

avg_logits.shape

(1216611, 5)

In [22]:
avg_logits

array([[ 1.1547874 ,  4.902403  , -0.60784787, -2.6181736 , -4.0003633 ],
       [ 4.6857014 ,  2.5310535 , -2.0127296 , -3.1182218 , -3.6591306 ],
       [ 1.0790341 ,  4.710485  , -0.3024546 , -2.5169566 , -3.9676554 ],
       ...,
       [-0.13448134,  6.168163  , -1.6778233 , -2.0416749 , -3.4075418 ],
       [ 1.0256528 ,  5.7779117 , -2.3130076 , -2.4107504 , -3.3051505 ],
       [-0.65476626,  6.142751  , -0.7500027 , -2.0846798 , -3.7337112 ]],
      dtype=float32)

In [23]:
avg_logits.min(axis=1)

array([-4.0003633, -3.6591306, -3.9676554, ..., -3.4075418, -3.3051505,
       -3.7337112], dtype=float32)

In [24]:
avg_logits[ np.arange(avg_logits.shape[0]) , np.argsort(avg_logits, axis=1)[:,-2] ]

array([ 1.1547874 ,  2.5310535 ,  1.0790341 , ..., -0.13448134,
        1.0256528 , -0.65476626], dtype=float32)

In [25]:
logits_diff = np.amax(avg_logits,axis=1) - avg_logits[ np.arange(avg_logits.shape[0]) , np.argsort(avg_logits, axis=1)[:,-2] ]

In [26]:
np.argmax(avg_logits, axis=1)

array([1, 0, 1, ..., 1, 1, 1])

In [27]:
# Take the argmax

preds = np.argmax(avg_logits, axis=1)

preds.shape

(1216611,)

In [28]:
# Add the preds to df_test

df_test['preds'] = preds

In [29]:
# Change the preds to the corresponding float values

def change_preds(x):
    
    if x == 0:
        return 0

    if x == 1:
        return 0.25
    
    if x == 2:
        return 0.5

    if x == 3:
        return 0.75

    if x == 4:
        return 1.0
    
df_test['modified_preds'] = df_test['preds'].apply(change_preds)

# filter out the columns we don't need
cols = ['id', 'modified_preds']
df = df_test[cols]

print(df_test.shape)

df_test.head()

(1216611, 9)


Unnamed: 0,id,anchor,target,context,title,labels,combined_sentence,preds,modified_preds
0,,abatement,adhering mount,A47,Human Necessities,0,abatement vs adhering mount,1,0.25
1,,abatement,adhesive,A47,Human Necessities,0,abatement vs adhesive,0,0.0
2,,abatement,adhesive attachment,A47,Human Necessities,0,abatement vs adhesive attachment,1,0.25
3,,abatement,adhesive bases,A47,Human Necessities,0,abatement vs adhesive bases,1,0.25
4,,abatement,adhesive forces,A47,Human Necessities,0,abatement vs adhesive forces,1,0.25


In [30]:
df_test2 = pd.concat([df_test,pd.DataFrame(np.amax(avg_logits,axis=1),columns=["logits_max"]), pd.DataFrame(logits_diff,columns=["logits_diff"])], axis=1)
df_test2.head()

Unnamed: 0,id,anchor,target,context,title,labels,combined_sentence,preds,modified_preds,logits_max,logits_diff
0,,abatement,adhering mount,A47,Human Necessities,0,abatement vs adhering mount,1,0.25,4.902403,3.747615
1,,abatement,adhesive,A47,Human Necessities,0,abatement vs adhesive,0,0.0,4.685701,2.154648
2,,abatement,adhesive attachment,A47,Human Necessities,0,abatement vs adhesive attachment,1,0.25,4.710485,3.631451
3,,abatement,adhesive bases,A47,Human Necessities,0,abatement vs adhesive bases,1,0.25,4.234077,1.499687
4,,abatement,adhesive forces,A47,Human Necessities,0,abatement vs adhesive forces,1,0.25,4.304574,1.240276


In [31]:
df_test2.to_csv("train_with_PL.csv", index=None)