# Introduction
**Info ℹ️: I refactored everything and included some new learnings. My submitted version for the competition was version 14, if you are interested. I used a different approach there, and this version is a recap of everything.**

Hi visitor,
this is my first NLP project and my first competition on Kaggle. I am familiar with the theoretical basics of NLP but have never done a project on this topic, especially not with pretrained models. So this is it. 

In this project I first tried two approaches to using pre-trained models: one where I load the pre-trained embeddings (GloVe) manually into the embedding layer and use that layer as part of my model, and another based on the Hugging Face 🤗 framework, where I use the from_pretrained() function, which loads the whole model (with all layers). Both can be found in the notebook version that I submitted to the competition, which is version 14.
The current version/approach of the notebook is just the Huggingface🤗 edition because this makes all of this more readable and easier to understand 😁

HINT - After the Competition:
For a better learning process I recapped my work and compared it with the work of other competition contributors. One main notebook here was Jeremy Howard's "Iterate like a grandmaster!", as well as the notebook of Mohamad Merchant, who also wrote a blog article about "Semantic Similarity with BERT" on Keras, which covers the use of NLP models in Keras. The notebook that inspired me a lot can be found here: https://www.kaggle.com/code/mohamadmerchant/us-phrase-matching-tf-keras-train-tpu. 
I used a lot from both approaches in this notebook during the recap phase. Once again: if you want to see my initial approach, where I got around 70% accuracy, you should take a look at version 14. This was the version that I submitted to the competition. All work after this version is part of the recap phase and therefore full of inspiring code parts from other contributors.

I therefore ask you to bear with me! 🤗

# Imports and Datasets

In [35]:
import sys
assert sys.version_info >= (3,5)
import os
import pathlib

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from functools import partial
import seaborn as sns
from datasets import Dataset
from datasets import DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import nltk
from string import punctuation
from collections import Counter

from scipy.spatial.distance import cosine

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import layers
from keras.layers import Embedding, LSTM, Dense, Dropout, CuDNNLSTM, Bidirectional
from keras.layers.merge import concatenate
from transformers import TrainingArguments
from transformers import BertTokenizer, TFDebertaModel
from transformers import RobertaTokenizer, TFRobertaModel, TFRobertaForSequenceClassification
from transformers import TFAutoModel

#import mlflow
#from mlflow import log_metric, log_param, log_artifacts
#import mlflow.tensorflow
#from mlflow import pyfunc

# Compare the major version numerically: a plain string comparison is
# lexicographic and would reject e.g. "10.0" because "1" sorts before "2".
assert int(tf.__version__.split(".")[0]) >= 2

print(f"Tensorflow Version: {tf.__version__}")
print(f"Keras Version: {keras.__version__}")

# Tell the user how to enable an accelerator when TensorFlow sees no GPU.
if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")
else:
    print('---Tensorflow is running with GPU Power now---')
    # Session configured with log_device_placement=True; kept in `sess`.
    sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
    


# Seed TensorFlow and NumPy with the same value so runs are repeatable.
random_state = 42
tf.random.set_seed(random_state)
np.random.seed(random_state)

# Non-empty (truthy) when KAGGLE_KERNEL_RUN_TYPE is set in the environment;
# the branches below use it to pick the Kaggle directory layout.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

MAIN_PATH = os.getcwd()

# change your local path here
if iskaggle:
    DATA_PATH = os.path.join(MAIN_PATH, '../input')
    PHRASES_PATH = os.path.join(DATA_PATH, 'us-patent-phrase-to-phrase-matching')
else:
    DATA_PATH = os.path.join(MAIN_PATH, 'data')
    # Pass every component separately so os.path.join inserts the platform's
    # separator; a hard-coded backslash only resolves on Windows.
    PHRASES_PATH = os.path.join(DATA_PATH, 'input', 'us-patent-phrase-to-phrase-matching')


# List every file below the competition folder as a quick sanity check.
for dirname, _, filenames in os.walk(PHRASES_PATH):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

# Get the Data

In [36]:
# Data path and file
# File names loaded from the competition folder (PHRASES_PATH).
CSV_FILE_TRAIN='train.csv'
CSV_FILE_TEST='test.csv'
CSV_FILE_COMF='sample_submission.csv'
# CPC title lookup table: file name and its folder below DATA_PATH.
CSV_FILE_CPC='titles.csv'
CPC_PATH='cpc-codes'
# Model folder names; ROBERTA_PATH is joined with DATA_PATH when running on Kaggle.
# NOTE(review): DEBERTA_PATH is not referenced in the code visible here.
DEBERTA_PATH='huggingface-deberta-variants'
ROBERTA_PATH='roberta-base'

def load_csv_data(path, csv_file):
    """Read the CSV file ``csv_file`` located in directory ``path``.

    Returns the parsed content as a pandas DataFrame.
    """
    return pd.read_csv(os.path.join(path, csv_file))

def load_csv_data_manuel(path, csv_file):
    """Read a file line by line without any CSV parsing.

    Args:
        path: directory that contains the file.
        csv_file: file name inside ``path``.

    Returns:
        List of the raw lines (line endings included), as produced by
        ``readlines()``.
    """
    csv_path = os.path.join(path, csv_file)
    # The context manager closes the handle even if reading raises.
    with open(csv_path, 'r') as handle:
        return handle.readlines()
    

# The three competition tables share one folder, so load them in one sweep.
train, test, competition_file = (
    load_csv_data(PHRASES_PATH, file_name)
    for file_name in (CSV_FILE_TRAIN, CSV_FILE_TEST, CSV_FILE_COMF)
)
# The CPC title table lives in its own folder below DATA_PATH.
cpc_code = load_csv_data(os.path.join(DATA_PATH, CPC_PATH), CSV_FILE_CPC)


# Report the row count of every loaded table.
print(f'Length of loaded trainset: {len(train)}')
print(f'Length of loaded testset: {len(test)}')
print(f'Length of loaded competition file: {len(competition_file)}')
print(f'Length of loaded cpc_codeset: {len(cpc_code)}')

In [37]:
# Attach the CPC columns to both frames by matching `context` against the CPC `code`.
train, test = [
    frame.join(cpc_code.set_index('code'), on='context')
    for frame in (train, test)
]

## Loading Model Files

In [38]:
# On Kaggle the model files come from a local data source folder;
# otherwise the plain model name 'roberta-base' is used.
ROBERTA_BASE = os.path.join(DATA_PATH, ROBERTA_PATH) if iskaggle else 'roberta-base'

# Data Understanding

## Given Attributes
- id - a unique identifier for a pair of phrases
- anchor - the first phrase
- target - the second phrase
- context - the CPC classification (version 2021.05), which indicates the subject within which the similarity is to be scored
- score - the similarity. This is sourced from a combination of one or more manual expert ratings.


## Score
The scores are in the 0-1 range with increments of 0.25 with the following meanings:

- 1.0 - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
- 0.75 - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
- 0.5 - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
- 0.25 - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
- 0.0 - Unrelated.

In [39]:
# Frequency of each distinct anchor phrase; dropna=False also counts missing values.
train['anchor'].value_counts(dropna=False)

The anchor value has 733 different values. Let's look at the target value.

In [40]:
# Frequency of each distinct target phrase; dropna=False also counts missing values.
train['target'].value_counts(dropna=False)

The target looks a little bit different. Here we have 29,340 different values.

In [41]:
# Number of rows per score value; dropna=False also counts missing values.
train['score'].value_counts(dropna=False)

In [42]:
# Same counts as a bar chart, sorted by score value so the bars run in ascending score order.
train['score'].value_counts(dropna=False).sort_index().plot.bar()

In [43]:
# Per-column counts for every (anchor, context) combination.
train.groupby(['anchor', 'context']).count()

### Configuration

In [44]:
class Config():
    """Central place for hyper-parameters and paths used across the notebook.

    Defaults are class attributes. Keyword arguments passed to the
    constructor override them on the created instance only.
    """
    learning_rate = 1e-5    # base learning rate for the optimizer and lr_scheduler
    num_epochs = 10         # maximum number of epochs per fold
    batch_size = 32         # batch size of the tf.data pipelines
    decay = 0.01            # decay factor; not used by the active learning-rate formula
    max_line_length = 190   # token length every sample is padded/truncated to
    num_folds = 5           # number of cross-validation folds

    base_model = ROBERTA_BASE   # model name or path handed to from_pretrained()

    root_logdir_tb = "../../tensorboard-logs"   # tensorboard logdir

    def __init__(self, **kwargs):
        """Override default settings.

        Args:
            **kwargs: setting name -> new value.

        Raises:
            KeyError: if a keyword does not name an existing public setting.
        """
        for k, v in kwargs.items():
            # The defaults live on the class, so the instance __dict__ is
            # still empty here; checking membership in it rejected every
            # override. Look the name up on the class instead.
            if k.startswith('_') or not hasattr(type(self), k):
                raise KeyError(k)
            setattr(self, k, v)



config = Config()

# Data Preparation

#### Loading Model

In [45]:
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

In [46]:
# Load the tokenizer that belongs to the configured base model.
tokenizer = AutoTokenizer.from_pretrained(config.base_model)

In [47]:
# Load the base model with a 5-label sequence-classification head.
# NOTE(review): model_pretrained is not referenced again in the visible code
# (build_model loads its own TFAutoModel) — confirm this load is still needed.
model_pretrained = TFAutoModelForSequenceClassification.from_pretrained(config.base_model, trainable=True, return_dict=True, num_labels=5, output_hidden_states=True)

In [48]:
#tokenizer.add_special_tokens({'additional_special_tokens': context_list})

#### Building the Input Value for the Model - The Text Corpus

Separating the loaded CPC titles. They are concatenated by ";".  

In [49]:
# Separating the cpc titles: every ';' delimiter is replaced by a blank.
# The test titles get the same cleaning as the train titles so the text fed
# to the model at prediction time has the same form as during training.
train['title'] = train.title.apply(lambda text: ' '.join(text.split(';')))
test['title'] = test.title.apply(lambda text: ' '.join(text.split(';')))

#### Special Tokens

In [50]:
# Separator token of the loaded tokenizer; it joins the text parts of each sample below.
sep_token = tokenizer.sep_token
print(f'Seperater Token: {sep_token}')

In [51]:
# Show every special token the tokenizer knows.
tokenizer.all_special_tokens

Defining the context as a special token for the tokenizer

In [52]:
# Wrap each CPC context code in square brackets, e.g. "[<code>]".
train['context_token'] = '[' + train['context'] + ']'
test['context_token'] = '[' + test['context'] + ']'
# Distinct bracketed codes found in the training data.
# NOTE(review): context_list is only consumed by the commented-out add_special_tokens call.
context_list = list(train['context_token'].unique())

In [53]:
# Build three text variants per sample, each part joined by the separator token:
#   corpus                 anchor + target
#   corpus_w_context       bracketed context code + anchor + target
#   corpus_w_full_context  bracketed context code + anchor + target + CPC title
train['corpus'] = train['anchor'] + sep_token + train['target']
train['corpus_w_context'] = train['context_token'] + sep_token + train['corpus']
train['corpus_w_full_context'] = train['context_token'] + sep_token + train['corpus'] + sep_token + train['title']

test['corpus'] = test['anchor'] + sep_token + test['target']
test['corpus_w_context'] = test['context_token'] + sep_token + test['corpus']
# Use the test rows' own context token; taking it from `train` paired each
# test sample with the context of an unrelated training row.
test['corpus_w_full_context'] = test['context_token'] + sep_token + test['corpus'] + sep_token + test['title']

#### Train / Test / Val Data


In [54]:
# Distinct anchor phrases of the training data.
anchors = train.anchor.unique()

In [55]:
# Report how many distinct anchors there are.
print(f"Amount of diferent anchor values: {len(anchors)}")

In [56]:
# Re-seed right before shuffling so the anchor order is the same on every run of this cell.
np.random.seed(random_state)
# Shuffles `anchors` in place.
np.random.shuffle(anchors)

In [57]:
# Peek at the first five anchors after shuffling.
anchors[:5]

This anchor set will serve as the basis for slicing off the validation set.

In [58]:
# Share of the distinct anchors reserved for validation.
val_proportion = 0.25
val_size = int(len(anchors)* val_proportion)
# The first val_size anchors of the shuffled array become the validation anchors.
val_anchors = anchors[:val_size]

Splitting the data (or rather the overall index) into train and validation indexes using the validation anchors.

In [59]:
# Boolean mask: True for every training row whose anchor is a validation anchor.
is_validation = np.isin(train.anchor, val_anchors)
# Positional row numbers 0 .. len(train)-1.
idxs = np.arange(len(train))

In [60]:
# Split the positional row numbers with the mask; all rows of one anchor land in the same part.
# NOTE(review): train_folds creates its own StratifiedKFold splits and does not read these indexes.
val_indexes = idxs[is_validation]
train_indexes = idxs[~is_validation]
len(val_indexes), len(train_indexes)

#### Distribution of "Score" Values in Train / Val Set

In [61]:
# Mean score of the rows in the training part.
train.iloc[train_indexes].score.mean()

In [62]:
# Mean score of the rows in the validation part, for comparison with the training part.
train.iloc[val_indexes].score.mean()

### Encoding

#### Tokenizer Function

In [63]:
def tokenize_fkt(text, tokenizer):
    """Encode a batch of texts into fixed-length model inputs.

    Every text is padded or truncated to ``config.max_line_length`` tokens
    (``config`` is the module-level settings object). The tokenizer is told
    not to add special tokens itself.

    Args:
        text: the batch of texts handed to ``tokenizer.batch_encode_plus``.
        tokenizer: tokenizer object providing ``batch_encode_plus``.

    Returns:
        dict with the keys "input_ids", "attention_masks" and
        "token_type_ids", each an int32 NumPy array.
    """
    # The former extra `tokenizer(text)` call only filled an unused local and
    # tokenized the whole input a second time, so it was dropped.
    encoded_text = tokenizer.batch_encode_plus(
        text,
        add_special_tokens=False,
        max_length=config.max_line_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf"
        )

    input_ids = np.array(encoded_text["input_ids"], dtype="int32")
    attention_masks = np.array(encoded_text["attention_mask"], dtype="int32")
    token_type_ids = np.array(encoded_text["token_type_ids"], dtype="int32")

    # Note the plural key "attention_masks": it differs from the tokenizer's
    # "attention_mask" and equals the input name used in build_model.
    return {
        "input_ids": input_ids,
        "attention_masks": attention_masks,
        "token_type_ids": token_type_ids
    }

## Model Build

In [64]:
def build_model(config):
    """Build and compile the similarity model around the pretrained encoder.

    The encoder's last hidden state is average-pooled over the sequence axis,
    passed through dropout and mapped to one sigmoid unit.

    Args:
        config: settings object; ``max_line_length``, ``base_model`` and
            ``learning_rate`` are read.

    Returns:
        A compiled ``tf.keras`` model (Nadam optimizer, binary cross-entropy
        loss) taking the inputs "input_ids", "attention_masks" and
        "token_type_ids".
    """
    # `shape` has to be a tuple: "(n)" without a trailing comma is just the int n.
    seq_shape = (config.max_line_length,)

    # Input names equal the keys of the dict returned by tokenize_fkt.
    input_ids = tf.keras.Input(shape=seq_shape, dtype=tf.int32, name="input_ids")
    attention_masks = tf.keras.Input(shape=seq_shape, dtype=tf.int32, name="attention_masks")
    token_type_ids = tf.keras.Input(shape=seq_shape, dtype=tf.int32, name="token_type_ids")

    base_model = TFAutoModel.from_pretrained(
                                    config.base_model,
                                    trainable=True,
                                    return_dict=True,
                                    num_labels=1,
                                    output_hidden_states=True,
                                    from_pt=True
                                )

    base_model_out = base_model(
                            input_ids = input_ids,
                            attention_mask = attention_masks,
                            token_type_ids = token_type_ids,
                            output_hidden_states=True
                            )

    last_hidden_state = base_model_out.last_hidden_state

    # NOTE(review): no mask is passed to the pooling layer, so padded
    # positions appear to be part of the average — confirm this is intended.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
    dropout = tf.keras.layers.Dropout(0.3)(avg_pool)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout)

    model = tf.keras.models.Model(
        inputs = [input_ids, attention_masks, token_type_ids],
        outputs = output
    )

    model.compile(
        optimizer = tf.keras.optimizers.Nadam(learning_rate=config.learning_rate),
        loss = tf.keras.losses.BinaryCrossentropy()
    )

    return model
    

### Helpers for Keras Training

#### Pearson

In [65]:
class Pearsonr(tf.keras.callbacks.Callback):
    """Keras callback that reports the Pearson correlation on validation data.

    After every epoch the model predicts ``val_data``; the correlation between
    the flattened predictions and ``y_val`` is printed and stored in the epoch
    logs under "val_pearsonr".
    """

    def __init__(self, val_data, y_val):
        """
        Args:
            val_data: validation input handed unchanged to ``model.predict``.
            y_val: true scores, in the same order as the predictions.
        """
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.val_data = val_data
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        """Compute, print and log the validation Pearson correlation."""
        val_preds = self.model.predict(self.val_data, verbose = 0)

        val_pearsonr = stats.pearsonr(self.y_val, val_preds.ravel())[0]

        print(f"val_pearsonr: {val_pearsonr:.4f}\n")
        # `logs` defaults to None in the base-class signature; only write the
        # metric when a dict was actually handed in.
        if logs is not None:
            logs["val_pearsonr"] = val_pearsonr

#### Learning Rate Scheduler

In [74]:
def lr_scheduler(epoch):
    """
    Returns a custom learning rate that decreases as epochs progress.

    Epoch 0 uses 5% of ``config.learning_rate``; every later epoch uses
    ``config.learning_rate * 0.8 ** epoch``. ``config`` is the module-level
    settings object.

    The signature stays a single positional argument on purpose:
    LearningRateScheduler first tries to call the schedule with
    ``(epoch, lr)``, so a second parameter would change what is passed in.
    """
    init_lr = config.learning_rate

    if epoch == 0:
        return init_lr * 0.05
    # The statements that used to follow the if/else could never run (both
    # branches return) and referenced an undefined name, so they were removed.
    return init_lr * (0.8**epoch)


# Keras callback that applies lr_scheduler during training.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)


In [75]:
# Plot the scheduled learning rate for epochs 0-9.
plt.plot([lr_scheduler(e) for e in range(10)])

#### Tensorboard Logging

In [None]:
#from keras.callbacks import ReduceLROnPlateau
#
## Tensorboard logging structure function
#root_logdir = "../../tensorboard-logs"
#
#def get_run_logdir(root_logdir, project):
#    '''
#    Returns logdir to the Tensorboard log for a specific project.
#
#            Parameters:
#                    root_logdir (str) : basic logdir from Tensorboard
#                    project (str): projectname that will be logged in TB
#
#            Returns:
#                    os.path (str): Path to the final logdir
#    '''
#    import time
#    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
#    project_logdir = os.path.join(root_logdir,project)
#    return os.path.join(project_logdir, run_id)
#
#


In [None]:
#tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=get_run_logdir(config.root_logdir_tb,"nlp_phrase2phrase_roberta"), histogram_freq=1)

#### Training function with KFolds

In [None]:
def train_folds(train, config):
    """Train one model per stratified fold and collect out-of-fold predictions.

    Args:
        train: DataFrame with the columns 'score' and 'corpus_w_full_context'.
            A 'score_map' column (score mapped to the class ids 0-4) is added
            to it in place.
        config: settings object; ``num_folds``, ``batch_size`` and
            ``num_epochs`` are read here and it is passed on to build_model.

    Returns:
        NumPy array of length ``len(train)`` holding, for every row, the
        prediction of the fold model that had this row in its validation part.

    Side effects:
        Writes the best weights of every fold to 'model-<fold>.h5' in the
        working directory and prints progress and Pearson scores.
    """
    oof = np.zeros(len(train))

    # StratifiedKFold needs discrete classes, so the five score levels are mapped to ids.
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

    skf = StratifiedKFold(n_splits = config.num_folds,
                         shuffle = True,
                         random_state = random_state)

    for fold, (train_indexes, val_indexes) in enumerate(skf.split(train, train['score_map'])):
        print(f"Training fold: {fold + 1}")

        # split() yields positional row numbers, so rows are selected by
        # position; label-based .loc only matched them for a default 0..n-1 index.
        train_df = train.iloc[train_indexes].reset_index(drop=True)
        val_df = train.iloc[val_indexes].reset_index(drop=True)

        train_encoded = tokenize_fkt(train_df['corpus_w_full_context'].tolist(), tokenizer)
        val_encoded = tokenize_fkt(val_df['corpus_w_full_context'].tolist(), tokenizer)

        train_ds = tf.data.Dataset.from_tensor_slices((train_encoded, train_df['score'].tolist()))
        val_ds = tf.data.Dataset.from_tensor_slices((val_encoded, val_df['score'].tolist()))

        train_ds = (
            train_ds
            .shuffle(1024)
            .batch(config.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # The validation data is not shuffled, so predictions keep the order of val_df.
        val_ds = (
            val_ds
            .batch(config.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Callbacks
        checkpoint = tf.keras.callbacks.ModelCheckpoint(f'model-{fold + 1}.h5',
                                                       monitor = 'val_loss',
                                                       mode = 'min',
                                                       save_best_only = True,
                                                       save_weights_only = True,
                                                       save_freq = 'epoch',
                                                       verbose = 1)

        earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                         mode='min',
                                                         patience=3,
                                                         restore_best_weights=True)

        lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)

        pearsonr_callback = Pearsonr(val_ds, val_df['score'].values)

        # Model building and training
        model = build_model(config)
        model.fit(
            train_ds,
            validation_data = val_ds,
            epochs = config.num_epochs,
            callbacks = [
                pearsonr_callback,
                checkpoint,
                lr_callback,
                earlystopping
            ]
        )

        print('\nLoading best model weights ...')
        model.load_weights(f'model-{fold + 1}.h5')

        print('Predicting OOF ...')
        # val_ds is already batched; batch_size is not to be given for Dataset input.
        oof[val_indexes] = model.predict(val_ds, verbose=0).reshape(-1)

        score = stats.pearsonr(val_df['score'].values, oof[val_indexes])[0]
        print(f'\nFold {fold + 1}: OOF pearson_r: {score:.4f}')
        print("*" * 25)

    score = stats.pearsonr(train['score'].values, oof)[0]
    print(f'\nOverall OOF pearson_r: {score:.4f}')

    return oof

        

#### Fitting

In [None]:
# Run the cross-validated training; keeps the out-of-fold predictions for all training rows.
oof_preds = train_folds(train, config)


# Evaluation

In [None]:
def predict_folds(test, config):
    """Predict the test data with every fold model and average the results.

    Expects the weight files 'model-<fold>.h5' (fold = 1 .. num_folds) in the
    working directory.

    Args:
        test: DataFrame with the column 'corpus_w_full_context'.
        config: settings object; ``num_folds`` and ``batch_size`` are read
            here and it is passed on to build_model.

    Returns:
        1-D NumPy array: element-wise mean of the per-fold predictions.
    """
    # The encoded input, the dataset and the model architecture are identical
    # for every fold — only the weights differ — so they are built once.
    test_encoded = tokenize_fkt(test['corpus_w_full_context'].tolist(), tokenizer)
    test_ds = (
        tf.data.Dataset.from_tensor_slices(test_encoded)
        .batch(config.batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    model = build_model(config)

    preds = []
    for fold in range(config.num_folds):
        print(f'Predicting fold: {fold + 1}')

        print('Loading best trained model weights ...')
        model.load_weights(f'model-{fold + 1}.h5')

        # test_ds is already batched; batch_size is not to be given for Dataset input.
        preds.append(model.predict(test_ds, verbose=1).reshape(-1))

    return np.mean(preds, axis=0)

# Submission File

## Training on all Data

## Prediction of Test File Values

In [None]:
# Reload the sample submission as the template for the submission file.
# Uses the notebook's loader and file-name constant instead of a
# string-concatenated path; the empty DataFrame that was created first was
# overwritten immediately and has been dropped.
competition_file = load_csv_data(PHRASES_PATH, CSV_FILE_COMF)

In [None]:
# Fold-averaged predictions for the test rows.
test_prediction = predict_folds(test, config)

In [None]:
# Assigned by position — assumes the sample submission rows are in the same order as `test`; TODO confirm.
competition_file['score'] = test_prediction

In [None]:
# Quick look at the distribution of the predicted scores.
competition_file['score'].hist()

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
competition_file.to_csv('submission.csv', index=False)