# Initial EDA, train/dev split and modelling

In [1]:
import os
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd
import transformers
from numpy.random import default_rng
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

# Seeded NumPy generator; it drives the anchor shuffle in the train/dev split,
# so the split is reproducible when the notebook is run top to bottom.
rng = default_rng(42)

In [2]:
# Holds the value of KAGGLE_KERNEL_RUN_TYPE, or "" when the variable is unset,
# so it is a string that is only meaningful as a truthy/falsy flag.
# NOTE(review): not referenced by any of the visible cells.
is_kaggle = os.environ.get("KAGGLE_KERNEL_RUN_TYPE", "")

In [3]:
# Directory containing the competition files, relative to the working directory.
# NOTE(review): the path is hard-coded and does not consult `is_kaggle`.
data_dir = Path("../input/us-patent-phrase-to-phrase-matching")

In [4]:
# Load the labelled training pairs (id, anchor, target, context, score).
df = pd.read_csv(data_dir / "train.csv")

In [5]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


## train/dev split ensuring independent anchors

Motivated by this thread: https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/315220

In [6]:
# Split at the anchor level so that no anchor phrase appears in both train and dev.
anchors = df.anchor.unique()
# In-place shuffle using the seeded generator. Re-running this cell without
# re-creating `rng` advances the generator state and yields a different split.
rng.shuffle(anchors)
train_frac = 0.8
# Use `train_frac` here (previously a duplicated literal 0.8) so that changing
# the fraction above actually changes the split.
split_idx = int(train_frac * anchors.shape[0])
train_anchors = anchors[:split_idx]
dev_anchors = anchors[split_idx:]

In [7]:
# Row-level boolean masks derived from the anchor-level split.
train_idcs = df["anchor"].isin(train_anchors)
dev_idcs = df["anchor"].isin(dev_anchors)

# Select the rows belonging to each partition.
train_df = df.loc[train_idcs]
dev_df = df.loc[dev_idcs]

# Sanity check: the two partitions together account for every row of `df`.
assert len(df) == len(train_df) + len(dev_df)
print(f"{len(train_df)} training samples, {len(dev_df)} dev samples")

29287 training samples, 7186 dev samples


## Modelling - Vanilla PatentSBERTa

In [8]:
# Pretrained PatentSBERTa sentence-embedding model; `get_sims` reads this global.
model = SentenceTransformer("AI-Growth-Lab/PatentSBERTa")

In [9]:
def get_sims(anchors: pd.Series, targets: pd.Series):
    """Return the row-wise cosine similarity between anchor and target embeddings.

    Both series are lower-cased and encoded with the notebook-level ``model``;
    row ``i`` of the anchor embeddings is compared with row ``i`` of the target
    embeddings.

    Args:
        anchors: Anchor phrases, one per row.
        targets: Target phrases, positionally aligned with ``anchors``.

    Returns:
        A list with one similarity value per anchor row.
    """
    anchor_vecs = model.encode(anchors.str.lower().tolist())
    target_vecs = model.encode(targets.str.lower().tolist())
    # scipy's `cosine` is a distance, so similarity is 1 - distance.
    return [
        1 - cosine(anchor_vecs[row, :], target_vecs[row, :])
        for row in range(anchor_vecs.shape[0])
    ]

In [None]:
# Pearson correlation between the embedding similarities and the gold scores
# on the training partition ([0, 1] picks the off-diagonal entry).
train_sbert_corr = np.corrcoef(
    get_sims(train_df["anchor"], train_df["target"]),
    train_df["score"],
)[0, 1]
train_sbert_corr

In [None]:
# Same correlation on the held-out dev partition.
dev_sbert_corr = np.corrcoef(
    get_sims(dev_df["anchor"], dev_df["target"]),
    dev_df["score"],
)[0, 1]
dev_sbert_corr

## Convert to HF Dataset for faster processing 

In [None]:
# TODO
# - tokenize

In [None]:
# Model identifier string.
# NOTE(review): presumably the checkpoint whose tokenizer/model will be loaded
# for fine-tuning; it is not used by any of the visible cells, so confirm.
model_name = "microsoft/deberta-v3-small"

In [None]:
def get_dataset_dict(df: pd.DataFrame, tokenizer: Callable):
    """Tokenize ``df`` and split the result into a train/test ``DatasetDict``.

    Args:
        df: Frame with a ``score`` column, which is renamed to ``label``.
        tokenizer: Callable passed to ``Dataset.map`` with ``batched=True``.

    Returns:
        A ``DatasetDict`` whose ``"train"`` entry holds the rows selected by
        ``trn_idxs`` and whose ``"test"`` entry holds the rows selected by
        ``val_idxs``.

    NOTE(review): ``Dataset``, ``DatasetDict``, ``trn_idxs`` and ``val_idxs``
    are neither imported nor defined in the visible cells; they must exist in
    the notebook namespace before this function is called.
    """
    ds = Dataset.from_pandas(df).rename_column("score", "label")
    # Drop only the raw columns that are actually present: `map` raises a
    # ValueError when asked to remove a column the dataset does not have, and
    # "inputs"/"section" are only there if an earlier step created them.
    raw_columns = ("anchor", "target", "context", "inputs", "id", "section")
    drop_columns = [col for col in raw_columns if col in ds.column_names]
    tok_ds = ds.map(tokenizer, batched=True, remove_columns=drop_columns)
    return DatasetDict(
        {"train": tok_ds.select(trn_idxs), "test": tok_ds.select(val_idxs)}
    )