# Initial EDA, train/dev split and modelling

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import transformers
from numpy.random import default_rng
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

# Single seeded generator shared by the notebook so random steps are repeatable on a fresh run.
rng = default_rng(seed=42)

In [2]:
# Raw value of KAGGLE_KERNEL_RUN_TYPE; falls back to "" (falsy) when the variable is unset.
# NOTE(review): presumably this variable is only set inside Kaggle kernels - confirm.
is_kaggle = os.getenv("KAGGLE_KERNEL_RUN_TYPE", "")

In [3]:
# Directory holding the competition files, given relative to the notebook's working directory.
data_dir = Path("../input/us-patent-phrase-to-phrase-matching")

In [4]:
# Load the labelled training pairs (columns shown below: id, anchor, target, context, score).
df = pd.read_csv(data_dir.joinpath("train.csv"))

In [5]:
# Peek at the first five rows to check the columns loaded as expected.
df.head(n=5)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


## train/dev split ensuring independent anchors

Motivated by this thread: https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/315220

In [6]:
# Split at the anchor level (not the row level): each unique anchor phrase is
# assigned to either the training set or the dev set.
anchors = df.anchor.unique()
rng.shuffle(anchors)  # in-place shuffle using the notebook's seeded generator

# Fraction of unique anchors that go to the training set; the rest form the dev set.
train_frac = 0.8
# Derive the cut point from train_frac so that changing the constant above
# actually changes the split (it was previously hard-coded a second time here).
split_idx = int(train_frac * anchors.shape[0])
train_anchors = anchors[:split_idx]
dev_anchors = anchors[split_idx:]

In [7]:
# Boolean row masks: a row is selected for a split when its anchor was assigned to that split.
train_idcs = df["anchor"].isin(train_anchors)
dev_idcs = df["anchor"].isin(dev_anchors)

train_df = df[train_idcs]
dev_df = df[dev_idcs]

# Sanity check: the two splits together account for every row of the original frame.
assert len(df) == len(train_df) + len(dev_df)
print(f"{len(train_df)} training samples, {len(dev_df)} dev samples")

29287 training samples, 7186 dev samples


## Modelling

## Vanilla PatentSBERTa 

In [8]:
# Load the PatentSBERTa checkpoint by name and use it as-is (no fine-tuning in this section).
model = SentenceTransformer(model_name_or_path="AI-Growth-Lab/PatentSBERTa")

In [9]:
# Embed the anchor phrase of every dev row (one embedding per row, in dev_df order).
anchor_embeddings = model.encode(dev_df["anchor"].tolist())

In [10]:
# Embed the target phrase of every dev row (one embedding per row, in dev_df order).
target_embeddings = model.encode(dev_df["target"].tolist())

In [11]:
# scipy's `cosine` is a distance, so similarity = 1 - distance.
# Rows are paired positionally: the i-th anchor embedding with the i-th target embedding.
cosine_sims = [
    1 - cosine(anchor_vec, target_vec)
    for anchor_vec, target_vec in zip(anchor_embeddings, target_embeddings)
]

In [12]:
# First five dev rows, for eyeballing against the similarities shown in the next cell.
dev_df.head(n=5)

Unnamed: 0,id,anchor,target,context,score
49,8ff16a96af7558f0,abnormal position,abnormal position data,B23,0.5
50,edfb98a9ff1d471b,abnormal position,attitude,B23,0.25
51,5fd26b0436ca8d94,abnormal position,closed position shown,B23,0.25
52,0d32e5f72293247d,abnormal position,condition illustrated,B23,0.25
53,8aedbf799717e3e7,abnormal position,condition shown,B23,0.25


In [13]:
# Similarities for the same five rows displayed above.
cosine_sims[0:5]

[0.8634754419326782,
 0.29139581322669983,
 0.7115518450737,
 0.49723607301712036,
 0.517238199710846]

In [14]:
# Pearson correlation between predicted similarities and the labelled scores on the dev set;
# element [0, 1] of the 2x2 correlation matrix is the cross-correlation of the two inputs.
np.corrcoef(cosine_sims, dev_df["score"])[0, 1]

0.5499931036856311