# Environment Setup

In [1]:
# !pip install sentence_transformers
# !unzip nano-1000.zip

from sentence_transformers import SentenceTransformer
import xml.etree.cElementTree as et
import os
import numpy as np
from scipy import spatial
from google.colab import drive
import sys
from functools import lru_cache

import warnings
warnings.filterwarnings('once')

# Data Preparation



Only EP2081075A1 has no claims, and only EP2102293A2 has no abstract. I decided not to exclude them, since their remaining fields still describe the patent.

In [2]:
# Parse every patent XML file in the "nano" directory into a dict keyed by file name.
# Each entry holds:
#   "title":    text of the <Title> element, or None when the element is missing
#   "abstract": text of the <Abstract> element, or None when the element is missing
#   "claims":   list of the non-blank texts found under the <Claims> element
patents = {}
for filename in os.listdir("nano"):
  tree = et.parse(os.path.join("nano", filename))
  root = tree.getroot()
  doc = {}

  # findtext returns None for a missing element; store None instead of
  # crashing on .strip() (the embedding code already checks for None).
  title = root.findtext("Title")
  doc["title"] = title.strip("\n") if title is not None else None

  abstract = root.findtext("Abstract")
  doc["abstract"] = abstract.strip("\n") if abstract is not None else None

  doc_claims = []
  claims_root = root.find("Claims")
  if claims_root is not None:
    # iter() yields the <Claims> element itself followed by all of its descendants.
    for claim in claims_root.iter():
      text = claim.text
      # .text is None for an element without leading text; treat it like blank text.
      if text is None or len(text.strip()) == 0: continue
      doc_claims.append(text.strip("\n"))

  doc["claims"] = doc_claims
  patents[filename] = doc

print("Number of patents is ", len(patents))

Number of patents is  1000


# Sentence-Bert transformer models

In [3]:
class PatentSentenceBert:
  """
  PatentSentenceBert: computes one embedding vector per patent with a sentence-embedding model.

  Parameters
  ----------
  model: sentence-transform
      Model used to get sentence embeddings. Only its ``encode(list_of_strings)``
      method is called.

  patents: dict
      Dictionary mapping a patent id to a patent object. A patent object is a dict
      with the keys "title" (str or None), "abstract" (str or None) and
      "claims" (list of str).

  patent_as_multi_sent: bool
      If True, a patent is treated as a list of sentences whose embeddings are
      averaged to get the patent embedding. If False, all sentences of a patent
      are merged into one string that is embedded directly.
  """
  def __init__(self, model, patents, patent_as_multi_sent):
    self.model = model
    self.patents = patents
    self.patent_sent_func = self._patent_to_sentences if patent_as_multi_sent else self._patent_to_one_sentence

  @staticmethod
  def _collect_texts(patent):
    """
    _collect_texts function gathers the texts of a patent in a fixed order:
    title, abstract, then every claim. A None title or abstract is skipped.

    Parameters
    ----------
    patent: dict
        Patent object.

    Returns
    ---------
    texts: List[String]
        The texts of the patent (possibly empty).
    """
    texts = [patent[field] for field in ("title", "abstract") if patent[field] is not None]
    texts.extend(patent["claims"])
    return texts

  def _patent_to_one_sentence(self, patent, data):
    """
    _patent_to_one_sentence function merges the patent's sentences into one giant
    sentence (joined with single spaces) and appends it to data. A patent without
    any text contributes an empty string.

    Parameters
    ----------
    patent: dict
        Patent object.

    data: List[String]
        list of sentences to add the computed sentence to.

    Returns None
    """
    data.append(" ".join(self._collect_texts(patent)))
    return None

  def _patent_to_sentences(self, patent, data):
    """
    _patent_to_sentences function transforms a patent object into a list of
    sentences and appends them to data.

    A patent without any text contributes a single empty string, exactly like
    _patent_to_one_sentence does. Otherwise the patent would own no sentence at
    all and its mean embedding would be NaN.

    Parameters
    ----------
    patent: dict
        Patent object.

    data: List[String]
        list of sentences to add the computed sentences to.

    Returns None
    """
    texts = self._collect_texts(patent)
    data.extend(texts if texts else [""])
    return None

  def get_bulk_patent_embedding(self):
    """
    get_bulk_patent_embedding function computes the embeddings of all patents
    with a single call to the model, to optimize the computations.

    Returns
    ---------
    patent_embedding: dict
        A patent id to embedding vector mapping. Empty when there are no patents.
    """
    # collect all sentences and keep track of the index range owned by each patent
    patent_range = {}
    data = []
    for p_id, patent in self.patents.items():
      start = len(data)
      self.patent_sent_func(patent, data)
      patent_range[p_id] = (start, len(data))

    # nothing to encode: skip the model call entirely
    if not data:
      return {}

    # bulk encode all sentences into the sentence embedding space
    sentence_embeddings = self.model.encode(data)

    # mean-aggregate the sentence embeddings of each patent into one patent embedding
    patent_embedding = {}
    for p_id, (start, end) in patent_range.items():
      # asarray lets the slice be either an array or a list of per-sentence vectors
      p_sent_embedd = np.asarray(sentence_embeddings[start:end])
      patent_embedding[p_id] = np.mean(p_sent_embedd, axis=0)

    return patent_embedding
  

# Results

In [4]:
def get_two_most_similar_patents(patent_embeddings):
  """
  get_two_most_similar_patents function finds the two most similar patents. It
  computes the cosine distance between each unordered pair of patents in the
  embedding space and keeps the pair with the smallest distance. Progress is
  printed after every 100 patents, as a percentage of the number of patents given.

  Parameters
  ----------
  patent_embeddings: dict
    dictionary mapping from patent id to its embedding vector

  Returns
  ----------
  res: Tuple or None
      The ids of the two most similar patents and the distance between them in
      the embedding space. None when no pair could be compared (fewer than two
      patents, or no pair with a comparable distance).
  """
  min_so_far = float("inf")
  res = None
  keys = list(patent_embeddings.keys())
  total = len(keys)
  for i, p1 in enumerate(keys):
    for j in range(i + 1, total):
      p2 = keys[j]

      distance = spatial.distance.cosine(patent_embeddings[p1], patent_embeddings[p2])
      # strict '<' keeps the first pair found on ties; a NaN distance never wins
      if distance < min_so_far:
        res = (p1, p2, distance)
        min_so_far = distance

    # report progress relative to the actual number of patents, not a fixed 1000
    if (i + 1) % 100 == 0:
      print("Done with: ", (i+1) / total * 100, '%')
  return res

In [5]:
# Experiment: 'bert-base-nli-mean-tokens' model, each patent embedded as the
# mean of the embeddings of its individual sentences.
model = SentenceTransformer('bert-base-nli-mean-tokens')
bpls = PatentSentenceBert(model=model, patents=patents, patent_as_multi_sent=True)
patent_embeddings = bpls.get_bulk_patent_embedding()
print(
  "Bert transform with patent as list of sentences: ",
  get_two_most_similar_patents(patent_embeddings),
)



Done with:  10.0 %
Done with:  20.0 %
Done with:  30.0 %
Done with:  40.0 %
Done with:  50.0 %
Done with:  60.0 %
Done with:  70.0 %
Done with:  80.0 %
Done with:  90.0 %
Done with:  100.0 %
Bert transform with patent as list of sentences:  ('EP2203388A1.xml', 'EP2048116A1.xml', 0.0)


In [6]:
# Experiment: 'bert-base-nli-mean-tokens' model, each patent merged into one
# single string (title, abstract and claims joined) before embedding.
model = SentenceTransformer('bert-base-nli-mean-tokens')
bpls = PatentSentenceBert(model, patents, patent_as_multi_sent=False)
patent_embeddings = bpls.get_bulk_patent_embedding()
# Label corrected: this run uses patent_as_multi_sent=False, i.e. one sentence per patent.
print("Bert transform with patent as one sentence: ", get_two_most_similar_patents(patent_embeddings))



Done with:  10.0 %
Done with:  20.0 %
Done with:  30.0 %
Done with:  40.0 %
Done with:  50.0 %
Done with:  60.0 %
Done with:  70.0 %
Done with:  80.0 %
Done with:  90.0 %
Done with:  100.0 %
Bert transform with patent as list of sentences:  ('EP2203388A1.xml', 'EP2048116A1.xml', 0.0)


In [7]:
# Experiment: 'distiluse-base-multilingual-cased' model, each patent embedded
# as the mean of the embeddings of its individual sentences.
model = SentenceTransformer('distiluse-base-multilingual-cased')
bpls = PatentSentenceBert(model=model, patents=patents, patent_as_multi_sent=True)
patent_embeddings = bpls.get_bulk_patent_embedding()
print(
  "Bert transform with patent as list of sentences: ",
  get_two_most_similar_patents(patent_embeddings),
)

100%|██████████| 504M/504M [01:01<00:00, 8.23MB/s]


Done with:  10.0 %
Done with:  20.0 %
Done with:  30.0 %
Done with:  40.0 %
Done with:  50.0 %
Done with:  60.0 %
Done with:  70.0 %
Done with:  80.0 %
Done with:  90.0 %
Done with:  100.0 %
Bert transform with patent as list of sentences:  ('EP2057633A2.xml', 'EP2057683A2.xml', 9.417533874511719e-06)


In [8]:
# Experiment: 'distiluse-base-multilingual-cased' model, each patent merged into
# one single string (title, abstract and claims joined) before embedding.
model = SentenceTransformer('distiluse-base-multilingual-cased')
bpls = PatentSentenceBert(model, patents, patent_as_multi_sent=False)
patent_embeddings = bpls.get_bulk_patent_embedding()
# Label corrected: this run uses patent_as_multi_sent=False, i.e. one sentence per patent.
print("Bert transform with patent as one sentence: ", get_two_most_similar_patents(patent_embeddings))



Done with:  10.0 %
Done with:  20.0 %
Done with:  30.0 %
Done with:  40.0 %
Done with:  50.0 %
Done with:  60.0 %
Done with:  70.0 %
Done with:  80.0 %
Done with:  90.0 %
Done with:  100.0 %
Bert transform with patent as list of sentences:  ('EP2028662A2.xml', 'EP2031602A2.xml', 0.0)
