# Environment Setup

In [1]:
import gensim
import xml.etree.cElementTree as et
import os
import numpy as np
from scipy import spatial
from google.colab import drive
import sys
from functools import lru_cache

import warnings
warnings.filterwarnings('once')

# !unzip nano-1000.zip

# Data Preperation



Only EP2081075A1 has no claims and only EP2102293A2 has no abstract. I decided not exclude them since they have another information to describe the patent.

In [2]:
patents = {}
for filename in os.listdir("nano"):
  tree=et.parse("nano/" + filename)
  root=tree.getroot()
  doc = {}
  
  doc["title"] = root.findtext("Title")
  doc["abstract"] = root.findtext("Abstract")

  doc_claims = []
  if root.find("Claims") is not None:
    for claim in root.find("Claims").iter():
      if len(claim.text.strip()) == 0: continue
      doc_claims.append(claim.text)

  doc["claims"] = doc_claims
  patents[filename] = doc

print("Number of patents is ", len(patents))

Number of patents is  1000


# Word2Vec Model

### Model Data Preperation


Word2Vec model expects data as a list of lists of words. I am doing a mild pre-processing using gensim.utils.simple_preprocess (line). This does some basic pre-processing such as tokenization, lowercasing, removing single-letter words (e.g. "a"), etc. and returns back a list of tokens (words).

In [3]:
data = []
for p in patents:
  if patents[p]["title"] is not None:
    data.append(gensim.utils.simple_preprocess(patents[p]["title"], min_len=2, max_len=30))
  if patents[p]["abstract"] is not None:
    data.append(gensim.utils.simple_preprocess(patents[p]["abstract"], min_len=2, max_len=30))
    
  for claim in patents[p]["claims"]:
    data.append(gensim.utils.simple_preprocess(claim, min_len=2, max_len=30))

print('Number of sentences = ', len(data))

Number of sentences =  37060


### Training

In [4]:
word2vec_model = gensim.models.Word2Vec(data, size=100, min_count=5, window=5, sg=1) # skip-gram model
word2vec_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec (L2-normalized vectors.).

#  Similarity Measurement


In [5]:
@lru_cache(maxsize=1000)
def patent_to_doc(patent_id):
  """
  patent_to_doc function transforms a patent object into a list of sentences. It uses LRU cache for memoization to
  speed up computations.
  
  Parameters
  ----------
  patent_id: str
      The ID of the patent to extract it from patents dictionary
  
  Returns
  ---------
  sentences: List[str]
      list of sentences.
  """
  sentences = []
  if patents[patent_id]["title"] is not None:
    sentences.append(patents[patent_id]["title"])

  if patents[patent_id]["abstract"] is not None:
    sentences.append(patents[patent_id]["abstract"])

  for claim in patents[p]["claims"]:
    sentences.append(claim)

  return sentences

@lru_cache(maxsize=1000)
def get_doc_embedding(p_id, model):
  """
  get_doc_embedding function computes document embedding by averaging all its word embeddings. It uses LRU cache
  for memoization to speed up computations.
  
  Parameters
  ----------
  patent_id: str
      The ID of the patent to extract it from patents dictionary
  Returns
  ----------
  doc_embedding: np.array
      document embedding vector.
  """
  doc = patent_to_doc(p_id)   
  doc_embedd = []
  for sent in doc:
      tokens = gensim.utils.simple_preprocess(sent, min_len=2, max_len=30) # mild pre-processing
      sent_embedd = [model[w] for w in tokens if w in model] # remove words that is not in model vocabulary
      doc_embedd.append(np.average(sent_embedd, axis=0)) # average word embedding to get sentence embedding

  return np.average(doc_embedd, axis=0) # average sentence embedding to get document embedding

def avg_cos_similarity(patent1_id, patent2_id, model):
    """
    avg_cos_similarity function used to find the cosin similarity between two documents by using average aggregator over
    document's word embeddings.

    Parameters
    ----------
    patent1_id: str
        The ID of the first patent.
    
    patent2_id: str
        The ID of the second patent.

    Returns
    ----------
    distance: float
       The Cosine distance.
    """

    doc1_embedd = get_doc_embedding(patent1_id, model)
    doc2_embedd = get_doc_embedding(patent2_id, model)
    distance = spatial.distance.cosine(doc1_embedd, doc2_embedd)
    return distance 

  
@lru_cache(maxsize=1000)
def get_doc_tokens(p_id):
  """
  get_doc_tokens function merges the patent's sentences in on giant sentence. It uses LRU cache
  for memoization to speed up computations.

  Parameters
  ----------
  p_id: str 
      The ID of the patent to extract it from patents dictionary.

  Returns
  ----------
  tokens: str
      patent as a single sentence.
  """

  doc = patent_to_doc(p_id)
  text = ". ".join([sent for sent in doc])
  tokens = gensim.utils.simple_preprocess(text, min_len=2, max_len=30)  
  return tokens

def wmd_similarity(patent1_id, patent2_id, model):
  """
  wmd_similarity function used to find the Word Mover's Distance which uses the word embeddings of the words in two
  texts to measure the minimum amount that the words in one text need to travel in semantic space to reach the words
  of the other text. (gives better results than 1). We treated the document as bag of words.

  Parameters
  ----------
  patent1_id: str 
      The ID of the first patent

  patent2_id: str
      The ID of the second patent

  Returns
  ----------
  similarity: float
      Word Mover's Distance, the smaller the distance the closer the docuements in the embeddings space
  """
  tokens1 = get_doc_tokens(patent1_id)
  tokens2 = get_doc_tokens(patent2_id)

  similarity = model.wv.wmdistance(tokens1, tokens2)
  return similarity




### Results

In [6]:
def get_two_most_similar_patents(doc_similarity_func):
  """
  get_two_most_similar_patents function find the two most similar patents given a document similarity function. It
  computes the similarity between each pair of patents and find most similar ones
  
  Parameters
  ----------
  doc_similarity_func: function: (patent1_id:(Stirng), patent2_id: (String), model:(gensim.models))-> float)
    function used to compute distance between patent's in the embedding space. 
  
  Returns
  ----------
  res: Tuple
      The ids of the two most similar patents and the distance between them in the embedding space.
  """

  min_so_far = 1e9
  res = None
  keys = list(patents.keys())
  for i, p1 in enumerate(keys):
    for j in range(i + 1, len(keys)):
      p2 = keys[j]
      distance = doc_similarity_func(p1, p2, word2vec_model)
      if distance < min_so_far:
        res = (p1, p2, distance)
        min_so_far = distance

    if(i + 1) % 100 == 0:
      print("Done with: ", (i+1) / 1000 * 100, '%')
  return res

print("With cosine distance: ", get_two_most_similar_patents(avg_cos_similarity))

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Done with:  10.0 %
Done with:  20.0 %
Done with:  30.0 %
Done with:  40.0 %
Done with:  50.0 %
Done with:  60.0 %
Done with:  70.0 %
Done with:  80.0 %
Done with:  90.0 %
Done with:  100.0 %
With cosine distance:  ('EP1973178A2.xml', 'EP2166581A2.xml', 0.0)


In [7]:
# print("With Word Mover's Distance similarity: ", get_similar_patents(wmd_similarity))

In [8]:
print(patent_to_doc.cache_info())
print(get_doc_embedding.cache_info())

CacheInfo(hits=0, misses=1000, maxsize=1000, currsize=1000)
CacheInfo(hits=998000, misses=1000, maxsize=1000, currsize=1000)
