# How similar are patent documents?

In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): with no category argument this also hides library
# deprecation warnings (e.g. from scikit-learn or gensim) - consider
# narrowing the filter when debugging version problems.
warnings.filterwarnings('ignore')

In [2]:
import sklearn
import pandas as pd

In [3]:
# Define the documents: three free-text patent descriptions that are
# compared against each other in the rest of the notebook.
# The strings are used verbatim (doc_patent1 starts with a space and
# doc_patent2 has no final period).

# Patent 1: motive power system with two energy storages.
doc_patent1 = " A motive power system includes a first energy storage, a second energy storage, an actuator, an internal combustion engine, a power transmission circuit, and circuitry. The circuitry is configured to control the power transmission circuit in a charge-depleting mode such that the first energy storage supplies to the actuator a first electric energy that is stored in the first energy storage with a first charge rate range and the second energy storage supplies to the actuator a second electric energy that is stored in the second energy storage with a second charge rate range. The first charge rate range is larger than the second charge rate range."

# Patent 2: pump / flow / pressure circuits for a waste heat recovery system.
doc_patent2 = "An apparatus includes a pump circuit structured to receive pump data indicative of an operating characteristic of a pump feeding a fluid to a waste heat recovery (WHR) system; a flow circuit structured to receive valve position data indicative of a position of a valve downstream of the pump, estimate a flow rate of the fluid exiting the pump, and estimate the flow rate of the fluid exiting the valve; and a pressure circuit structured to receive pressure data indicative of the pressure of the fluid exiting the valve, estimate a change in pressure of the fluid across the WHR system, and determine a pressure of the fluid in a hot section of the WHR system based on the pressure of the fluid exiting the valve and the change in the pressure of the fluid across the WHR system"

# Patent 3: AI-based route guidance for electric vehicles.
doc_patent3 = "The present invention provides specific systems, methods and algorithms based on artificial intelligence expert system technology for determination of preferred routes of travel for electric vehicles (EVs). The systems, methods and algorithms provide such route guidance for battery-operated EVs in-route to a desired destination, but lacking sufficient battery energy to reach the destination from the current location of the EV. The systems and methods of the present invention disclose use of one or more specifically programmed computer machines with artificial intelligence expert system battery energy management and navigation route control. Such specifically programmed computer machines may be located in the EV and/or cloud-based or remote computer/data processing systems for the determination of preferred routes of travel, including intermediate stops at designated battery charging or replenishing stations."

# Corpus order fixes the row order (doc_1, doc_2, doc_3) of every matrix below.
documents = [doc_patent1, doc_patent2, doc_patent3]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

### Create the Document Term Matrix using count vector

In [5]:
# Fit a plain term-count vectorizer on the three documents.
# No stop-word filtering is applied, so common words such as "the" and
# "and" stay in the vocabulary; pass stop_words='english' here to drop them.
count_vectorizer = CountVectorizer()
# Learn the vocabulary and return the sparse document-term count matrix.
sparse_matrix = count_vectorizer.fit_transform(documents)

In [6]:
# Convert the sparse count matrix to a labelled pandas DataFrame
# (one row per document, one column per vocabulary term).
doc_term_matrix = sparse_matrix.todense()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    count_terms = count_vectorizer.get_feature_names_out()
else:
    count_terms = count_vectorizer.get_feature_names()
df1 = pd.DataFrame(doc_term_matrix,
                   columns=count_terms,
                   index=['doc_1', 'doc_2', 'doc_3'])

In [7]:
# Display the count-based document-term matrix.
df1

Unnamed: 0,across,actuator,algorithms,an,and,apparatus,artificial,at,based,battery,...,the,to,transmission,travel,use,valve,vehicles,waste,whr,with
doc_1,0,3,0,2,2,0,0,0,0,0,...,10,3,2,0,0,0,0,0,0,2
doc_2,2,0,0,2,4,1,0,0,1,0,...,20,4,0,0,0,5,0,1,4,0
doc_3,0,0,2,0,5,0,2,1,2,4,...,9,2,0,2,1,0,1,0,0,1


### Create the Document Term Matrix using TFIDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighting with the vectorizer's default settings.
tfidf_vec = TfidfVectorizer()
# Learn the vocabulary and IDF weights, then return the sparse
# document-term matrix of TF-IDF scores.
sparse_matrix2 = tfidf_vec.fit_transform(documents)

In [9]:
# Convert the sparse TF-IDF matrix to a labelled pandas DataFrame
# (one row per document, one column per vocabulary term).
doc_term_matrix2 = sparse_matrix2.todense()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back on older releases.
if hasattr(tfidf_vec, 'get_feature_names_out'):
    tfidf_terms = tfidf_vec.get_feature_names_out()
else:
    tfidf_terms = tfidf_vec.get_feature_names()
df2 = pd.DataFrame(doc_term_matrix2,
                   columns=tfidf_terms,
                   index=['doc_1', 'doc_2', 'doc_3'])

In [10]:
# Display the TF-IDF document-term matrix.
df2

Unnamed: 0,across,actuator,algorithms,an,and,apparatus,artificial,at,based,battery,...,the,to,transmission,travel,use,valve,vehicles,waste,whr,with
doc_1,0.0,0.168452,0.0,0.085408,0.066327,0.0,0.0,0.0,0.0,0.0,...,0.331636,0.099491,0.112302,0.0,0.0,0.0,0.0,0.0,0.0,0.085408
doc_2,0.085083,0.0,0.0,0.064708,0.100503,0.042541,0.0,0.0,0.032354,0.0,...,0.502514,0.100503,0.0,0.0,0.0,0.212707,0.0,0.042541,0.170166,0.0
doc_3,0.0,0.0,0.121866,0.0,0.17994,0.0,0.121866,0.060933,0.092682,0.243732,...,0.323892,0.071976,0.0,0.121866,0.060933,0.0,0.060933,0.0,0.0,0.046341


In [11]:
# Pairwise cosine similarity between the raw term-count rows.
# Entry [i][j] compares document i with document j; the diagonal is 1.
from sklearn.metrics.pairwise import cosine_similarity
count_similarity = cosine_similarity(df1, df1)
print(count_similarity)

[[1.         0.39124822 0.33810571]
 [0.39124822 1.         0.54836016]
 [0.33810571 0.54836016 1.        ]]


doc1-doc1: 1<br>
doc1-doc2: 0.39<br>
doc1-doc3: 0.34<br>
doc2-doc3: 0.55


In [12]:
# Same pairwise comparison, this time on the TF-IDF weighted rows.
tfidf_similarity = cosine_similarity(df2, df2)
print(tfidf_similarity)

[[1.         0.22040014 0.181574  ]
 [0.22040014 1.         0.35666664]
 [0.181574   0.35666664 1.        ]]


In [13]:
# Cross comparison: rows are the count vectors (df1), columns are the
# TF-IDF vectors (df2), so entry [i][j] is cos(count_i, tfidf_j).
# Because the two sides use different weightings, the matrix is not
# symmetric and its diagonal is not exactly 1.
cross_similarity = cosine_similarity(df1, df2)
print(cross_similarity)

[[0.97800802 0.30874895 0.24667818]
 [0.27775698 0.97428903 0.416671  ]
 [0.24628059 0.46241639 0.9763846 ]]


In [14]:
import gensim
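# NOTE(review): softcossim appears to exist only in gensim 3.x (>= 3.4); it was
# removed in gensim 4.0, so pin a 3.x release if this import fails.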
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess

In [15]:
# Tokenise each document with simple_preprocess, then build the gensim
# Dictionary (token -> integer id mapping) from the token lists.
tokenized_documents = [simple_preprocess(text) for text in documents]
dictionary = corpora.Dictionary(tokenized_documents)

In [16]:
# Fetch the pre-trained FastText vectors through the gensim downloader API.
FASTTEXT_MODEL_NAME = 'fasttext-wiki-news-subwords-300'
fasttext_model300 = api.load(FASTTEXT_MODEL_NAME)

In [17]:
# Build the term-to-term similarity matrix from the FastText embeddings,
# restricted to the terms present in `dictionary`.
# NOTE(review): KeyedVectors.similarity_matrix looks like a gensim 3.x API
# (removed in 4.0 in favour of SparseTermSimilarityMatrix) - confirm the
# installed gensim version before running.
similarity_matrix = fasttext_model300.similarity_matrix(
    dictionary,
    tfidf=None,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

In [18]:
# Turn each patent text into a bag-of-words vector (list of
# (token_id, count) pairs) over the shared dictionary.
sent_1, sent_2, sent_3 = (
    dictionary.doc2bow(simple_preprocess(text))
    for text in (doc_patent1, doc_patent2, doc_patent3)
)

In [19]:
# Collect the three bag-of-words vectors in document order.
sentences = [sent_1, sent_2, sent_3]

In [20]:
# Soft cosine similarity between patent 1 and patent 2, using the
# embedding-based term similarity matrix.
soft_similarity_1_2 = softcossim(sent_1, sent_2, similarity_matrix)
print(soft_similarity_1_2)

0.5970213005009702


In [21]:
# Soft cosine similarity between patent 1 and patent 3.
soft_similarity_1_3 = softcossim(sent_1, sent_3, similarity_matrix)
print(soft_similarity_1_3)

0.5562908793513186


In [22]:
# Soft cosine similarity between patent 2 and patent 3.
soft_similarity_2_3 = softcossim(sent_2, sent_3, similarity_matrix)
print(soft_similarity_2_3)

0.7073904982640719
