# Topic Modeling for Patent data

In [1]:
import pandas as pd
import numpy as np
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [2]:
 %pylab

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from gensim import corpora, models, similarities
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# A raw corpus of different patent abstracts related electric vehicle and smart devices

In [6]:
raw_corpus = ["A self-propelled electric vehicle includes a wheeled frame having a quick connect and disconnect hitch for drivingly connecting the vehicle to a variety of wheeled devices. An individual drive for each of a pair of ground-contacting wheels includes a separate, reversible motor and a power transmission train coupled to each wheel which carries an inturned extension over which a transmission member is trained",
             "A robotic device has a base and at least one finger having at least two links that are connected in series on rotary joints with at least two degrees of freedom. A brushless motor and an associated controller are located at each joint to produce a rotational movement of a link. Wires for electrical power and communication serially connect the controllers in a distributed control network",
             "Method and system for remote monitoring of high-risk patients using artificial intelligence. A plurality of high-risk patients can be simultaneously monitored without patient intervention. A patient hears questions in the doctor's voice at each monitoring encounter and responds.The patient's responses are recorded at a remote central monitoring station and can be analyzed on line or later. ",
             "The utility model provides an electric automobile fills automatic robot in pond of discharging of getting of battery swapping station, a serial communication port, include: and a frame. Running gear sets up in the bottom of frame, is transverse movement, elevating system sets up in the centre of frame, is longitudinal motion, absorb battery mechanism, set up on elevating system, absorb the battery",              
             "An automated vehicle charging system, that may be done within a service type station, to provide for charging, recharging, or even discharging, of the batteries of an electric vehicle, and generally will include a dispenser, having a cabinet containing all of the instrumentation desired for furnishing the provision of current information relative to the charging of a vehicle",
             "This invention overcomes the disadvantages of the prior art by providing a human/machine interface (HMI) for use with machine vision systems (MVSs) that provides the machine vision system processing functionality at the sensor end of the system, and uses a communication interface to exchange control, image and analysis information with a standardized, preferably portable device that can be removed from the MVS during runtime",
             "A human-machine interface can detect when a user's ear is pulled back to initiate a plurality of procedures. Such procedures include turning on a TV using a laser attached to the user, starting an additional procedure by speaking a command, communicating with other users in environments which have high ambient noise, and interacting with the internet.",
             "The invention belongs to the technical field of automatic agricultural equipment, and particularly relates to a flexibly operable hand-eye mode spraying robot device which comprises a spray nozzle, a camera, a large mechanical arm, a small mechanical arm, a manipulator, a controller, a power source and a variable spray system.",
             "A relational artificial intelligence system is invented and developed. It comprises a relational automatic knowledge acquisition system and a relational reasoning system. The relational automatic knowledge acquisition system is a relational learning system which discovers knowledges from spreadsheet-formed databases and generates relational knowledge bases using inductive learning technique"]

In [7]:
type(raw_corpus)

list

# Corpus

In [8]:
# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in raw_corpus]

In [9]:
# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
processed_corpus

[['electric',
  'vehicle',
  'includes',
  'wheeled',
  'having',
  'connect',
  'vehicle',
  'wheeled',
  'an',
  'each',
  'includes',
  'motor',
  'power',
  'transmission',
  'each',
  'which',
  'an',
  'which',
  'transmission',
  'is'],
 ['device',
  'at',
  'least',
  'having',
  'at',
  'least',
  'two',
  'that',
  'are',
  'on',
  'with',
  'at',
  'least',
  'two',
  'motor',
  'an',
  'are',
  'at',
  'each',
  'power',
  'communication',
  'connect'],
 ['system',
  'remote',
  'monitoring',
  'high-risk',
  'patients',
  'using',
  'artificial',
  'plurality',
  'high-risk',
  'patients',
  'can',
  'be',
  'patient',
  'patient',
  'at',
  'each',
  'monitoring',
  'are',
  'at',
  'remote',
  'monitoring',
  'can',
  'be',
  'on',
  'or'],
 ['provides',
  'an',
  'electric',
  'automatic',
  'robot',
  'battery',
  'station,',
  'communication',
  'sets',
  'up',
  'frame,',
  'is',
  'elevating',
  'system',
  'sets',
  'up',
  'frame,',
  'is',
  'absorb',
  'battery'

In [10]:
from gensim import corpora

In [11]:
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(63 unique tokens: ['an', 'connect', 'each', 'electric', 'having']...)


# Bag of words_Corpus

In [12]:
print(dictionary.token2id)

{'an': 0, 'connect': 1, 'each': 2, 'electric': 3, 'having': 4, 'includes': 5, 'is': 6, 'motor': 7, 'power': 8, 'transmission': 9, 'vehicle': 10, 'wheeled': 11, 'which': 12, 'are': 13, 'at': 14, 'communication': 15, 'device': 16, 'least': 17, 'on': 18, 'that': 19, 'two': 20, 'with': 21, 'artificial': 22, 'be': 23, 'can': 24, 'high-risk': 25, 'monitoring': 26, 'or': 27, 'patient': 28, 'patients': 29, 'plurality': 30, 'remote': 31, 'system': 32, 'using': 33, 'absorb': 34, 'automatic': 35, 'battery': 36, 'elevating': 37, 'frame,': 38, 'provides': 39, 'robot': 40, 'sets': 41, 'station,': 42, 'system,': 43, 'up': 44, 'charging': 45, 'include': 46, 'information': 47, 'by': 48, 'from': 49, 'interface': 50, 'invention': 51, 'machine': 52, 'vision': 53, 'arm,': 54, 'comprises': 55, 'mechanical': 56, 'spray': 57, 'system.': 58, 'acquisition': 59, 'knowledge': 60, 'learning': 61, 'relational': 62}


In [13]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
bow_corpus

[[(0, 2),
  (1, 1),
  (2, 2),
  (3, 1),
  (4, 1),
  (5, 2),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 2),
  (10, 2),
  (11, 2),
  (12, 2)],
 [(0, 1),
  (1, 1),
  (2, 1),
  (4, 1),
  (7, 1),
  (8, 1),
  (13, 2),
  (14, 4),
  (15, 1),
  (16, 1),
  (17, 3),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 1)],
 [(2, 1),
  (13, 1),
  (14, 2),
  (18, 1),
  (22, 1),
  (23, 2),
  (24, 2),
  (25, 2),
  (26, 3),
  (27, 1),
  (28, 2),
  (29, 2),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1)],
 [(0, 1),
  (3, 1),
  (6, 2),
  (15, 1),
  (18, 1),
  (32, 1),
  (34, 2),
  (35, 1),
  (36, 3),
  (37, 2),
  (38, 2),
  (39, 1),
  (40, 1),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 3)],
 [(0, 2),
  (3, 1),
  (4, 1),
  (10, 2),
  (19, 1),
  (23, 1),
  (27, 1),
  (42, 1),
  (43, 1),
  (45, 2),
  (46, 1),
  (47, 1)],
 [(14, 1),
  (15, 1),
  (16, 1),
  (19, 2),
  (21, 2),
  (23, 1),
  (24, 1),
  (32, 1),
  (39, 1),
  (43, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 2),
  (51, 1),
  (52, 2),
  (53, 2)],
 [(0, 1),
  (6, 1),
  (12, 

# Vectorize

In [14]:
from gensim import models
# train the model
tfidf = models.TfidfModel(bow_corpus)
# transform the "system minors" string
tfidf[dictionary.doc2bow("system minors".lower().split())]

[(32, 1.0)]

In [15]:
# ii-Transformation

In [16]:
tfidf = models.TfidfModel(bow_corpus) # step 1 -- initialize a model

In [17]:
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors

[(0, 0.363988296598431), (1, 0.9314035215412129)]


In [18]:
# First only vector is used. Now we need to use whole corpus
corpus_tfidf = tfidf[bow_corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.1277905072854186), (1, 0.1635004883639025), (2, 0.23884893969527404), (3, 0.11942446984763702), (4, 0.11942446984763702), (5, 0.4776978793905481), (6, 0.0881520370325309), (7, 0.1635004883639025), (8, 0.11942446984763702), (9, 0.4776978793905481), (10, 0.327000976727805), (11, 0.4776978793905481), (12, 0.1763040740650618)]
[(0, 0.057335418503846144), (1, 0.1467146367138215), (2, 0.10716370259053697), (4, 0.10716370259053697), (7, 0.1467146367138215), (8, 0.10716370259053697), (13, 0.293429273427643), (14, 0.42865481036214786), (15, 0.10716370259053697), (16, 0.10716370259053697), (17, 0.6429822155432219), (18, 0.07910186824656905), (19, 0.10716370259053697), (20, 0.42865481036214786), (21, 0.10716370259053697)]
[(2, 0.09031868083813135), (13, 0.12365261863216138), (14, 0.1806373616762627), (18, 0.06666787558806005), (22, 0.12365261863216138), (23, 0.1806373616762627), (24, 0.1806373616762627), (25, 0.3612747233525254), (26, 0.5419120850287882), (27, 0.12365261863216138), (28, 0.

### Fitting LSI model
Here we convert/transform tf-idf i.e vectorized corpus into **semantic form**. 

In [19]:
#Now working on LSI
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

In [20]:
lsi.print_topics(2)

[(0,
  '0.342*"with" + 0.285*"interface" + 0.215*"by" + 0.210*"at" + 0.209*"include" + 0.206*"machine" + 0.206*"vision" + 0.201*"vehicle" + 0.200*"can" + 0.190*"charging"'),
 (1,
  '-0.425*"vehicle" + -0.316*"charging" + -0.305*"includes" + -0.305*"wheeled" + -0.305*"transmission" + 0.230*"with" + 0.219*"interface" + 0.172*"vision" + 0.172*"machine" + -0.164*"electric"')]

In [21]:
for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)

[(0, 0.34804502357463024), (1, -0.7217752825746777)]
[(0, 0.42740737630548503), (1, -0.04870117665578337)]
[(0, 0.38115154970617793), (1, 0.14909617060952413)]
[(0, 0.18841327164059885), (1, -0.11823704808271902)]
[(0, 0.45967188680589893), (1, -0.5448290275792899)]
[(0, 0.6514988852522544), (1, 0.38587444580507174)]
[(0, 0.6579736601614207), (1, 0.32953353150985754)]
[(0, 0.1335770045478273), (1, 0.018142975472399354)]
[(0, 0.13740045882683471), (1, 0.12571047952484052)]


## Validation part-Similarity matrix

In [22]:
doc = "electric vehicle robotics"
vec_bow = dictionary.doc2bow(doc.lower().split())
# convert the query to LSI space
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 0.2847566241711839), (1, -0.5894595002088943)]


In [23]:
# Initializing query structures
# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus_lsi]) 

In [24]:
# Performing queries
# perform a similarity query against the corpus
sims = index[vec_lsi] 
# print (document_number, document_similarity) 2-tuples
print(list(enumerate(sims))) 

[(0, -0.90172046), (1, 0.90784705), (2, 0.93656766), (3, 0.83246964), (4, 0.46616384), (5, 0.94339275), (6, 0.94050026), (7, 0.9247922), (8, 0.95148015)]


In [25]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])
# print sorted (document number, similarity score) 2-tuples
print(sims)

[(8, 0.95148015), (5, 0.94339275), (6, 0.94050026), (2, 0.93656766), (7, 0.9247922), (1, 0.90784705), (3, 0.83246964), (4, 0.46616384), (0, -0.90172046)]


# Model-2

## Latent Dirichlet Allocation, LDA

In [26]:
type(raw_corpus)

list

In [27]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                )
dtm_tf = tf_vectorizer.fit_transform(raw_corpus)
print(dtm_tf.shape)

(9, 223)


In [28]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(raw_corpus)
print(dtm_tfidf.shape)

(9, 223)


In [29]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=20,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

# Visualizing the models

In [30]:
pyLDAvis.display(pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer))

In [31]:
pyLDAvis.display(pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer))

**With sklearn installed, other MDS functions, such as MMDS and TSNE can be used for plotting if the default PCoA is not satisfactory.**

In [33]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')

In [34]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')