In [131]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v.split(), [label]))
    return labeled

In [132]:
import os

In [133]:
dataSet = {}
fp = 'scraping/patentData/'
for item in os.listdir(fp):
    with open(fp+item) as file1:
        dataSet[item.split('_')[0]]=(file1.read(),item.split('_')[1][0])
    

In [134]:
arts =[]
tags = []
regexClaims = 'Claims(.+)Description'
regexDesc = 'Description(.+)'
for item in dataSet:
    arts.append(re.findall(regexClaims,dataSet[item][0],flags=re.DOTALL)[0])
    tags.append(dataSet[item][1])

In [135]:
X_train, X_test, y_train, y_test = train_test_split(arts, tags, random_state=0, test_size=0.7)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [136]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [137]:
model_dbow = Doc2Vec(dm=1, vector_size=100, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 305545.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 44108.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 164189.84it/s]
100%|███████████████████████████████████

In [138]:
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 100, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 100, 'Test')

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)
logreg = logreg.fit(train_vectors_dbow, y_train)
y_pred = logreg.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=['0','1']))



accuracy 0.7419354838709677
              precision    recall  f1-score   support

           0       0.72      0.95      0.82        19
           1       0.83      0.42      0.56        12

   micro avg       0.74      0.74      0.74        31
   macro avg       0.78      0.68      0.69        31
weighted avg       0.76      0.74      0.72        31



In [139]:
dataSet.keys()

dict_keys(['10011566', '10016430', '10017502', '10047072', '20020031481', '20020037258', '20030215401', '20040028930', '20040034120', '20040086467', '20060052396', '20060106015', '20070099928', '20070099997', '20070105090', '20080051433', '20080234263', '20100016340', '20100120798', '20100210617', '6162816', '6174855', '6197341', '6207188', '6219997', '6221887', '6244118', '6252113', '6253920', '6262028', '6273084', '6275294', '6288250', '6291458', '6291493', '6291494', '6303788', '6350877', '6730685', '7579344', '7989460', '8038006', '8153128', '8235213'])

In [140]:
re.findall('Claims(.+)Description',dataSet['10011566'][0],flags=re.DOTALL)[0]

'\n\n\nThe invention claimed is:\n\n1. A compound of formula (I): ##STR00307## wherein: R.sup.1 is H or (CO)R.sup.4; R.sup.2 is C.sub.1-6 alkyl, cyclopropyl, CH.sub.2-cyclopropyl, or NR.sup.5R.sup.6, wherein said C.sub.1-6 alkyl is optionally substituted with OH or C.sub.1-6 alkoxy and said CH.sub.2-cyclopropyl is optionally substituted with halo, OH, CN or C.sub.1-6 alkoxy; R.sup.3 is H, C.sub.1-6 alkyl, C.sub.1-6 alkoxy, halo or CN; R.sup.4 is: H; C.sub.1-6 alkyl optionally substituted with (R.sup.7).sub.a; C.sub.3-7 cycloalkyl optionally substituted with halo, C.sub.1-6 alkyl, OH, CN, C.sub.1-6 alkoxy, or C.sub.1-3 alkyl-OR.sup.8; heterocycloalkyl optionally substituted with C.sub.1-6 alkyl or OH; C.sub.1-6 alkoxy; or NHR.sup.13; a is 1, 2 or 3; R.sup.6 is H, C.sub.1-6 alkyl or C.sub.3-7 cycloalkyl; R.sup.7 is independently selected from the group consisting of halo, OH, CN, C.sub.1-6 alkoxy, NR.sup.9R.sup.10, C.sub.3-7 cycloalkyl, heterocycloalkyl and aryl, wherein said C.sub.3-7 c

In [141]:
[len(x) for x in arts]

[406925,
 1252,
 7387,
 1275,
 4192,
 3524,
 7339,
 5256,
 5293,
 5008,
 43229,
 9566,
 2174,
 1621,
 14033,
 6824,
 27312,
 16888,
 26490,
 22154,
 2439,
 6207,
 5495,
 1981,
 2076,
 1169,
 5721,
 4232,
 2237,
 16682,
 3136,
 12350,
 2200,
 5990,
 1159,
 3676,
 2002,
 1841,
 4493,
 9242,
 6253,
 2763,
 2491,
 6138]