In [None]:
#Import libraries

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils
from keras.layers import Activation
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback, ModelCheckpoint
import random
import sys
import io
import tensorflow as tf

import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re 
from nltk.corpus import stopwords   
from nltk.tokenize import word_tokenize, sent_tokenize 
import nltk

# `warnings` is not among the imports listed above, so it has to be imported
# before use; without this line a fresh kernel stops here with a NameError.
import warnings

# Hide library warnings in the cell outputs.
warnings.filterwarnings("ignore")
# NLTK resources: the stop-word lists and the "punkt" tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
## Connection to the Drive
# Mount Google Drive at /content/drive so files stored on Drive can be read by path.

from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on the shared drive.
# NOTE(review): this path spells the folder "Shared drives" (with a space), while
# the read_csv call in the data-selection cell uses "Shareddrives"; `dataroot` is
# not referenced in the visible cells — confirm which spelling is intended.
dataroot = "/content/drive/Shared drives/ING3 IA: Use Case 1 (NLP Patent)"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Data selection: choose the publications CSV and load it into `data`

publications_csv = "/content/drive/Shareddrives/ING3 IA: Use Case 1 (NLP Patent)/data_clean/all_fr_join_publications.csv"
# Alternative input file:
#publications_csv = "/content/drive/Shareddrives/ING3 IA: Use Case 1 (NLP Patent)/data_clean/35_en_reduce_publications.csv"
data = pd.read_csv(publications_csv)

In [None]:
# Reduce data to claims and description + strip the markup from the text

# Select, drop incomplete rows and copy in one chain. Calling
# dropna(inplace=True) on a column slice and then assigning columns to it is the
# pattern pandas flags with SettingWithCopyWarning; an explicit copy makes the
# frame independent of the loaded one.
data = data[['claim', 'description']].dropna(axis=0).copy()

# Name the parser explicitly ("lxml", the one text_cleaner also uses) so the
# result does not depend on which parsers happen to be installed and
# BeautifulSoup does not have to guess.
data['description'] = [
    BeautifulSoup(raw_description, "lxml").get_text()
    for raw_description in data['description']
]
data['claim'] = [
    BeautifulSoup(raw_claim, "lxml").get_text()
    for raw_claim in data['claim']
]

In [None]:
## Function to clean one text string

def text_cleaner(text):
    """Return a normalised, lower-case, single-spaced version of `text`.

    Steps, in order: lower-case the input, extract its plain text with
    BeautifulSoup's "lxml" parser, delete every "(...)" group whose content
    holds no ")", delete every double-quote character, and finally collapse
    each run of whitespace into one space.
    """
    plain = BeautifulSoup(text.lower(), "lxml").text
    # Patterns are removed in this order: parenthesised groups, then double quotes.
    for pattern in (r'\([^)]*\)', '"'):
        plain = re.sub(pattern, '', plain)
    return " ".join(plain.split()).strip()

# Cleaned claims and cleaned descriptions, one entry per row of `data`
cleaned_claim = [text_cleaner(raw_text) for raw_text in data['claim']]

cleaned_descr = [text_cleaner(raw_text) for raw_text in data['description']]

#LSTM

In [None]:
# Pair each cleaned claim with its cleaned description in the frame used by the model cells

claim_descr_pairs = list(zip(cleaned_claim, cleaned_descr))
df = pd.DataFrame(claim_descr_pairs, columns=['claim', 'descr'])
#df = df.head(3000)
df

Unnamed: 0,claim,descr
0,a method for transmitting a channel sounding r...,technical fieldembodiments of the disclosure r...
1,a method for transmitting a sounding reference...,technical fieldthe present invention relates t...
2,a self-contained subframe configuration method...,technical fieldembodiments of the present inve...
3,"a method for transmitting information, compris...",technical fieldthe present invention relates t...
4,an information transmission method based on a ...,technical fieldthe present disclosure relates ...
...,...,...
2995,a computer-implemented method of assembling pa...,field of the inventionthe invention relates to...
2996,"a system, comprising:a scriber to form a plura...",cross reference to related applicationsthis in...
2997,a method for writing data comprising a sequenc...,technical fieldthe present inventive concept r...
2998,"a method, comprising:identifying a function de...",claim of prioritythis application claims prior...


In [None]:
# Training corpus: the first 1% of the descriptions, joined with single spaces

corpus_size = int(len(df['descr']) * 0.01)
descr = ' '.join(str(text) for text in df['descr'][:corpus_size])

descr[:100]

'technical fieldembodiments of the disclosure relate to the field of wireless communication, and more'

In [None]:
#len(descr)

3215460

In [None]:
# Character vocabulary: the distinct characters of the corpus in sorted order,
# with lookup tables character -> index and index -> character

chars = sorted(set(descr))
print('Count of unique characters (i.e., features):', len(chars))
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

Count of unique characters (i.e., features): 77


In [None]:
# Cut the corpus into overlapping windows and record the character that follows each one

maxlen = 40  # number of characters in one input window
step = 3     # distance between the starts of consecutive windows
window_starts = range(0, len(descr) - maxlen, step)
sentences = [descr[start: start + maxlen] for start in window_starts]
next_chars = [descr[start + maxlen] for start in window_starts]
print('Number of sequences:', len(sentences), "\n")

print(sentences[:10], "\n")
print(next_chars[:10])

Number of sequences: 1071807 

['technical fieldembodiments of the disclo', 'hnical fieldembodiments of the disclosur', 'cal fieldembodiments of the disclosure r', ' fieldembodiments of the disclosure rela', 'eldembodiments of the disclosure relate ', 'embodiments of the disclosure relate to ', 'odiments of the disclosure relate to the', 'ments of the disclosure relate to the fi', 'ts of the disclosure relate to the field', 'of the disclosure relate to the field of'] 

['s', 'e', 'e', 't', 't', 't', ' ', 'e', ' ', ' ']


In [None]:
# Build x and y for the training: one-hot encodings of the windows and of
# the character following each window.
#   x: (len(sentences), maxlen, len(chars))   y: (len(sentences), len(chars))

# `np.bool` was a deprecated alias of the builtin `bool` and no longer exists
# in NumPy >= 1.24; the builtin yields the same boolean dtype on every version.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [None]:
# Define the LSTM model: two stacked 128-unit LSTM layers followed by a
# softmax over the character vocabulary.

model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(maxlen, len(chars))))
# Keep the default tanh activation on the recurrent layer. With
# activation='relu' the recurrent state is unbounded and can grow at every
# time step; the training run recorded in this notebook diverged that way
# (loss around 8.8e13, unreadable generated text). The default activation is
# also one of the conditions Keras lists for using its cuDNN LSTM kernel.
model.add(LSTM(128))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

In [None]:
# Define an optimizer and compile the model with a categorical cross-entropy loss

# `learning_rate` is the supported keyword; the older `lr` spelling is
# deprecated and is rejected by recent Keras releases.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature rescaling.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: positive values rescale the log-probabilities before
            sampling (smaller values concentrate the draw on the most likely
            entries); zero or a negative value selects the most likely index
            deterministically.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Limit case: dividing by a zero temperature is undefined, so pick the
        # most likely index directly instead of sampling.
        return np.argmax(preds)
    scaled = np.log(preds) / temperature
    # Shift so the largest entry is 0 before exponentiating. The softmax value
    # is unchanged, but at small temperatures this keeps every exponent from
    # underflowing to 0, which would otherwise turn the division into 0/0.
    scaled = scaled - scaled.max()
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, logs=None):
    """Epoch-end hook: print generated text after the 1st and 10th epoch.

    Args:
        epoch: zero-based index of the epoch that just finished.
        logs: accepted for the callback signature; not used.

    On the selected epochs a random seed window of `maxlen` characters is taken
    from `descr` and, for each diversity value, 400 further characters are
    generated with `model` and `sample` and written to stdout. On every other
    epoch only a short notice is printed. Relies on the notebook-level names
    `descr`, `maxlen`, `chars`, `char_indices`, `indices_char`, `model` and
    `sample`.
    """
    # The hook receives a 0-based index while the training log counts from 1;
    # use the 1-based number for both the schedule and the messages so the
    # printed epoch matches the training log.
    epoch_number = epoch + 1
    if epoch_number not in (1, 10):
        print()
        print('----- Not generating text after Epoch: %d' % epoch_number)
        return

    print()
    print('----- Generating text after Epoch: %d' % epoch_number)

    # One seed position shared by all diversity values.
    start_index = random.randint(0, len(descr) - maxlen - 1)
    for diversity in [0.2, 0.3, 0.4, 0.5]:
        print('----- diversity:', diversity)

        sentence = descr[start_index: start_index + maxlen]
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(sentence)

        for _ in range(400):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Wrap on_epoch_end in a LambdaCallback so it can be passed to model.fit via `callbacks`.
generate_text = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
# Checkpoint: monitor the training loss and keep only the best (lowest) weights
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1,
                             save_best_only=True, mode='min')
training_callbacks = [generate_text, checkpoint]

# Fit the model with the first GPU as the device
with tf.device('/gpu:0'):
    model.fit(x, y, batch_size=128, epochs=1, verbose=2,
              callbacks=training_callbacks)

8374/8374 - 2246s - loss: 88127964184576.0000

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "nt, the case where the number of user te"
nt, the case where the number of user teg dtsh m,ctercshrcts bc,rrhtflq,tc aectrlr aehssqralmcpch  t eg.dhpi ccln elnscslqp gc eiaebttcchcrelotcl ,lcloogltuax pnqctrtcsren cntslrclolyfcrel  rheclsr c vglccho o acocaul,por, s gcem,ccorsdggcncss   vmcehk eoc .cct mrt gheqcclholss t   mmcttts  latsulsrem cat cccpo chtlyrtmm cdcgyr cl  arc  0 rmpt cohceonn i rbs  grg csccstl ccgonvlt,aepyce ep c  m lclttrlcf hlsrhc  ccts2 elpl l ce gcmp ho 
----- diversity: 0.3
----- Generating with seed: "nt, the case where the number of user te"
nt, the case where the number of user teg dtsh m,ctercshrcts bc,rrhtflq,tc aectrlr aehssqralmcpch  t eg.dhpi ccln elnscslqp gc eiaebttcchcrelotcl ,lcloogltuax pnqctrtcsren cntslrclolyfcrel  rheclsr c vglccho o acocaul,por, s gcem,ccorsdggcncss   vmcehk eoc .cct mrt gheqcclholss t   mmcttts 