In [0]:
import nltk
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader,TensorDataset
import pandas as pd
import itertools
import copy
from langdetect import detect
from langdetect import detect_langs
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import json
import torch.nn as nn

from sklearn.model_selection import train_test_split
# Porter stemmer instance; nothing else in the visible notebook references it.
stemmer = nltk.stem.PorterStemmer()
# Fetch the NLTK resources used during preprocessing: the stop-word lists
# (stopwords.words) and the 'punkt' tokenizer models used by nltk.word_tokenize.
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
!pip install langdetect



In [0]:
# Fix both random number generators so runs are repeatable: torch drives weight
# initialisation and DataLoader shuffling, numpy drives the train/test permutations.
torch.manual_seed(4)
np.random.seed(10)

In [0]:
def x(a, b):
    """Return a new list holding every item of ``a`` followed by every item of ``b``."""
    merged = list(a)
    merged.extend(b)
    return merged

In [0]:
def empty(empty):
    """Map the empty string to ``np.nan``; return any other value unchanged."""
    if empty == '':
        return np.nan
    return empty

In [0]:
class ConvolutionalNetwork(nn.Module): #the classification net
    """Three-stream convolutional binary classifier for one embedded document.

    The input is a ``(batch, 1, seq_len, emb_dim)`` tensor. Three parallel
    convolutions with kernel heights 3, 4 and 5 (each spanning the full
    embedding width) are applied, passed through ReLU, flattened, concatenated
    and fed to a single linear unit followed by a sigmoid, so the output is a
    ``(batch, 1)`` tensor of values in [0, 1].

    Args:
        seq_len: number of rows of the input document matrix (default 516).
        emb_dim: number of columns, i.e. the embedding width (default 5).
        n_filters: output channels of each convolution (default 32).

    The defaults reproduce the sizes that used to be hard-coded, so
    ``ConvolutionalNetwork()`` builds exactly the same layers as before.

    Raises:
        ValueError: if ``seq_len`` is smaller than the tallest kernel (5).
    """

    def __init__(self, seq_len=516, emb_dim=5, n_filters=32):
        super().__init__()
        if seq_len < 5:
            raise ValueError("seq_len must be at least 5 (the tallest kernel height)")
        # nn.Conv2d(input channels, output channels, kernel size, stride)
        self.stream1_conv = nn.Conv2d(1, n_filters, (3, emb_dim), 1)
        self.stream2_conv = nn.Conv2d(1, n_filters, (4, emb_dim), 1)
        self.stream3_conv = nn.Conv2d(1, n_filters, (5, emb_dim), 1)

        # A kernel of height k leaves seq_len - k + 1 rows and a single column,
        # so the three flattened streams hold n_filters * (rows1 + rows2 + rows3) values.
        flat_features = n_filters * ((seq_len - 2) + (seq_len - 3) + (seq_len - 4))
        self.fc1 = nn.Linear(flat_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, X):
        """Return the sigmoid score for each document in the batch ``X``."""
        streams = [
            # Flatten per sample so the batch dimension is never folded into
            # the feature dimension when an input has an unexpected size.
            F.relu(conv(X)).flatten(start_dim=1)
            for conv in (self.stream1_conv, self.stream2_conv, self.stream3_conv)
        ]
        return self.sigmoid(self.fc1(torch.cat(streams, 1)))


In [0]:
class SkimGram(nn.Module):#the embedding-representation formation net
    """Two-layer linear network used to learn word embeddings.

    ``u`` projects a vocabulary-sized vector down to ``emb_szs`` dimensions and
    ``v`` projects it back to vocabulary size.

    Args:
        emb_szs: size of the embedding produced by ``u``.
        vocabulary_size: length of the input (and output) vectors.
    """

    def __init__(self, emb_szs, vocabulary_size):
        super().__init__()
        self.u = nn.Linear(vocabulary_size, emb_szs)
        self.v = nn.Linear(emb_szs, vocabulary_size)

    def forward(self, x):
        """Return log-probabilities over the vocabulary for ``x``.

        Normalisation runs over the last dimension, so a single 1-D vector
        gives the same result as before and a ``(batch, vocabulary)`` input is
        normalised per row instead of across the batch.
        """
        return F.log_softmax(self.v(self.u(x)), dim=-1)

    def representation(self, x): #obtain embedding
        """Return the embedding of ``x`` (the output of ``u`` only)."""
        return self.u(x)


In [0]:
#Perform all natural-language preprocessing of the raw patent data and encode it for the models
class Preprocess(object):
    def __init__(self,threshold=0.7):
        self.df = pd.DataFrame({'A' : [np.nan]})
        self.df.dropna(inplace=True)
        self.threshold=threshold
        self.df_values = pd.DataFrame({'A' : [np.nan]})
        self.df_values.dropna(inplace=True)


        self.idx_pairs = []
        self.vocabulary = [1]
        self.idx_pairs_Doc = []
        self.new_data=[]

    def Reading(self,input_file):
        #read from input file data set and re-format for model input
        with open(input_file, encoding="utf8") as f:
            
            data = f.readlines()
            data = [json.loads(line) for line in data] #convert string to dictionary format

        object_value = [i['object'] for i in data]#first category in the data , this is related to the subject of the patent, such as title, summary etc
        actor_value = [i['actor'] for i in data]#second category in the data, that is the authors and related detailes
        df_objects= pd.DataFrame(object_value,columns='filingDate id objectType publicationDate publicationNumber status summary title'.split()) #convert data to pandas series

        actors=[]
        dictionar={}
        check=[]
        temp=[] #convert actor data to pandas series
        for actor in actor_value:

            appending_location = [i['location']['displayName'] for i in actor]

            appending_location = "|".join(appending_location)

            appending_name = [i['name'] for i in actor]
            check=x(check,appending_name)
            appending_name = "|".join(appending_name)

            dictionar['name']=appending_name
            dictionar['location']=appending_location
            temp.append(dictionar)
            actors.append(copy.copy(temp[0]))

        df_actors= pd.DataFrame(actors,columns='name location'.split())
        #define outputs bases on Patent=1 and anything else as 0, as asked from the question
        df_objects.at[df_objects['status']=='Patented Case', 'status'] = 1
        df_objects.at[df_objects['status']!=1, 'status'] = 0
        #foreign languages, in this case French
        df_objects.at[:, 'summary'] =df_objects['summary'].apply(self.language_filter)
        #Obtain key dates from submission date, note publication data and number are not considered since they are marked after the patent has been classified
        df_objects['filingDate'] = df_objects['filingDate'].apply(self.timestamp)

        if self.df.empty: #for the initial data set
            self.df=pd.concat( [df_objects , df_actors], axis=1)
          #self.df.drop('A',axis=1,inplace = True)
        else:
            
           #for the case of storing additional data then simply concatenate them to the old data
            df = pd.concat( [df_objects , df_actors], axis=1)
            self.df = pd.concat( [self.df , df], axis=0)
        #create the patent document from all the input data
        self.merge_data()
    
    def language_filter(self,corpus):
        #remove any French language from the text
        if corpus=='':
            return ''
        qq=detect_langs(corpus)
        if qq[0].lang=='en':
            if qq[0].prob>0.9:
                return corpus
            else:
                ENcorpus = corpus.split('(FR)')[0]
                return ENcorpus
        else:

            return corpus.split('(FR)')[0]
      
    def timestamp(self,times): #obtain month day and year
        
        if type(times)==str:
            if times=='':
                return ''

            tmp = times.split()

            pq = tmp[1] + ' ' + tmp[2] + ', ' + tmp[-1] 
            return pq
        else:
            return np.nan
      
    def merge_data(self,flag=0):
    #creates patent document (the entire input to the model), flag=0 for the starting data, flag=1 for new data 
        if flag==0:#pre-processing for the case of retraining
            self.df_values = pd.DataFrame({'A' : [np.nan]})
            self.df_values.dropna(inplace=True)
            columns=['id','filingDate','location','name','title','summary']
            merge_data= self.df[columns.pop(0)]
            for i in columns:
                merge_data = merge_data + ' | ' + self.df[i] #merge each piece of information using a vertical column as seperator

            merge_data = pd.concat([merge_data, self.df['status']], axis=1)
            merge_data.columns= ['x_input', 'y_output']

            if self.df_values.empty:
                self.df_values=merge_data

            else:
                self.df_values = pd.concat( [self.df_values , merge_data], axis=1)
        else: #pre-processing for the case of prediction

            columns=['id','filingDate','location','name','title','summary']
            merge_data= self.new_data[columns.pop(0)]
            for i in columns:
                merge_data = merge_data + ' | ' + self.new_data[i]
              #for the case of prediction there is no output label hence only need to encode in case of validation based on new data
            if 'status' in self.new_data.columns:
                merge_data = pd.concat([merge_data, self.new_data['status']], axis=1)
                merge_data.columns= ['x_input', 'y_output']
            else:
                merge_data.columns= ['x_input']
            return merge_data

    def preprocess_input(self,entries=[],flag=0 ):
        
        #create tokens and vocabulary out of the input data and then encode the text data for the skim gram and classifier net"""
        if flag==0:
            entries = self.df_values['x_input']

        token_entries = [nltk.word_tokenize(str(entry)) for entry in self.df_values['x_input']]

        clean_token_entries = [[word for word in token_entry if word.lower() not in  stopwords.words('english')] for token_entry in token_entries]
        window_range=1
        vocabulary = []

        for entry in clean_token_entries:
            for token in entry:
                if token not in vocabulary:
                    vocabulary.append(token)

        vocabulary.append('<e>')#this is a padding token in order to make each document have the same input size
        word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
        idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

        vocabulary_size = len(vocabulary)
        window_size = 2
        idx_pairs = []
        idx_pairs_Doc=[]
        for entry in clean_token_entries:
            indices = [word2idx[w] for w in entry]
            indices_Doc = []

            for center_word in range(len(indices)):
                for window in range(-window_range,window_range+1):
                    check=center_word+window

                    if check < 0 or check > len(indices)-1 or check == center_word:
                        continue

                    context_word_idx = indices[center_word]
                    idx_pairs.append((indices[check], context_word_idx))
                    indices_Doc.append([indices[check], context_word_idx])
            idx_pairs_Doc.append(indices_Doc)

        idx_pairs.append((word2idx['<e>'],word2idx['<e>']))       
        idx_pairs = np.array(idx_pairs)


        lengths = [len(i) for i in np.array(idx_pairs_Doc)]
        padding_token = [word2idx['<e>'],word2idx['<e>']]
        self.padding_token = padding_token
        for i in range(len(idx_pairs_Doc)):
            copies = [padding_token]*(max(lengths)-len(idx_pairs_Doc[i]))

            idx_pairs_Doc[i] = idx_pairs_Doc[i]+copies

            idx_pairs_Doc[i] = [np.array(tuple(k)) for k in idx_pairs_Doc[i]]

        idx_pairs_Doc = np.array(idx_pairs_Doc)
        if flag==0:
            self.idx_pairs = idx_pairs #for skim gram model
            self.vocabulary = vocabulary
            self.idx_pairs_Doc = idx_pairs_Doc#for classification model
        else:
            self.idx_pairs_new = idx_pairs #for skim gram model
            self.vocabulary_new = vocabulary
            self.idx_pairs_Doc_new = idx_pairs_Doc #for classification model
    def get_input_layer(self,word_idx):
        #convert to one hot vector based on index on the vocabulary"
        x = torch.zeros(len(self.vocabulary)).float()
        x[int(word_idx.data.numpy()[0])] = 1.0
        #x[int(word_idx)] = 1.0
        return x


In [0]:
class DataPrep(Preprocess):
    def __init__(self):
        """Initialise the inherited preprocessing state with its defaults."""
        super().__init__()

    def Segment(self):
        """Shuffle the encoded data and split it 20 % test / 80 % train.

        Two independent splits are produced:

        * skip-gram pairs (``self.idx_pairs``) -> ``X_test`` / ``X_train`` and
          the unzipped ``input_*`` / ``output_*`` lists;
        * documents (``self.idx_pairs_Doc``) with their labels
          (``self.df_values['y_output']``) -> ``X_*_conv`` / ``y_*_conv`` and
          the ``input_*_conv`` / ``output_*_conv`` lists.

        Every sample lands in exactly one of the two parts of its split.
        """
        # ---- skip-gram pairs -------------------------------------------------
        pairs = np.random.permutation(self.idx_pairs)
        test_set = round(len(pairs)/5)
        self.X_test = pairs[:test_set]
        self.X_train = pairs[test_set:]  # starts right after the test part

        self.input_Train = [data for (data, target) in self.X_train]
        self.output_Train = [target for (data, target) in self.X_train]
        # The padding pair is always part of the training set.
        self.input_Train.append(self.padding_token[0])
        self.output_Train.append(self.padding_token[1])

        self.input_Test = [data for (data, target) in self.X_test]
        self.output_Test = [target for (data, target) in self.X_test]

        # ---- documents for the convolutional net -----------------------------
        labels = np.array(self.df_values['y_output'])
        docs = np.asarray(self.idx_pairs_Doc)[:len(labels)]
        # Shuffle documents and labels together through one index permutation;
        # permuting [document, label] rows directly would need a ragged array.
        order = np.random.permutation(len(labels))
        X_conv = docs[order]
        y_conv = labels[order]

        test_set = round(len(X_conv)/5)
        self.X_test_conv = X_conv[:test_set]
        self.y_test_conv = y_conv[:test_set]
        self.X_train_conv = X_conv[test_set:]
        self.y_train_conv = y_conv[test_set:]

        self.input_Train_conv = list(self.X_train_conv)
        self.output_Train_conv = list(self.y_train_conv)
        self.input_Test_conv = list(self.X_test_conv)
        self.output_Test_conv = list(self.y_test_conv)
        
    def install_data(self,input_data):
        """Add the contents of another JSON file to the stored data set.

        Thin wrapper around ``Reading``: ``input_data`` is the path of a JSON
        file in the same format, labels (``status``) included.
        """
        #Adds additional data to the original pile of data
        #Json file input is implied, labels are to be included there as well
        self.Reading(input_data)

    def New_Data_Read(self,input_file,FLAG=0):
        """Load and encode a new data set for validation or prediction.

        The file has the same JSON-lines format as the one used by ``Reading``.
        The cleaned rows are stored in ``self.new_data``, turned into documents
        with ``merge_data(flag=1)``, encoded with ``preprocess_input(flag=1)``
        and finally arranged by ``segment_new_data``.

        Args:
            input_file: path of the UTF-8 encoded JSON-lines file.
            FLAG: 0 (default) for validation - ``status`` is converted to the
                0/1 label and kept; any other value for prediction - the
                ``status`` column is dropped.
        """
        self.new_data=[]

        with open(input_file, encoding="utf8") as f:
            # One JSON document per line; blank lines are skipped.
            data = [json.loads(line) for line in f if line.strip()]

        object_value = [record['object'] for record in data]
        actor_value = [record['actor'] for record in data]
        df_objects = pd.DataFrame(object_value,columns='filingDate id objectType publicationDate publicationNumber status summary title'.split())

        # One row per patent: all actor names / locations joined with '|'.
        actors = [
            {
                'name': "|".join(person['name'] for person in actor),
                'location': "|".join(person['location']['displayName'] for person in actor),
            }
            for actor in actor_value
        ]
        df_actors = pd.DataFrame(actors,columns='name location'.split())

        if FLAG==0:  # validation case: keep the label as 1 / 0
            # (`.at` only addresses a single cell, so it cannot take a mask.)
            df_objects['status'] = (df_objects['status'] == 'Patented Case').astype(int)
        else:  # prediction case: no label
            df_objects = df_objects.drop(columns='status', errors='ignore')

        df_objects['summary'] = df_objects['summary'].apply(self.language_filter)
        df_objects['filingDate'] = df_objects['filingDate'].apply(self.timestamp)
        self.new_data = pd.concat([df_objects, df_actors], axis=1)
        merged_data = self.merge_data(flag=1)

        self.preprocess_input(merged_data,flag=1)
        if 'y_output' in merged_data.columns:
            self.y_values = merged_data['y_output']
            self.segment_new_data(flag=1) #validation case
        else:
            self.segment_new_data(flag=0) #prediction case

    def segment_new_data(self,flag=1):
        """Arrange the encoded new documents for validation or prediction.

        Args:
            flag: 1 (default) for validation - documents and labels
                (``self.y_values``) are shuffled together and stored as
                ``self.X_new`` ``(n_docs, doc_len, 2)`` and ``self.y_new``.
                Any other value for prediction - ``self.X_new`` becomes
                ``(n_docs, 1, doc_len, 2)`` in the original row order, so the
                i-th prediction belongs to the i-th input row.
        """
        docs = np.asarray(self.idx_pairs_Doc_new)
        if flag==1: #validation
            labels = np.array(self.y_values)
            docs = docs[:len(labels)]
            # One index permutation keeps every document paired with its label.
            order = np.random.permutation(len(labels))
            self.X_new = docs[order]
            self.y_new = labels[order]
        else: #prediction
            # One group per document: prediction() loops over the groups and
            # then over the documents inside each group.
            self.X_new = docs[:, np.newaxis]

In [0]:
class Method(DataPrep):
    
    def __init__(self,embedding_dims = 5):
        """Set up the data pipeline and remember the embedding size.

        Args:
            embedding_dims: embedding size handed to ``SkimGram`` in
                ``define_models`` (default 5).
        """
        super().__init__()
        self.embedding_dims = embedding_dims

    def define_models(self):
        """Create fresh, untrained models.

        ``self.skim_gram`` is sized from ``self.embedding_dims`` and the current
        vocabulary; ``self.model`` is the convolutional classifier with its
        default configuration.
        """
        vocabulary_size = len(self.vocabulary)
        self.skim_gram = SkimGram(emb_szs=self.embedding_dims, vocabulary_size=vocabulary_size)
        self.model = ConvolutionalNetwork()

    def continuous_learning(self):
        """Randomly reorder the classifier's training documents and labels in unison.

        ``self.input_Train_conv`` and ``self.output_Train_conv`` are replaced by
        lists reordered with one shared random permutation drawn from NumPy's
        global generator.

        The previous implementation also embedded every document (assuming a
        fixed 516 x 5 shape) and ran a full SVD, but only the number of rows of
        the result was used. That work is dropped: the permutation drawn for a
        given NumPy seed is unchanged, and the method no longer depends on the
        document shape.
        """
        # zip() semantics: only positions present in both lists are considered.
        n_train = min(len(self.input_Train_conv), len(self.output_Train_conv))
        order = np.random.choice(n_train, n_train, replace=False).argsort()
        self.input_Train_conv = [self.input_Train_conv[i] for i in order]
        self.output_Train_conv = [self.output_Train_conv[i] for i in order]
  
    def accuracy(self,test_data, model):
        """Return the fraction of samples whose most likely class is the target.

        Args:
            test_data: iterable of ``(data, target)`` tensor batches, e.g. a
                ``DataLoader``; ``data`` holds word indices, ``target`` the
                expected output indices.
            model: network mapping a one-hot vector to per-class scores.

        Returns:
            Accuracy in [0, 1]; 0.0 when ``test_data`` yields no samples.
        """
        total = 0
        correct = 0

        with torch.no_grad():
            for (data, target) in test_data:
                # Score every sample of the batch, not just its first element.
                for word_idx, true_idx in zip(data.view(-1), target.view(-1)):
                    scores = model(self.get_input_layer(word_idx.view(1)).float())
                    total += 1
                    # Compare the predicted class *index* (argmax) with the
                    # target index; the maximum score itself is not a class.
                    correct += int(scores.argmax().item() == int(true_idx.item()))

        return float(correct) / total if total else 0.0
    def accuracy_conv(self,test_data, model):
        """Return the classification accuracy of ``model`` on encoded documents.

        Args:
            test_data: pair ``[documents, labels]``; each document is embedded
                with ``data_Representation`` before being passed to ``model``.
            model: classifier returning a probability; values >= 0.5 count as
                class 1, anything else as class 0.

        Returns:
            Accuracy in [0, 1]; 0.0 when there are no documents.
        """
        total = 0
        correct = 0

        with torch.no_grad():
            for (data, target) in zip(test_data[0], test_data[1]):
                inputX = self.data_Representation(data)
                # The model expects (batch, channels, rows, columns).
                y_pred = model(inputX.view(1,1,inputX.shape[0],inputX.shape[1]))
                pred = 1 if y_pred.data.numpy()[0][0] >= 0.5 else 0

                total += 1
                correct += int(pred == target)

        return float(correct) / total if total else 0.0

    def training_skim_gram(self, learning_rate=0.001,bts=1, num_epochs=3):
        """Train ``self.skim_gram`` with SGD on the word-pair training set.

        Args:
            learning_rate: SGD learning rate.
            bts: batch size of the train and test loaders.
            num_epochs: maximum number of epochs.

        Side effects:
            * ``self.losses_SG``: mean training loss of each epoch (floats).
            * ``self.epoch_accuracies_SG``: test accuracy after each epoch.
            * ``self.epoch_model_SG``: independent copy of the model after each epoch.
            * ``'skim_gram.pt'``: weights of the best epoch so far (they are
              saved but not loaded back at the end).

        Training stops early once more than 10 epochs have run and the accuracy
        from 10 epochs earlier is higher than the latest one.

        Returns:
            ``(best_accuracy, best_epoch)``.
        """
        testloader = DataLoader( TensorDataset(torch.Tensor(self.input_Test),torch.Tensor(self.output_Test)), batch_size=bts, shuffle=False)
        trainloader = DataLoader(TensorDataset(torch.Tensor(self.input_Train),torch.Tensor(self.output_Train)), batch_size=bts, shuffle=True)
        optimizer = torch.optim.SGD(self.skim_gram.parameters(), lr=learning_rate)

        self.epoch_accuracies_SG = []
        self.epoch_model_SG = []
        self.losses_SG=[]
        best_epoch = 0
        best_accuracy = 0.0

        for epoch in range(num_epochs):
            running_loss = 0.0
            n_batches = 0
            for (data, target) in trainloader:
                # Class indices as a 1-D long tensor, one per sample of the batch.
                y_true = target.long().view(-1)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                # One forward pass per sample, stacked to (batch, vocabulary).
                y_pred = torch.stack([
                    self.skim_gram(self.get_input_layer(word_idx.view(1)).float())
                    for word_idx in data.view(-1)
                ])
                loss = F.nll_loss(y_pred, y_true)

                loss.backward()
                optimizer.step()
                # .item() keeps a plain float instead of the autograd graph.
                running_loss += loss.item()
                n_batches += 1

            self.losses_SG.append(running_loss / n_batches if n_batches else float('nan'))
            self.epoch_accuracies_SG.append(self.accuracy(testloader, self.skim_gram))
            # deepcopy: a shallow copy would share (and keep following) the live weights.
            self.epoch_model_SG.append(copy.deepcopy(self.skim_gram))
            if self.epoch_accuracies_SG[-1] > best_accuracy:
                best_accuracy = self.epoch_accuracies_SG[-1]
                best_epoch = epoch
                torch.save(self.skim_gram.state_dict(), 'skim_gram.pt') #save best weight values

            #early stopping here
            if (len(self.epoch_accuracies_SG)>10 and self.epoch_accuracies_SG[epoch-9]>self.epoch_accuracies_SG[-1]):
                break

        return best_accuracy, best_epoch
  
    def data_Representation(self,x):
        """Embed one encoded document with the skip-gram representation layer.

        Args:
            x: iterable of index pairs; for each pair the first index is turned
                into a one-hot vector (``get_input_layer``) and embedded with
                ``self.skim_gram.representation``.

        Returns:
            Tensor with one embedding row per pair. An empty document yields a
            ``torch.zeros(25, 25)`` placeholder (NOTE(review): that shape looks
            arbitrary - confirm it is what callers expect).

        The whole computation runs without gradient tracking, which previously
        applied to the first row only.
        """
        x = list(x)
        if not x:
            return torch.zeros(25,25)

        with torch.no_grad():
            rows = [
                self.skim_gram.representation(
                    self.get_input_layer(torch.Tensor(pair)).float()
                ).view(1,-1)
                for pair in x
            ]
            # Concatenate once instead of growing the tensor row by row.
            return torch.cat(rows, 0)
    
    def training_ConvolutionNet(self,learning_rate=0.001, num_epochs=15):
        """Train the convolutional classifier ``self.model`` with SGD and BCE loss.

        Documents are embedded one at a time with ``data_Representation`` and
        fed to the model as a batch of one.

        Args:
            learning_rate: SGD learning rate (now actually passed to the optimizer).
            num_epochs: maximum number of epochs.

        Side effects:
            * ``self.losses_conv``: mean training loss of each epoch (floats).
            * ``self.epoch_accuracies_conv``: test accuracy after each epoch.
            * ``self.epoch_model_conv``: independent copy of the model after each epoch.
            * ``'conv_model.pt'``: classifier weights of the best epoch so far
              (saved, not loaded back at the end).

        Training stops early once more than 10 epochs have run and the accuracy
        from 10 epochs earlier is higher than the latest one.

        Returns:
            ``(best_accuracy, best_epoch)``.
        """
        self.epoch_accuracies_conv = []
        self.epoch_model_conv = []
        self.losses_conv = []
        best_epoch = 0
        best_accuracy = 0.0
        criterion = nn.BCELoss()
        optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

        for epoch in range(num_epochs):
            running_loss = 0.0
            n_docs = 0
            # Run the training
            for (X_train, y_train) in zip(self.input_Train_conv, self.output_Train_conv):
                inputX = self.data_Representation(X_train)
                y_pred = self.model(inputX.view(1,1,inputX.shape[0],inputX.shape[1]))
                # BCELoss needs the target in the same shape as the prediction.
                y_true = torch.tensor(float(y_train), dtype=torch.float).view_as(y_pred)

                loss = criterion(y_pred, y_true)

                # Update parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # .item() keeps a plain float instead of the autograd graph.
                running_loss += loss.item()
                n_docs += 1

            self.losses_conv.append(running_loss / n_docs if n_docs else float('nan'))
            self.epoch_accuracies_conv.append(self.accuracy_conv([self.input_Test_conv,self.output_Test_conv], self.model))
            # deepcopy: a shallow copy would share (and keep following) the live weights.
            self.epoch_model_conv.append(copy.deepcopy(self.model))
            if self.epoch_accuracies_conv[-1] > best_accuracy:
                best_accuracy = self.epoch_accuracies_conv[-1]
                best_epoch = epoch
                # Checkpoint the classifier itself (not the embedding net).
                torch.save(self.model.state_dict(), 'conv_model.pt')#save best weight values

            #early stopping here
            if (len(self.epoch_accuracies_conv)>10 and self.epoch_accuracies_conv[epoch-9]>self.epoch_accuracies_conv[-1]):
                break

        return best_accuracy, best_epoch
  
    def prediction(self,test_data,model):
        """Predict a 0/1 label for every document in ``test_data``.

        Args:
            test_data: nested iterable - groups of encoded documents; each
                document is embedded with ``data_Representation``.
            model: classifier returning a probability; values >= 0.5 map to 1,
                anything else to 0.

        Returns:
            Flat list of predicted labels in iteration order.
        """
        output = []
        with torch.no_grad():
            for group in test_data:
                for document in group:
                    inputX = self.data_Representation(document)
                    y_pred = model(inputX.view(1,1,inputX.shape[0],inputX.shape[1]))
                    probability = y_pred.data.numpy()[0][0]
                    output.append(1 if probability>=0.5 else 0)
        return output
    def Test_on_New_Data(self):
        """Return the classifier's accuracy on the loaded new (labelled) data.

        Prints ``'Need to retrain'`` when the accuracy is below ``self.threshold``.
        """
        new_data_accuracy = self.accuracy_conv([ self.X_new, self.y_new],self.model)
        needs_retraining = new_data_accuracy<self.threshold
        if needs_retraining:
            print('Need to retrain')
        return new_data_accuracy
    def Predict(self):
        """Return the predicted 0/1 labels for the new data held in ``self.X_new``.

        Thin wrapper: delegates to ``prediction`` with the classifier ``self.model``.
        """
    
    #perform prediction on new data points"
        return self.prediction( self.X_new,self.model)


In [0]:
# Build the pipeline object. Method() only takes embedding_dims, so the
# retraining threshold keeps the Preprocess default (0.7).
architecture = Method()
# Load and clean the training file, then build the merged documents.
architecture.Reading('uspto.json')
# Tokenise the documents and encode them as index pairs (sets the vocabulary).
architecture.preprocess_input()
# Create the models; the skip-gram net is sized from the vocabulary built above.
architecture.define_models()
# Shuffle and split both the word pairs and the documents into train/test sets.
architecture.Segment()

In [0]:
# Train the embedding net with the method's defaults (learning_rate=0.001,
# bts=1, num_epochs=3); the cell shows the returned (best_accuracy, best_epoch).
architecture.training_skim_gram()

In [0]:
# One training-loss entry per completed epoch, recorded by training_skim_gram().
architecture.losses_SG

In [0]:
# Per-epoch model snapshots appended by training_skim_gram().
architecture.epoch_model_SG

In [0]:
# Reorder the classifier's training documents and labels (in unison) before training.
architecture.continuous_learning()

In [0]:
# Train the convolutional classifier; the cell shows the returned (best_accuracy, best_epoch).
architecture.training_ConvolutionNet(learning_rate=0.01)

In [0]:
# One training-loss entry per completed epoch, recorded by training_ConvolutionNet().
architecture.losses_conv

In [0]:
# Test-set accuracy after each epoch, recorded by training_ConvolutionNet().
architecture.epoch_accuracies_conv

In [0]:
# float() accepts plain numbers as well as 0-d tensors; tensors that still
# require grad cannot be converted by matplotlib directly.
skim_gram_losses = [float(loss) for loss in architecture.losses_SG]
plt.plot(range(len(skim_gram_losses)), skim_gram_losses)
plt.ylabel('Loss')
plt.xlabel('epoch')
plt.title('Training  Loss vs epoch for skim-gram model');

In [0]:
# float() accepts plain numbers as well as 0-d tensors; tensors that still
# require grad cannot be converted by matplotlib directly.
conv_losses = [float(loss) for loss in architecture.losses_conv]
plt.plot(range(len(conv_losses)), conv_losses)
plt.ylabel('Loss')
plt.xlabel('epoch')
plt.title('Training Loss vs epoch for convonlutional neural net model');

In [0]:
# Test accuracy of the classifier after each epoch (one loss entry exists per epoch,
# so the loss list provides the x positions).
epoch_positions = range(len(architecture.losses_conv))
conv_accuracies = architecture.epoch_accuracies_conv
plt.plot(epoch_positions, conv_accuracies)
plt.xlabel('epoch')
plt.ylabel('Test Accuracy ')
plt.title('Test Accuracy (in scale 0-1) vs epoch for convonlutional neural net model')

***How to call the API on new data: validation (labels provided) or prediction (no labels)***

In [0]:
# Validation: FLAG defaults to 0, so the 'status' labels are kept. The training
# file is reused here as the new data set.
architecture.New_Data_Read('uspto.json')#for validation data

In [0]:
# Returns the accuracy on the new data and prints 'Need to retrain' when it is
# below the threshold.
architecture.Test_on_New_Data() #validation

In [0]:
# Prediction: FLAG=1 drops the 'status' column, so the data is treated as unlabelled.
architecture.New_Data_Read('uspto.json',FLAG=1)#for predictive data

In [0]:
# Returns the list of predicted 0/1 labels for the data loaded by New_Data_Read.
architecture.Predict()#predict on new data

In [0]:
# Preview the stored training data; .head() avoids dumping the whole frame
# into the notebook output.
architecture.df.head()