In [39]:
"""
Questions for Olivier-

1. How was this data collected? Read off of patents by a person? Entered by those filing the patent?
Where is the company information?
How to make sense of the location id? (is that the company?)

2. Is there some virtual database of patent applications in document form?
Tools like LayoutLM, which are used for automated invoice processing, could extract this info automatically if there is some big database
of patents in document form that people would otherwise have to go through by hand.

3. On a first pass, you can see some data inconsistencies (non-English letters, entry errors, etc.).
What's the process on data cleaning this?

4. I saw your cool paper on evaluation. I haven't had time to dive in yet, but it is obviously a wonderful standard for
evaluating everything. This isn't really a question; I just wanted to acknowledge that.
"""

"\nQuestions for Olivier-\n\n1. How was this data collected? Read off of patents by a person? Entered by those filing the patent?\nWhere is the company information?\nHow to make sense of the location id? (is that the company?)\n\n2. Is there some virtual database of patent applications in document form?\nStuff like layout LM that they use for automated invoicing could auto get this info if there's some big database of stuff \nin document form that otherwise people would have to go through?\n\n3. On a first pass, you see some data inconsistencies (non english alphabet letters, entry errors, etc)\nWhat's the process on data cleaning this?\n\n4. I saw your cool paper on evaluation stuff. I didn't have time to dive in yet but that obviously is a wonderful standard for\nevaluating everything. This isn't really a question just wanted to aknowledge that.\n"

In [40]:
#First, get the data from the disambiguation
import pandas as pd
import numpy as np
from IPython.display import display

# Upper bound on how many TSV rows are read.
data_size = 500000

# Load every column as text and order the rows by raw last name in one
# chained expression, so that each row sits next to its alphabetical neighbour.
raw_inv = pd.read_csv(
    "g_inventor_not_disambiguated.tsv",
    sep="\t",
    dtype=str,
    nrows=data_size,
).sort_values(by=["raw_inventor_name_last"])
display(raw_inv)


Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id
222806,6495058,3,0eayxg6ntbpb7i43or67xz50b,Gabrielle Holly,(Spangler) Detzel,FALSE,5mvvk75d3r1yecdwqkm47hbsc
61259,10828027,0,fl:da_ln:(tarinelli)racenet-1,Danyel,(Tarinelli) Racenet,FALSE,6bpux38ynbiuau8xgyykwjumi
35750,6920319,3,fl:ma_ln:agren-1,Mattias,?gren,FALSE,tycghlb8ec7e39nv26yaj4k4n
316994,8688758,0,fl:pe_ln:ahgren-1,Per,?hgren,FALSE,kab0alj30ub6sqq5essuf2lwn
362101,8372339,2,fl:ma_ln:ahlund-1,Mats-?ke,?hlund,FALSE,4hp04uyj6gy2anenelldzqe6x
...,...,...,...,...,...,...,...
4162,8601984,1,fl:da_ln:vreb-1,Dag,Øvrebø,FALSE,vw03ylcos0gzw2e8k7cvijkl4
478656,9990505,0,fl:er_ln:uner-1,Eric Ridvan,Üner,FALSE,8di7ozemfh90jscsgoruxkr7b
225302,6610666,0,0egryt9joungc4nyibksf2jrt,Jim,åkerblom,FALSE,dp67qweekbbwzw1tp5tn9g5ig
300292,6468680,1,fl:le_ln:akesson-1,Leif,åkesson,FALSE,pazrzq9n71i6s2sws3qy17788


In [41]:

#Levenshtein distance implementation - inspired from the internet

from functools import lru_cache

def lev_dist(a, b):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``a`` into ``b``.

    The computation is iterative (two-row dynamic programme), so it does not
    hit Python's recursion limit on long inputs the way a memoised recursive
    formulation does.

    Args:
        a: First sequence (typically a ``str``); must support ``len``,
            indexing, slicing and ``==``.
        b: Second sequence of the same kind.

    Returns:
        int: The edit distance; ``0`` when the sequences are equal.
    """
    if a == b:
        return 0

    # A shared prefix/suffix never contributes to the distance, so trim it.
    # Near-identical names then cost almost nothing to compare.
    start = 0
    shortest = min(len(a), len(b))
    while start < shortest and a[start] == b[start]:
        start += 1
    end_a, end_b = len(a), len(b)
    while end_a > start and end_b > start and a[end_a - 1] == b[end_b - 1]:
        end_a -= 1
        end_b -= 1
    a, b = a[start:end_a], b[start:end_b]

    # The distance is symmetric; keep the shorter sequence on the row axis so
    # the two working rows stay as small as possible.
    if len(a) < len(b):
        a, b = b, a
    if len(b) == 0:
        return len(a)

    # prev_row[j] = distance between the first i-1 elements of a and the
    # first j elements of b.
    prev_row = list(range(len(b) + 1))
    for i, item_a in enumerate(a, start=1):
        curr_row = [i]
        for j, item_b in enumerate(b, start=1):
            substitution = prev_row[j - 1] + (0 if item_a == item_b else 1)
            deletion = prev_row[j] + 1
            insertion = curr_row[j - 1] + 1
            curr_row.append(min(substitution, deletion, insertion))
        prev_row = curr_row
    return prev_row[-1]

In [42]:

#compare inventors to row above, creates semi realistic scenario where last names will generally be the same
#a scenario where some pre built filter has generally eliminated matches that do not have the same last name
#
# Every sample compares one row of raw_inv with the row directly above it, so
# n rows yield n - 1 samples (the first row has nothing above it).
#
# The pandas per-row overhead of iterrows() is avoided by walking plain column
# lists; the Levenshtein calls themselves are still made one pair at a time.
#
# Caveat: when either value of a pair is missing (not a str), the feature is
# left at 0, which for the distance columns is indistinguishable from
# "identical strings".

def _pair_feature(column, compare):
    """Compare each row's `column` value with the value in the row above.

    Both values are lower-cased before `compare(curr, prev)` is called.
    Returns a list of floats with one entry per adjacent row pair; pairs where
    either value is not a string yield 0.0.
    """
    values = raw_inv[column].tolist()
    return [
        float(compare(curr.lower(), prev.lower()))
        if isinstance(curr, str) and isinstance(prev, str)
        else 0.0
        for prev, curr in zip(values, values[1:])
    ]


def _same(curr, prev):
    """Return True when the two (already lower-cased) strings are equal."""
    return curr == prev


n_pairs = max(len(raw_inv.index) - 1, 0)
comp_arr = np.zeros((n_pairs, 4))
y_arr = np.zeros((n_pairs, 1))

comp_arr[:, 0] = _pair_feature('raw_inventor_name_first', lev_dist)  #firstname diff
comp_arr[:, 1] = _pair_feature('raw_inventor_name_last', lev_dist)   #last name diff
comp_arr[:, 2] = _pair_feature('deceased_flag', _same)               #is living status same?
comp_arr[:, 3] = _pair_feature('rawlocation_id', _same)              #is location same?
y_arr[:, 0] = _pair_feature('inventor_id', _same)                    #is inventor same?

num_matches = np.sum(y_arr)
print(num_matches)
#in this example, it's ~37k matches, so it's something like 8% of the dataset is matches

37046.0


In [43]:
#Split into test and train, convert to torch, and set up batches
from sklearn.model_selection import train_test_split
X = comp_arr
y = y_arr
# Fixed seed so the 80/20 split, and every number reported from it, is the
# same on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=SPLIT_SEED)
import warnings
# NOTE(review): this silences *every* warning for the rest of the kernel
# session, including ones that point at real problems. Consider narrowing it
# to the specific category/module that is noisy.
warnings.filterwarnings("ignore")

import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
class Data(Dataset):
    """Map-style dataset exposing NumPy features and labels as float32 tensors.

    Attributes are set in ``__init__``:
        X:   feature tensor built from the ``X`` array cast to float32.
        y:   label tensor built from the ``y`` array cast to float32.
        len: size of the first dimension of ``X`` (number of samples).
    """

    def __init__(self, X, y):
        features = X.astype(np.float32)
        labels = y.astype(np.float32)
        self.X = torch.from_numpy(features)
        self.y = torch.from_numpy(labels)
        # Cached once so __len__ is a plain attribute read.
        self.len = self.X.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = self.X[index]
        target = self.y[index]
        return sample, target
# Mini-batch size shared by both loaders.
batch_size = 64

# Training batches are reshuffled every epoch.
train_data = Data(X_train, y_train)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

# Evaluation does not benefit from shuffling; a fixed order keeps the test
# pass deterministic and easy to inspect.
test_data = Data(X_test, y_test)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)


In [44]:
#Set up the model class ie. the feedforward layer
from torch import nn
from torch import optim

# Layer sizes passed to NeuralNetwork below: input features, hidden units, outputs.
input_dim, hidden_dim, output_dim = 4, 10, 1
class NeuralNetwork(nn.Module):
    """Single-hidden-layer feed-forward classifier with a sigmoid output.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Number of units in the hidden layer.
        output_dim: Number of output units.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.layer_1 = nn.Linear(input_dim, hidden_dim)
        # Kaiming initialisation is matched to the ReLU applied after layer_1.
        nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="relu")
        self.layer_2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return sigmoid activations in (0, 1) for a batch of feature rows."""
        x = torch.relu(self.layer_1(x))
        # torch.sigmoid is the supported spelling; torch.nn.functional.sigmoid
        # is the legacy alias and emits a deprecation warning on older
        # PyTorch releases.
        x = torch.sigmoid(self.layer_2(x))
        return x
       
# Seed torch's global RNG before the network is built, so that the weight
# initialisation (and any later draws from the global RNG, such as DataLoader
# shuffling) repeat across Restart & Run All.
torch.manual_seed(0)
model = NeuralNetwork(input_dim, hidden_dim, output_dim)

learning_rate = 0.1
# BCELoss expects probabilities, which matches the sigmoid at the end of forward().
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [45]:
#Training loop
num_epochs = 25
loss_values = []  # per-batch training loss, appended in iteration order

# Explicitly select training mode; this architecture has no dropout or
# batch-norm layers, but it keeps the cell correct if such layers are added.
model.train()
for epoch in range(num_epochs):
    # Batch variables get their own names so they do not overwrite the
    # module-level X / y arrays defined with the train/test split.
    for X_batch, y_batch in train_dataloader:
        optimizer.zero_grad()
        pred = model(X_batch)
        loss = loss_fn(pred, y_batch)
        loss_values.append(loss.item())
        loss.backward()
        optimizer.step()
    print(epoch)

print("Done")

#crazy that running locally the training takes not even 2 minutes


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Done


In [46]:

'''I do a simple accuracy calculation here but obviously the next step is to use the library that was in that paper
because this is a terrible way of evaluating, specifically for entity resolution'''

correct = 0
total = 0
# Confusion counts for the positive ("same inventor") class. Matches are a
# small minority of the pairs, so accuracy alone says little about them.
true_pos = 0
false_pos = 0
false_neg = 0

model.eval()
with torch.no_grad():
    # Batch variables get their own names so they do not overwrite the
    # module-level X / y arrays.
    for X_batch, y_batch in test_dataloader:
        outputs = model(X_batch)
        # Threshold the sigmoid output at 0.5 -> hard 0/1 predictions.
        predicted = (outputs >= 0.5).float()
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
        true_pos += ((predicted == 1) & (y_batch == 1)).sum().item()
        false_pos += ((predicted == 1) & (y_batch == 0)).sum().item()
        false_neg += ((predicted == 0) & (y_batch == 1)).sum().item()

print(f'Accuracy {100 * correct // total}%')

# Precision/recall fall back to 0.0 when their denominator is empty.
precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
print(f'Match precision {precision:.3f}, recall {recall:.3f}')


Accuracy 98%
