In [2]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
%load_ext google.cloud.bigquery

import os
import re
import json
import chardet
import codecs
import time

# Disambiguation of Attorney Names

## 1. Creating a table of raw attorney names

In this step, we first remove any non-alphabetic characters, and then we create a table containing the raw attorney names alongside the processed ones.

In [2]:
# Destination table for the cleaned attorney names
dataset_id = 'data_preparation'
table_id = '5_attorney_raw'

client = bigquery.Client()
table_ref = client.dataset(dataset_id).table(table_id)

# Query settings: skip the query cache and overwrite the destination table on every run
job_config = bigquery.QueryJobConfig()
job_config.destination = table_ref
job_config.write_disposition = 'WRITE_TRUNCATE'
job_config.use_query_cache = False

# The nested REGEXP_REPLACE calls run innermost first:
#   1. drop every character that is neither a letter nor whitespace,
#   2. collapse each run of whitespace into a single space,
#   3. trim leading/trailing whitespace,
# and the result is upper-cased into `lawyer`; `raw_lawyer` keeps the untouched value.
query="""
WITH t1 AS(
SELECT *
FROM(
    SELECT UPPER(REGEXP_REPLACE(
                                REGEXP_REPLACE(
                                              REGEXP_REPLACE(correspondence_name_line_1, r'[^a-zA-Z\s]+', ''), 
                                              r'[\s]+', ' '),
                                r'(^\s+)|(\s+$)', ''
                                ) 
                ) AS lawyer,
            correspondence_name_line_1 AS raw_lawyer
    FROM `patents-public-data.uspto_oce_pair.correspondence_address` 
    GROUP BY correspondence_name_line_1 
)
GROUP BY raw_lawyer, lawyer
ORDER BY lawyer DESC
)

SELECT *
FROM t1
"""

query_job = client.query(query, job_config=job_config, location='US')
print('Query job has {} started!'.format(query_job.job_id))
# Block until the query job completes
query_job.result()
print('Job has finished!')

Query job has b311b384-530a-436f-921d-e27effcd44d2 started!
Job has finished!


### Extracting the table into a CSV file

In [None]:
# Extracting the "5_attorney_raw" table into a CSV file on Cloud Storage
client = bigquery.Client()

# Destination object: gs://<bucket>/<folder>/<file>
dest_bucket = 'uspto-data'
dest_folder = 'data_preparation'
dest_file_name = '5_attorney_raw.csv'
dest_uri = "gs://{0}/{1}/{2}".format(dest_bucket, dest_folder, dest_file_name)

# Source table (fully qualified with the project)
project_id = 'usptobias'
dataset_id = 'data_preparation'
table_id = '5_attorney_raw'
table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

extract_job = client.extract_table(table_ref, dest_uri, location='US')
print('Extract job has {} started!'.format(extract_job.job_id))
# Block until the extract job completes
extract_job.result()
print('Job has finished and table {} has been exported to {} bucket!'.format(dest_file_name, dest_bucket))

## 2. Disambiguating Using Standardization Rules

***Source***: The standardization rules have been downloaded from the following link:  
https://sites.google.com/site/patentdataproject/Home/posts/namestandardizationroutinesuploaded

We then preprocessed the rules to prepare them for our purpose. The preprocessed rules can be found in the `./stdname_rules/` directory.

In [3]:
# Reading the exported "5_attorney_raw" table into a Pandas dataframe.
# Prerequisite: "5_attorney_raw.csv" must first be downloaded from the
# "uspto-data/data_preparation" GCP bucket into the './data/' folder
# (relative to the current path).
data_folder = './data/'
raw_csv_path = data_folder + '5_attorney_raw.csv'
df_rawlawyer = pd.read_csv(raw_csv_path, low_memory=False)
print('Number of records: {:,}'.format(len(df_rawlawyer)))
df_rawlawyer.head(2)

Number of records: 383,806


Unnamed: 0,lawyer,raw_lawyer
0,ZVI BEKERMAN,ZVI BEKERMAN
1,ZOE D ZIAKA AND,ZOE D. ZIAKA AND


In [4]:
# Dropping rows with missing values, then wrapping each cleaned name in exactly one
# leading and one trailing space (the rule-based disambiguation matches space-delimited
# tokens). Stripping before padding keeps this cell idempotent: re-running it does not
# pile up additional spaces around the names.
df_rawlawyer = df_rawlawyer.dropna()
df_rawlawyer = df_rawlawyer.assign(lawyer=" " + df_rawlawyer['lawyer'].str.strip() + " ")
df_rawlawyer.shape

(383743, 2)

In [25]:
# Unpacking the archive of standardization rule files and listing them in sorted order
from zipfile import ZipFile
data_folder = './data/'
rules_dir = data_folder+'stdname_rules/'

with ZipFile(data_folder+'stdname_rules.zip', 'r') as file_ref:
    file_ref.extractall(rules_dir)

# The sorted order matters: a later cell pairs the files positionally with per-file settings
files = os.listdir(rules_dir)
files.sort()

In [27]:
# Loading standard rules into a dictionary
# Every rule line is expected to contain two double-quoted strings; the pattern captures
# the first one as group 1 and the second one as group 2.
pattern = r'^.*\"(.*?)\".*?\"(.*?)\"'
rule_regex = re.compile(pattern)
std_mapper = dict()

# One (key_group, value_group) pair per rule file, in the sorted order of `files`:
# it tells which captured group is the text to look for and which one is its replacement.
decoding = [(2, 1), 
            (1, 2), 
            (1, 2), 
            (2, 1), 
            (1, 2), 
            (1, 2)]

# NOTE: zip() stops at the shorter sequence, so `files` is assumed to hold exactly the
# rule files, in the order matching `decoding`.
for dec, file in zip(decoding, files):
    rule_path = data_folder+'stdname_rules/'+file

    # Detect the encoding from the raw bytes; the handle is closed right after reading.
    with open(rule_path, "rb") as raw_file:
        # chardet reports None when it cannot decide (e.g. an empty file); fall back to UTF-8
        encoding = chardet.detect(raw_file.read())['encoding'] or 'utf-8'

    with codecs.open(rule_path, 'r', encoding=encoding) as text_file:
        lines = text_file.readlines()
        for line_nr, line in enumerate(lines, start=1):
            matched = rule_regex.match(line)
            if matched is None:
                if line.strip():
                    # A non-blank line that is not a rule points to a malformed file
                    raise ValueError('Cannot parse rule in {} (line {}): {!r}'.format(file, line_nr, line))
                # Blank lines carry no rule
                continue
            key = matched[dec[0]].rstrip()
            value = matched[dec[1]].rstrip()
            std_mapper[key] = value

In [28]:
# Turning the parsed rules into the lookup used for standardization:
#   initial -> the text to search for, terminated by exactly one trailing space
#   mapped  -> its replacement
# The keys are right-stripped before the space is appended so that re-running this cell
# (which overwrites `std_mapper`) does not keep adding trailing spaces to the keys.
df_mapper = pd.DataFrame({'initial': pd.Series(list(std_mapper.keys()), dtype=object)})
df_mapper['initial'] = df_mapper['initial'].str.rstrip() + ' '
# NOTE(review): every replacement is set to a single space, so the replacement values parsed
# from the rule files are discarded and matched text is simply removed — confirm this is intended.
df_mapper['mapped'] = ' '
std_mapper = df_mapper.dropna().set_index('initial')['mapped'].to_dict()

df_mapper.head(3)

Unnamed: 0,initial,mapped
0,& BRO,
1,& BROTHER,
2,& C,


In [None]:
# Starting standardization
start_t = time.perf_counter()

# Series.replace(..., regex=True) compiles every dictionary key as a regular expression.
# The keys come from quoted strings in the rule files and are treated here as literal text,
# so they are escaped: otherwise characters such as "." or "(" would act as regex
# metacharacters (matching unintended text or raising re.error).
escaped_mapper = {re.escape(initial): mapped for initial, mapped in std_mapper.items()}

# The replacement is applied twice, as before; presumably so that a token sharing its
# delimiting space with a token removed in the first pass is caught by the second pass.
df_rawlawyer.lawyer = df_rawlawyer.lawyer.replace(escaped_mapper, regex=True).replace(escaped_mapper, regex=True)

end_t = time.perf_counter()
diff_t = end_t - start_t
print('Total running time was {:,.0f} hours and {:.0f} minutes!'.format(diff_t//3600, (diff_t%3600)//60))

In [None]:
# Removing the padding spaces around the standardized names
df_rawlawyer['lawyer'] = df_rawlawyer['lawyer'].str.strip()

# One row per distinct standardized name; the fresh 0-based row number becomes the
# "lawyer_id" column (placed first), and the IDs are then shifted to start at 100000
df_lawyer_id = (
    df_rawlawyer[['lawyer']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('lawyer_id')
    .reset_index()
)
df_lawyer_id['lawyer_id'] = df_lawyer_id['lawyer_id'] + 100000

print('Number of unique lawyers: {:,}'.format(len(df_lawyer_id)))
df_lawyer_id.head(2)

In [None]:
# Attaching the lawyer ID to every raw name through its standardized name
# (left join: every row of df_rawlawyer is kept)
df_lawyer_merger = df_rawlawyer.merge(df_lawyer_id, how='left', on=['lawyer'])
print('Number of records: {:,}'.format(len(df_lawyer_merger)))
df_lawyer_merger.head(3)

In [None]:
# Writing both result dataframes to local CSV files (UTF-8, without the index column)
csv_outputs = {
    './data/5_attorneyId.csv': df_lawyer_id,
    './data/5_attorney_disambiguated.csv': df_lawyer_merger,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False, encoding='utf-8')

## 3. Creating the BigQuery tables using the disambiguated attorney names

In [31]:
# Creating "5_attorneyID" table from the CSV file stored in the GCP bucket
bq_client = bigquery.Client()

# Keyword arguments are used for `mode` (and the other optional settings are left at their
# defaults) because the order of SchemaField's optional positional parameters differs
# between google-cloud-bigquery releases.
schema = [
    bigquery.SchemaField('attorney_id', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('attorney', 'STRING', mode='NULLABLE')
]

# Setting the destination table path
dataset_id = 'data_preparation'
dataset_ref = bq_client.dataset(dataset_id)
dest_table_name = '5_attorneyID'

job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.skip_leading_rows = 1  # the first CSV row is the header
job_config.source_format = bigquery.SourceFormat.CSV
# Load jobs append by default; overwrite instead so that re-running this cell does not
# duplicate rows (same disposition as the query cells of this notebook).
job_config.write_disposition = 'WRITE_TRUNCATE'
uri = "gs://uspto-data/data_preparation/5_attorneyId.csv"

load_job = bq_client.load_table_from_uri(
    uri, dataset_ref.table(dest_table_name), job_config=job_config
)
print("Starting job {}".format(load_job.job_id))
# Block until the load job completes
load_job.result()
print('Job has finished!')

Starting job a74cafd4-62ef-486f-b39f-b42a8066f58c
Job has finished!


In [33]:
# Creating "5_attorney_disambiguated" table from the CSV file stored in the GCP bucket
bq_client = bigquery.Client()

# Keyword arguments are used for `mode` (and the other optional settings are left at their
# defaults) because the order of SchemaField's optional positional parameters differs
# between google-cloud-bigquery releases.
schema = [
    bigquery.SchemaField('attorney', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('raw_attorney', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('attorney_id', 'STRING', mode='NULLABLE')
]

# Setting the destination table path
dataset_id = 'data_preparation'
dataset_ref = bq_client.dataset(dataset_id)
dest_table_name = '5_attorney_disambiguated'

job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.skip_leading_rows = 1  # the first CSV row is the header
job_config.source_format = bigquery.SourceFormat.CSV
# Load jobs append by default; overwrite instead so that re-running this cell does not
# duplicate rows (same disposition as the query cells of this notebook).
job_config.write_disposition = 'WRITE_TRUNCATE'
uri = "gs://uspto-data/data_preparation/5_attorney_disambiguated.csv"

load_job = bq_client.load_table_from_uri(
    uri, dataset_ref.table(dest_table_name), job_config=job_config
)
print("Starting job {}".format(load_job.job_id))

# Block until the load job completes
load_job.result()
print('Job has finished!')

Starting job 8b558291-172d-486e-8700-5a78986c6323
Job has finished!


## 4. Creating the final table: `5_appln_attorney`

In [34]:
# Destination of the final application-to-attorney table
project_id = 'usptobias'
dataset_id = 'data_preparation'
table_id = '5_appln_attorney'

client = bigquery.Client()
table_ref = client.dataset(dataset_id).table(table_id)

# Query settings: skip the query cache and overwrite the destination table on every run
job_config = bigquery.QueryJobConfig()
job_config.destination = table_ref
job_config.write_disposition = 'WRITE_TRUNCATE'
job_config.use_query_cache = False

# The query links every application to its disambiguated attorney through the raw
# correspondence name; rows without a disambiguated attorney are filtered out.
# {0} and {1} are filled with the project and dataset holding "5_attorney_disambiguated".
query="""
WITH rlawyerAppln_table AS(
SELECT 
    application_number AS appln_nr,
    correspondence_name_line_1 AS raw_attorney,
    correspondence_region_code AS attorney_region_code,
    correspondence_country_code AS attorney_country_code
FROM `patents-public-data.uspto_oce_pair.correspondence_address`
), lawyerMerger_table AS(
    SELECT attorney, raw_attorney, attorney_id
    FROM `{0}.{1}.5_attorney_disambiguated`
)

SELECT appln_nr, attorney, attorney_id, attorney_region_code, attorney_country_code
FROM rlawyerAppln_table
LEFT JOIN lawyerMerger_table USING(raw_attorney)
WHERE attorney IS NOT NULL
""".format(project_id, dataset_id)

query_job = client.query(query, job_config=job_config, location='US')
print('Query job has {} started!'.format(query_job.job_id))
# Block until the query job completes
query_job.result()
print('Job has finished!')

Query job has 5ba0d91a-70bd-43c4-8a0a-77a2fc725415 started!
Job has finished!
