In [1]:
import pandas as pd
import os
import glob
import re
import numpy as np
from tqdm import tqdm

# Counting Number of Claims and their text size

## 1. Checking the Validity of Claims data in Patent dataset

### 1.1. One-to-One Correspondence between Publications and Claims
Here, we first count the number of publications that doesn't have any claims data, and then will check if there are only one claim data point per publication (we expect to get one claim per publication, since all of publications exist together in that field)

In [4]:
%%bigquery

SELECT 
    COUNT(CASE WHEN claims.text='' THEN 1 END) nb_claims_null,
    COUNT(claims.text) nb_total

FROM `patents-public-data.patents.publications_201809`, UNNEST(claims_localized) as claims
LIMIT 10

Unnamed: 0,nb_claims_null,nb_total
0,78,12844747


So, there are only **78** publications without any data for claims and thus shows that we have the claims data for almost all of the publications.

In [5]:
%%bigquery

SELECT
    COUNT(CASE WHEN nb_claims>1 THEN 1 END) nb_invalid,
    COUNT(publn_nr) nb_total
FROM (
    SELECT 
        publication_number publn_nr,
        COUNT(claims) nb_claims
    FROM `patents-public-data.patents.publications_201809`, UNNEST(claims_localized) as claims
    GROUP BY publn_nr
)
LIMIT 10

Unnamed: 0,nb_invalid,nb_total
0,0,12844747


Again, we can see that here we have one-to-one coreespondence between publication and claims for all of the publications. Thus, the quality of data is very good.

### 1.2. Structure of numbering of the claims

Here, we will first check the structure of claims for all publications after the year **2000**:

In [6]:
%%bigquery df_claims

WITH table_a AS(
SELECT 
    publication_number publn_nr, application_number appln_nr, country_code cntr_code,
    cl.text claim_text,
    kind_code, application_kind appln_kind, pct_number, publication_date publn_date
FROM `patents-public-data.patents.publications`, UNNEST(claims_localized) cl
WHERE cl.text!=''
AND publication_date>20000000
)

SELECT *
FROM table_a
LIMIT 10

In [7]:
df_claims.head(2)

Unnamed: 0,publn_nr,appln_nr,cntr_code,claim_text,kind_code,appln_kind,pct_number,publn_date
0,US-2016143905-A1,US-201514982807-A,US,1 - 11 . (canceled) \n \n \n 1...,A1,A,,20160526
1,US-2016175259-A1,US-201514804168-A,US,What is claimed is: \n \n 1 . A na...,A1,A,,20160623


In [30]:
df_claims.iloc[0,3]

'1 - 11 . (canceled) \n     \n     \n         12 . A pharmaceutical composition comprising a heterocyclic compound represented by the formula (I) or a pharmaceutically acceptable salt thereof as an active ingredient and a pharmaceutically acceptable carrier: \n       \n         \n           \n           \n               \n               \n           \n         \n         wherein \n         A is a lower alkylene group; \n       \n       \n         \n           \n           \n               \n               \n           \n         \n         in the monocyclic heterocycle containing Q is \n       \n       \n         \n           \n           \n               \n               \n           \n         \n         wherein \n         R 2′  is the following group \n       \n       \n         \n           \n           \n               \n               \n           \n         \n         wherein \n         Y 1′  is a lower alkylene group, \n         R 3′  is \n         (1) an alkyl group, \n       

In [10]:
df_claims.iloc[0,3][:200]+'...'

'1 - 11 . (canceled) \n     \n     \n         12 . A pharmaceutical composition comprising a heterocyclic compound represented by the formula (I) or a pharmaceutically acceptable salt thereof as an active...'

In [14]:
%%bigquery

WITH table_a AS(
SELECT 
    publication_number publn_nr, application_number appln_nr, country_code cntr_code,
    cl.text claim_text,
    kind_code, application_kind appln_kind, pct_number, publication_date publn_date
FROM `patents-public-data.patents.publications`, UNNEST(claims_localized) cl
WHERE cl.text!=''
AND publication_date>20000000
)

SELECT *
FROM table_a
WHERE REGEXP_EXTRACT(claim_text, r'^(\d+)\..*$') IS NOT NULL
LIMIT 2

Unnamed: 0,publn_nr,appln_nr,cntr_code,claim_text,kind_code,appln_kind,pct_number,publn_date
0,US-7332611-B2,US-38215706-A,US,1. A N-thiolated 2-oxazolidinone compound sele...,B2,A,,20080219
1,US-7758960-B2,US-38298809-A,US,1. A fiber having filaments comprising a mixtu...,B2,A,,20100720


In [21]:
%%bigquery

WITH table_a AS(
SELECT 
    publication_number publn_nr, application_number appln_nr, country_code cntr_code,
    cl.text claim_text,
    kind_code, application_kind appln_kind, pct_number, publication_date publn_date
FROM `patents-public-data.patents.publications`, UNNEST(claims_localized) cl
WHERE cl.text!=''
AND publication_date>20000000
)

SELECT *
FROM table_a
WHERE REGEXP_EXTRACT(claim_text, r'^(\d+)\s+\-.*$') IS NOT NULL
LIMIT 2

Unnamed: 0,publn_nr,appln_nr,cntr_code,claim_text,kind_code,appln_kind,pct_number,publn_date
0,US-2012321743-A1,US-201113521186-A,US,1 - 15 . (canceled),A1,A,PCT/US2011/026204,20121220
1,US-2018354590-A1,US-201715619507-A,US,1 - The embodiments of the invention in which ...,A1,A,,20181213


In [22]:
%%bigquery

WITH table_a AS(
SELECT 
    publication_number publn_nr, application_number appln_nr, country_code cntr_code,
    cl.text claim_text,
    kind_code, application_kind appln_kind, pct_number, publication_date publn_date
FROM `patents-public-data.patents.publications`, UNNEST(claims_localized) cl
WHERE cl.text!=''
AND publication_date>20000000
)

SELECT *
FROM table_a
WHERE  REGEXP_EXTRACT(claim_text, r'^(\d+)\s+\..*$') IS NOT NULL
LIMIT 2

Unnamed: 0,publn_nr,appln_nr,cntr_code,claim_text,kind_code,appln_kind,pct_number,publn_date
0,US-2011203022-P1,US-65881310-V,US,1 . A new variety of Azalea plant named ‘MNIHA...,P1,V,,20110818
1,US-2011183341-A1,US-201113075062-A,US,1 . A method of diagnosing a subject as suffer...,A1,A,,20110728


In [29]:
%%bigquery

WITH table_a AS(
SELECT 
    publication_number publn_nr, application_number appln_nr, country_code cntr_code,
    cl.text claim_text,
    kind_code, application_kind appln_kind, pct_number, publication_date publn_date
FROM `patents-public-data.patents.publications`, UNNEST(claims_localized) cl
WHERE cl.text!=''
AND publication_date>20000000
)

SELECT
    COUNT(CASE WHEN REGEXP_EXTRACT(claim_text, r'^(\d+).*$') IS NOT NULL OR 
                    REGEXP_EXTRACT(claim_text, r'^(\d+).*$')!=''
               THEN 1 END) AS nb_cases,
    COUNT(publn_nr) nb_total
    
FROM table_a
LIMIT 2

Unnamed: 0,nb_cases,nb_total
0,18429,10597145


As we can see, for the majority of publications, the structure of claims starts with an opening (e.g. "What is claimed is:...") and then followed by `\n` (new line) and numbering for each claim.
Out of **~106M** patents, only **18K** starts with a number, which we can safely ignore.

## 2. Creating Raw Table For Extraction

Here, we will create the raw table to later extract to CSV files and run the Python algorithm on it.

In [9]:
client = bigquery.Client()
# Creating Job Config
job_config = bigquery.QueryJobConfig()
#job_config.dry_run = True
job_config.use_query_cache = False
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_TRUNCATE'

# Set the destination table
dataset_id = 'data_preparation'
table_id = '9_google_patents_claims_raw'
table_ref = client.dataset(dataset_id).table(table_id)
job_config.destination = table_ref

query="""
WITH table_a AS(
SELECT * EXCEPT(row_num)
FROM(
    SELECT 
        publication_number publn_nr, 
        application_number appln_nr, 
        country_code,
        cl.text claim_text,
        kind_code, 
        application_kind appln_kind, 
        publication_date as publn_date,
        filing_date,
        grant_date,
        pct_number pct_nr, 
        ROW_NUMBER() OVER(PARTITION BY application_number ORDER BY publication_date ASC) row_num
    FROM `patents-public-data.patents.publications`, UNNEST(claims_localized) cl
    WHERE cl.text!=''
    AND publication_date>20000000 and publication_date<20150000
)
WHERE row_num=1
)
SELECT 
    appln_nr,
    publn_nr,
    country_code, 
    kind_code, 
    appln_kind,
    publn_date, 
    filing_date,
    grant_date,
    pct_nr,
    claim_text
FROM 
    table_a
"""

# Defining the query
query_job = client.query(query, location='US', job_config=job_config)

query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x1ec2d884710>

### 2.2. Exporting to CSV

In [11]:
### Exporting the Final Table
client = bigquery.Client()

# Set Source table
project_id = 'usptobias'
dataset_id = 'data_preparation'
table_id = '9_google_patents_claims_raw'
table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

# Set Destination
dest_bucket = 'uspto-data'
dest_folder = 'data_preparation'
dest_file_name = '2_google_patents_claims_raw.csv_*.gz'
dest_uri = "gs://{0}/{1}/{2}".format(dest_bucket, dest_folder, dest_file_name)

job_config = bigquery.job.ExtractJobConfig()
job_config.compression = 'GZIP'

extract_job = client.extract_table(table_ref, dest_uri, location='US', job_config = job_config)
print('Extract job has {} started!'.format(extract_job.job_id))
extract_job.result()
print('Job has finished and table {} has been exported to {} bucket!'.format(dest_file_name, dest_bucket))

Extract job has b8fe65e5-d18f-48cd-b875-e0691f4b3357 started!
Job has finished and table 2_google_patents_claims_raw.csv_*.gz has been exported to uspto-data bucket!


## 2.3. Copying CSV files from Google Bucket

In [None]:
!gsutil cp gs://uspto-data/data_preparation/2_google_patents_claims_raw.csv_*.gz ./data/

# 3. Main Algorithm to Count Number of Independent Claims Per Application ID and Their Average Number of Words

In [2]:
path = './data/'
all_files = glob.glob(path + "2_google_patents_claims_raw.csv_*.gz")

In [4]:
def mean(list_):
    return round(np.mean(np.array(list_)), 2)

## 3.1. Running the algorithm on a sample of dataset

In [7]:
df_raw = pd.read_csv(all_files[0], compression='gzip').iloc[:100]

In [9]:
df_raw.claim_text = df_raw.claim_text.str.replace(r'\s+', ' ')
df_ext = df_raw.claim_text.str.extractall(r'(\d+\s*\..*?)(?=\s+\d+\s*\.|$)').rename(columns={0:'extracts'})
df_ext.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,extracts
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,"1. A flash memory device, comprising: a memory..."
0,1,2. The flash memory device according to claim ...
0,2,3. The flash memory device according to claim ...
0,3,4. The flash memory device according to claim ...
0,4,5. The flash memory device according to claim ...
1,0,1 . An electronic accessory for an electronic ...
1,1,"2 . The electronic accessory of claim 1 , wher..."
1,2,"3 . The electronic accessory of claim 1 , wher..."
1,3,"4 . The electronic accessory of claim 1 , wher..."
1,4,"5 . The electronic accessory of claim 1 , wher..."


In [10]:
word_avg = []
claim_counts = []
idx = []
for row in df_ext.groupby(level=0):
    sents_len = []
    n_indp = 0
    for r in row[1].reset_index(level=0, drop=True).iterrows():
        txt = r[1].loc['extracts']
        if not re.match(r'.*(?i)(claim\s*\d+).*', txt):
            word_p = re.compile(r'\w+')
            sents_len.append(len(word_p.findall(txt)))
            n_indp += 1
    word_avg.append(mean(sents_len))
    claim_counts.append(n_indp)
    idx.append(row[0])
df_ = pd.merge(df_raw.drop(columns=['claim_text']),
               pd.DataFrame(dict(idx=idx, word_avg=word_avg, claim_counts=claim_counts)), 
               left_index=True, right_on='idx', how='inner').drop(columns=['idx'])
df_.head(3)

Unnamed: 0,appln_nr,publn_nr,country_code,kind_code,appln_kind,publn_date,filing_date,grant_date,pct_nr,word_avg,claim_counts
0,US-28810699-A,US-6115289-A,US,A,A,20000905,19990408,20000905,,189.0,1
1,US-201213725332-A,US-2013109227-A1,US,A1,A,20130502,20121221,0,,229.0,3
2,US-17116005-A,US-2007006105-A1,US,A1,A,20070104,20050630,0,,72.25,4


## 3.2. Running the algorithm on the whole dataset

In [None]:
df_raw = pd.DataFrame()

for file in all_files:
    f_name = re.match(r'(.*).gz', file)[1]
    df_raw = pd.read_csv(file, compression='gzip')
    
    df_raw.claim_text = df_raw.claim_text.str.replace(r'\s+', ' ')
    df_ext = df_raw.claim_text.str.extractall(r'(\d+\s*\..*?)(?=\s+\d+\s*\.|$)').rename(columns={0:'extracts'})

    word_avg = []
    claim_counts = []
    idx = []
    for row in tqdm(df_ext.groupby(level=0)):
        sents_len = []
        n_indp = 0
        for r in row[1].reset_index(level=0, drop=True).iterrows():
            txt = r[1].loc['extracts']
            if not re.match(r'.*(?i)(claim\s*\d+).*', txt):
                word_p = re.compile(r'\w+')
                sents_len.append(len(word_p.findall(txt)))
                n_indp += 1
        word_avg.append(mean(sents_len))
        claim_counts.append(n_indp)
        idx.append(row[0])
    df_ = pd.merge(df_raw.drop(columns=['claim_text']),
                   pd.DataFrame(dict(idx=idx, word_avg=word_avg, claim_counts=claim_counts)), 
                   left_index=True, right_on='idx', how='inner').drop(columns=['idx'])
    df_.to_csv(f_name+'.csv', encoding='utf-8', index=False)

  if not re.match(r'.*(?i)(claim\s*\d+).*', txt):
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 73734/73734 [03:34<00:00, 343.79it/s]
100%|██████████| 73806/73806 [03:35<00:00, 342.46it/s]
100%|██████████| 74073/74073 [03:34<00:00, 344.82it/s]
100%|██████████| 73481/73481 [03:36<00:00, 338.75it/s]
100%|██████████| 73064/73064 [03:35<00:00, 338.66it/s]
100%|██████████| 74085/74085 [03:44<00:00, 330.55it/s]
100%|██████████| 73695/73695 [03:36<00:00, 340.50it/s]
100%|██████████| 73640/73640 [03:39<00:00, 335.28it/s]
100%|██████████| 73910/73910 [03:39<00:00, 337.42it/s]
100%|██████████| 73822/73822 [03:38<00:00, 338.43it/s]
100%|██████████| 73702/73702 [03:37<00:00, 339.55it/s]
100%|██████████| 74030/74030 [03:37<00:00, 339.61it/s]
 58%|█████▊    | 42455/73779 [02:02<01:29, 348.66it/s]

## 3.3. Copying the export files to Bucket

In [None]:
!gsutil cp 2_google_patents_claims_raw.csv_*.csv gs://uspto-data/data_preparation/

# 4. Creating Final table in Google BigQuery

In [18]:
# Before running this line, make sure that you have defined the environment variable...
# ..."GOOGLE_APPLICATION_CREDENTIALS" which points to the JSON file containing authentication key
client = bigquery.Client()
# Initializing the Job_config
job_config = bigquery.LoadJobConfig()
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10
job_config.source_format = bigquery.SourceFormat.CSV
dataset_ref = client.dataset('data_preparation')

In [20]:
client = bigquery.Client()

schema_Name_org = [
    bigquery.SchemaField('appln_nr', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('publn_nr', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('country_code', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('kind_code', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('appln_kind', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('publn_date', 'INTEGER', 'NULLABLE', None, ()),
    bigquery.SchemaField('filing_date', 'INTEGER', 'NULLABLE', None, ()),
    bigquery.SchemaField('grant_date', 'INTEGER', 'NULLABLE', None, ()),
    bigquery.SchemaField('pct_nr', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('word_avg', 'FLOAT', 'NULLABLE', None, ()),
    bigquery.SchemaField('claim_counts', 'INTEGER', 'NULLABLE', None, ())
]

dataset_id = 'data_preparation'
job_config = bigquery.LoadJobConfig()
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10
job_config.source_format = bigquery.SourceFormat.CSV
job_config.schema = schema_Name_org

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table('9_google_patents_claims')
load_job = client.load_table_from_uri(
        source_uris='gs://uspto-data/data_preparatio/2_google_patents_claims_raw.csv_*.csv',
        destination=table_ref,
        # job_id=job_id,
        job_id_prefix='lgs-',
        job_config=job_config,
    )
load_job.result()
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x1ec2d8b1da0>