In [1]:
from google.cloud import bigquery

from open_patstat.utils.gcp import create_table, load_gcs_file, delete_table
from open_patstat.utils.schema import Schema

In [2]:
# BigQuery client shared by every cell below; no project or credentials are passed explicitly.
client = bigquery.Client()

In [3]:
# Reference to the `patstat` dataset that receives all PATSTAT tables.
dataset_ref = client.dataset('patstat')

# Load-job settings shared by the PATSTAT load cells: CSV input, one header
# row skipped, at most 10 bad records tolerated. The load cells below set
# `job_config.schema` before each load.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10

# 1. Loading Tables

## 1.1. Table "tls201" (main table)

In [4]:
# Main table `patstat.tls201`, created through the project helper with the tls201 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls201',
    schema=Schema().tls201,
)

In [5]:
# Load every tls201 archive from the bucket into the MAIN table `tls201`
# (the one created in the previous cell). The wildcard load previously
# targeted the sample table `tls201_s`; in every other section of this
# notebook the `*_s` tables only receive the single `*_part01.txt` file.
# NOTE(review): this bucket folder carries an `_01` suffix that the other
# sections do not use — confirm the path.
table_ref = dataset_ref.table('tls201')
job_config.schema = Schema().tls201
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn_01/tls201_*.gz',
    table_ref,
    job_config,
)

Starting job lgs-da4c45c0-e673-4ba5-bfff-b103083da844
Job took 9.093069076538086 seconds


## 1.2. Table "tls209_appln_ipc" (IPC codes assigned to each application)

In [7]:
# Main table `patstat.tls209`, created through the project helper with the tls209 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls209',
    schema=Schema().tls209,
)

In [8]:
# Load all tls209 archives (wildcard match) from the GCS bucket into the main table.
job_config.schema = Schema().tls209
table_ref = dataset_ref.table('tls209')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls209_*.gz',
    table_ref,
    job_config,
)

Starting job lgs-88f2b615-3abd-4d3d-93ad-ef855e397bcf
Job took 267.2587592601776 seconds


In [9]:
# Sample table `patstat.tls209_s`; same schema as the main tls209 table.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls209_s',
    schema=Schema().tls209,
)

In [10]:
# Load only the first part file from the GCS bucket into the sample table.
job_config.schema = Schema().tls209
table_ref = dataset_ref.table('tls209_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls209_part01.txt',
    table_ref,
    job_config,
)

Starting job lgs-97d56603-15b6-4d21-ba28-016a6117f4bc
Job took 3.600205659866333 seconds


## 1.3. Table "tls204" (the priority status of patents)

In [5]:
# Main table `patstat.tls204`, created through the project helper with the tls204 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls204',
    schema=Schema().tls204,
)

In [6]:
# Load all tls204 archives (wildcard match) from the GCS bucket into the main table.
job_config.schema = Schema().tls204
table_ref = dataset_ref.table('tls204')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls204_*.gz',
    table_ref,
    job_config,
)

Starting job lgs-816820f0-9966-4b84-a2ae-25fc42b4fe62
Job took 216.11920499801636 seconds


In [9]:
# Sample table `patstat.tls204_s`; same schema as the main tls204 table.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls204_s',
    schema=Schema().tls204,
)

In [10]:
# Load only the first part file from the GCS bucket into the sample table.
job_config.schema = Schema().tls204
table_ref = dataset_ref.table('tls204_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls204_part01.txt',
    table_ref,
    job_config,
)

Starting job lgs-506b7785-bd98-4b75-a829-28c88a20187f
Job took 6.117683410644531 seconds


## 1.4. Table "tls207": the correspondence between patent application and inventors

In [7]:
# Main table `patstat.tls207`, created through the project helper with the tls207 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls207',
    schema=Schema().tls207,
)

In [8]:
# Load all tls207 archives (wildcard match) from the GCS bucket into the main table.
job_config.schema = Schema().tls207
table_ref = dataset_ref.table('tls207')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls207_*.gz',
    table_ref,
    job_config,
)

Starting job lgs-ad379b9e-f584-4829-8ae0-ebed6eb950af
Job took 379.3931813240051 seconds


In [11]:
# Sample table `patstat.tls207_s`; same schema as the main tls207 table.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls207_s',
    schema=Schema().tls207,
)

In [12]:
# Load only the first part file from the GCS bucket into the sample table.
job_config.schema = Schema().tls207
table_ref = dataset_ref.table('tls207_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls207_part01.txt',
    table_ref,
    job_config,
)

Starting job lgs-15d0c099-f0c0-4706-9e15-af4fcc411147
Job took 5.712496280670166 seconds


## 1.5. Table "tls206": Details on names and addresses of applicants

In [None]:
# Main table `patstat.tls206`, created through the project helper with the tls206 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls206',
    schema=Schema().tls206,
)

In [None]:
# Load all tls206 archives (wildcard match) from the GCS bucket into the main table.
job_config.schema = Schema().tls206
table_ref = dataset_ref.table('tls206')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls206_*.gz',
    table_ref,
    job_config,
)

In [13]:
# Sample table `patstat.tls206_s`; same schema as the main tls206 table.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls206_s',
    schema=Schema().tls206,
)

In [15]:
# Load only the first part file from the GCS bucket into the sample table.
job_config.schema = Schema().tls206
table_ref = dataset_ref.table('tls206_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls206_part01.txt',
    table_ref,
    job_config,
)

Starting job lgs-a1667418-f504-487d-94f9-0f9eead2d590
Job took 8.971823930740356 seconds


## 1.6. Table "tls211": 
## Information on the patent offices of destination (publication authorities) of all INPADOC family members, excluding the PCT publication authority (WO)

In [5]:
# Main table `patstat.tls211`, created through the project helper with the tls211 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls211',
    schema=Schema().tls211,
)

In [6]:
# Prepare the destination and schema for the tls211 main-table load;
# the load job itself is submitted in the next cell.
job_config.schema = Schema().tls211
table_ref = dataset_ref.table('tls211')

In [7]:
# Submit the tls211 load directly through the client (not via `load_gcs_file`);
# the job id is generated with the 'lgs-' prefix.
load_job = client.load_table_from_uri(
    source_uris='gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls211_*.gz',
    destination=table_ref,
    job_config=job_config,
    job_id_prefix='lgs-',
)

In [8]:
# Wait for the submitted load job; the returned job object is shown as the cell output.
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x7f45f5fd0940>

In [None]:
# Sample table `patstat.tls211_s`; same schema as the main tls211 table.
# NOTE(review): the notebook contains two identical cells creating `tls211_s`;
# one of them is redundant.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls211_s',
    schema=Schema().tls211,
)

In [4]:
# Sample table `patstat.tls211_s`; same schema as the main tls211 table.
# NOTE(review): duplicate of the cell directly above — the recorded output
# reports that the table already existed when this one ran.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls211_s',
    schema=Schema().tls211,
)

THIS TABLE ALREADY EXISTS IN usptobias:patstat


In [None]:
# Load only the first part file from the GCS bucket into the sample table.
job_config.schema = Schema().tls211
table_ref = dataset_ref.table('tls211_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls211_part01.txt',
    table_ref,
    job_config,
)

## 1.7. Table "tls212": Citations

In [None]:
# Main table `patstat.tls212`, created through the project helper with the tls212 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls212',
    schema=Schema().tls212,
)

In [None]:
# Adding files to the main table from GCP bucket: every archive matching
# `tls212_*.gz` is loaded into `tls212` (no sample table is built in this section).
table_ref = dataset_ref.table('tls212')
job_config.schema = Schema().tls212
load_gcs_file(client, 'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls212_*.gz', 
              table_ref, job_config)

## 1.8. Checking to see if the data is loaded correctly on the tables

In [None]:
# Check table creation and displays meta data
# Run in command line
bq show my-dataset:patstat.tls201

## 1.9. Creating Lawyers information

In [15]:
# Schema of the lawyer-matching table: four nullable STRING columns.
# `mode` is passed by keyword and `description`/`fields` are left at their
# defaults (the previous positional `None, ()` spelled out those defaults).
# Positional slots after `mode` are not stable across google-cloud-bigquery
# releases, so the trailing positional arguments could land in the wrong
# parameters.
schema_lawyer_matching = [
    bigquery.SchemaField(column, 'STRING', mode='NULLABLE')
    for column in ('lawyer_id', 'attorney', 'original_name', 'type')
]

In [16]:
# Destination dataset for the additional (non-PATSTAT) tables.
dataset_ref = client.dataset('adding_data')

# Fresh load configuration for the lawyer-matching CSV: one header row
# skipped, at most 10 bad records tolerated.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema_lawyer_matching
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10

In [17]:
# Load the lawyer-matching CSV from the bucket into `15_lawyer_matching`,
# using the schema already set on `job_config`.
table_ref = dataset_ref.table('15_lawyer_matching')
load_gcs_file(
    client,
    'gs://uspto-data/lawyers_final.csv',
    table_ref,
    job_config,
)

Starting job lgs-18a7c671-01f1-4e84-bb72-16c2573b9bcd
Job took 9.51806378364563 seconds


In [18]:
# Schema of the application/lawyer correspondence table. `appln_nr` is
# REQUIRED, the other columns are nullable; all are STRING.
# `mode` is passed by keyword and `description`/`fields` are left at their
# defaults, because positional slots after `mode` are not stable across
# google-cloud-bigquery releases.
schema_lawyer_corr = [
    bigquery.SchemaField(column, 'STRING', mode=column_mode)
    for column, column_mode in (
        ('appln_nr', 'REQUIRED'),
        ('attorney', 'NULLABLE'),
        ('type', 'NULLABLE'),
    )
]

In [19]:
# Destination dataset for the additional (non-PATSTAT) tables.
dataset_ref = client.dataset('adding_data')

# Fresh load configuration for the lawyer-correspondence CSV: one header row
# skipped, at most 10 bad records tolerated.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema_lawyer_corr
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10

In [20]:
# Submit the load of the lawyer-correspondence CSV into `15_lawyer_corr`
# (job id prefixed with 'lgs-') and wait for it; the finished job object is
# the cell output.
table_ref = dataset_ref.table('15_lawyer_corr')
load_job = client.load_table_from_uri(
    source_uris='gs://uspto-data/lawyers_corr_final.csv',
    destination=table_ref,
    job_config=job_config,
    job_id_prefix='lgs-',
)

load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x7f6e9e3b3320>

In [None]:
# Inspect the errors recorded on the most recent load job.
load_job.errors

## 1.10. Creating Name Ethnicity Table

In [14]:
# Schema of the name/ethnicity (origin) table; every column is nullable.
# `mode` is passed by keyword and `description`/`fields` are left at their
# defaults, because positional slots after `mode` are not stable across
# google-cloud-bigquery releases.
schema_Name_eth = [
    bigquery.SchemaField(column, column_type, mode='NULLABLE')
    for column, column_type in (
        ('index', 'INTEGER'),
        ('name_first', 'STRING'),
        ('name_last', 'STRING'),
        ('name_first_har', 'STRING'),
        ('name_last_har', 'STRING'),
        ('country_origin', 'STRING'),
        ('country_origin_alt', 'STRING'),
        ('score_origin', 'FLOAT'),
        ('sub_region_origin', 'STRING'),
        ('top_region_origin', 'STRING'),
    )
]

In [15]:
# Destination dataset for the name tables.
dataset_ref = client.dataset('usptobias_dataset')

# Fresh load configuration for the name/ethnicity CSV: one header row
# skipped, up to 300 bad records tolerated.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema_Name_eth
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.max_bad_records = 300

In [16]:
# Submit the load of the processed origin CSV into `6_names_origin`
# (job id prefixed with 'lgs-') and wait for it; the finished job object is
# the cell output.
table_ref = dataset_ref.table('6_names_origin')
load_job = client.load_table_from_uri(
    source_uris='gs://uspto-data/origin_processed.csv',
    destination=table_ref,
    job_config=job_config,
    job_id_prefix='lgs-',
)
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x7fb45c5d6f28>

In [9]:
# Inspect the errors recorded on the most recent load job.
load_job.errors

## 1.11. Creating Names-Country Gender Table

In [34]:
# Schema of the names/country gender table; every column is nullable.
# `mode` is passed by keyword and `description`/`fields` are left at their
# defaults, because positional slots after `mode` are not stable across
# google-cloud-bigquery releases.
schema_Name_gen = [
    bigquery.SchemaField(column, column_type, mode='NULLABLE')
    for column, column_type in (
        ('index', 'INTEGER'),
        ('name_first', 'STRING'),
        ('name_last', 'STRING'),
        ('country_code', 'STRING'),
        ('name_first_har', 'STRING'),
        ('name_last_har', 'STRING'),
        ('predicted_gender', 'STRING'),
        ('gender_scale', 'FLOAT'),
        ('score_gender', 'FLOAT'),
    )
]

In [35]:
# Destination dataset for the name tables.
dataset_ref = client.dataset('usptobias_dataset')

# Fresh load configuration for the names/gender CSV: one header row skipped,
# up to 300 bad records tolerated.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema_Name_gen
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.max_bad_records = 300

In [36]:
# Submit the load of the processed gender/country CSV into `6_names_gender`
# (job id prefixed with 'lgs-') and wait for it; the finished job object is
# the cell output.
table_ref = dataset_ref.table('6_names_gender')
load_job = client.load_table_from_uri(
    source_uris='gs://uspto-data/genderCountry_processed.csv',
    destination=table_ref,
    job_config=job_config,
    job_id_prefix='lgs-',
)
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x7fb45c575ef0>

In [None]:
# Show at most the first 10 errors recorded on the most recent load job.
# `errors` is None when the job reported no errors, so fall back to an empty
# list instead of slicing None (which raises TypeError).
(load_job.errors or [])[:10]

## 1.12. Creating Final Names-Origin Table

In [15]:
# Schema of the final names/origin table; every column is nullable.
# `mode` is passed by keyword and `description`/`fields` are left at their
# defaults, because positional slots after `mode` are not stable across
# google-cloud-bigquery releases.
schema_Name_org = [
    bigquery.SchemaField(column, column_type, mode='NULLABLE')
    for column, column_type in (
        ('index', 'INTEGER'),
        ('name_first', 'STRING'),
        ('name_last', 'STRING'),
        ('country_code', 'STRING'),
        ('name_first_har', 'STRING'),
        ('name_last_har', 'STRING'),
        ('country_origin', 'STRING'),
        ('sub_region_origin', 'STRING'),
        ('top_region_origin', 'STRING'),
    )
]

In [16]:
# Destination dataset for the name tables.
dataset_ref = client.dataset('usptobias_dataset')

# Fresh load configuration for the final names/origin CSV: one header row
# skipped, up to 300 bad records tolerated.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema_Name_org
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.max_bad_records = 300

In [17]:
# Submit the load of the final origin CSV into `6_names_origin` (job id
# prefixed with 'lgs-') and wait for it; the finished job object is the cell
# output.
# NOTE(review): section 1.10 loads a differently shaped schema into this same
# table name, and no write disposition is set here — confirm the older table
# is removed (or the disposition set) before re-running.
table_ref = dataset_ref.table('6_names_origin')
load_job = client.load_table_from_uri(
    source_uris='gs://uspto-data/final_origin_processed.csv',
    destination=table_ref,
    job_config=job_config,
    job_id_prefix='lgs-',
)
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x7fc174952320>

## 1.13. Creating Countries - GDP per Capita table

In [64]:
import pandas as pd

# GDP per capita for 2012, keyed by the 3-letter country code found in the
# source file. `skiprows`/`header` step over the lines that precede the
# column header (presumably export metadata — confirm against the file).
df_ = (
    pd.read_csv('./data/countries_gdp_per_capita.csv', skiprows=2, header=1,
                usecols=['Country Name', 'Country Code', '2012'])
      .rename(columns={'Country Name':'country_name', 'Country Code':'country_code', '2012':'gdppc'})
)

# ISO alpha-3 -> alpha-2 lookup table.
# `keep_default_na=False` is essential: Namibia's alpha-2 code is the literal
# string "NA", which pandas' default NA parsing would turn into NaN. Only
# truly empty cells are treated as missing.
df_map = (
    pd.read_csv("./data/country-codes.csv",
                usecols=['ISO3166-1-Alpha-3', 'ISO3166-1-Alpha-2'],
                keep_default_na=False, na_values=[''])
      .rename(columns={'ISO3166-1-Alpha-3':'iso3', 'ISO3166-1-Alpha-2':'iso2'})
)
df_map = (df_map[df_map.iso3!=df_map.iso2]).drop_duplicates().set_index('iso3')

# Replace the 3-letter codes by their 2-letter equivalents; codes without a
# match in the lookup become NaN.
df_['country_code'] = df_.country_code.map(df_map.to_dict()['iso2'])

print(df_.shape[0])
display(df_.head(2))
df_.to_csv('./data/countries_gdp_per_capita_2012.csv', index=False, encoding='utf-8')

264


Unnamed: 0,country_name,country_code,gdppc
0,Aruba,AW,35498.982089
1,Afghanistan,AF,1806.76393


In [2]:
import pandas as pd

# Read the prepared file back for inspection.
# `keep_default_na=False` keeps the 2-letter code "NA" (Namibia) as a string
# instead of letting pandas parse it as a missing value; empty cells are
# still read as NaN.
df_1 = pd.read_csv('./data/countries_gdp_per_capita_2012.csv',
                   keep_default_na=False, na_values=[''])

df_1.head()

Unnamed: 0,country_name,country_code,gdppc
0,Aruba,AW,35498.982089
1,Afghanistan,AF,1806.76393
2,Angola,AO,6772.528333
3,Albania,AL,10526.255328
4,Andorra,AD,


In [8]:
# Countries ordered from highest to lowest GDP per capita.
df_1.sort_values('gdppc', ascending=False)

Unnamed: 0,country_name,country_code,gdppc
144,"Macao SAR, China",MO,126618.471775
198,Qatar,QA,122674.723654
142,Luxembourg,LU,91622.177417
29,Brunei Darussalam,BN,83725.421026
125,Kuwait,KW,80209.814532
206,Singapore,SG,78978.822854
175,Norway,NO,65447.495825
50,Cayman Islands,KY,63666.612362
210,San Marino,SM,61073.159245
6,United Arab Emirates,AE,58961.202011


In [78]:
# Schema of the countries/GDP-per-capita table; every column is nullable.
# The variable name is reused from the names/origin section and kept so the
# following cells still resolve it.
# `mode` is passed by keyword and `description`/`fields` are left at their
# defaults, because positional slots after `mode` are not stable across
# google-cloud-bigquery releases.
schema_Name_org = [
    bigquery.SchemaField(column, column_type, mode='NULLABLE')
    for column, column_type in (
        ('country_name', 'STRING'),
        ('country_code', 'STRING'),
        ('gdppc', 'FLOAT'),
    )
]

In [79]:
# Destination dataset for the additional (non-PATSTAT) tables.
dataset_ref = client.dataset('adding_data')

# Fresh load configuration for the countries/GDP CSV: one header row skipped,
# at most 10 bad records tolerated.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema_Name_org
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10

In [80]:
# Submit the load of the GDP-per-capita CSV into `17_countries_gdp` (job id
# prefixed with 'lgs-') and wait for it; the finished job object is the cell
# output.
table_ref = dataset_ref.table('17_countries_gdp')
load_job = client.load_table_from_uri(
    source_uris='gs://uspto-data/countries_gdp_per_capita_2012.csv',
    destination=table_ref,
    job_config=job_config,
    job_id_prefix='lgs-',
)
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x231bfb646a0>

## 1.14. Creating transactions-category table

This table has been extracted from the [USPTO PAIR dataset webpage - Appendix B](https://www.uspto.gov/sites/default/files/documents/Appendix%20B.pdf)

In [82]:
# Preview the local transaction-category file (first three rows).
# Relies on `pd` having been imported by an earlier cell.
df_ = pd.read_csv('./data/transaction_category3.csv')
df_.head(3)

Unnamed: 0,code,category
0,DOCK,EX
1,FWDX,EX
2,WIDS,AA


In [83]:
# Schema of the transaction/category table: two nullable STRING columns.
# The variable name is reused from earlier sections and kept so the following
# cells still resolve it.
# `mode` is passed by keyword and `description`/`fields` are left at their
# defaults, because positional slots after `mode` are not stable across
# google-cloud-bigquery releases.
schema_Name_org = [
    bigquery.SchemaField(column, 'STRING', mode='NULLABLE')
    for column in ('transaction', 'category')
]

In [84]:
# Destination dataset for the additional (non-PATSTAT) tables.
dataset_ref = client.dataset('adding_data')

# Fresh load configuration for the transaction-category CSV: one header row
# skipped, at most 10 bad records tolerated.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema_Name_org
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10

In [85]:
# Submit the load of the transaction-category CSV into
# `18_transaction_category` (job id prefixed with 'lgs-') and wait for it;
# the finished job object is the cell output.
table_ref = dataset_ref.table('18_transaction_category')
load_job = client.load_table_from_uri(
    source_uris='gs://uspto-data/transaction_category3.csv',
    destination=table_ref,
    job_config=job_config,
    job_id_prefix='lgs-',
)
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x231c01d3358>