In [1]:
# Import essential Python libraries
import pandas as pd

from tqdm.notebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

In [2]:
g_patent_tsv = "../data/g_patent.tsv"
g_us_citation_tsv = "../data/g_us_patent_citation.tsv"
cpc_group_tsv = "../data/cpc_group.tsv"
g_cpc_current_tsv = "../data/g_cpc_current.tsv"

g_patent_csv = "../data/v2/g_patent.csv"
g_us_citation_csv = "../data/v2/g_us_patent_citation.csv"
cpc_group_csv = "../data/v2/cpc_group.csv"
g_cpc_current_csv = "../data/v2/g_cpc_current.csv"

In [3]:
# Load the g patents tsv in to a pandas dataframe with chunks. Use tqdm to track progress in the jupyter notebook

g_patents_df = pd.DataFrame()
print('Loading the g_patent.tsv file...')
for chunk in tqdm(pd.read_csv(g_patent_tsv, sep='\t', chunksize=1000000, low_memory=False), total=9):
    g_patents_df = pd.concat([g_patents_df, chunk])

print("Columns of the g_patents_df dataframe:")
display(g_patents_df.columns)

# drop the columns that are not needed
g_patents_df.drop(columns=['withdrawn'], inplace=True)

print("Sample of the g_patents_df dataframe:")
display(g_patents_df.head(5))

# Save the dataframe to a csv file
print('Saving the g_patent.csv file...')
g_patents_df.to_csv(g_patent_csv, index=False)
print('Done!')

Loading the g_patent.tsv file...


  0%|          | 0/9 [00:00<?, ?it/s]

Columns of the g_patents_df dataframe:


Index(['patent_id', 'patent_type', 'patent_date', 'patent_title',
       'patent_abstract', 'wipo_kind', 'num_claims', 'withdrawn', 'filename'],
      dtype='object')

Sample of the g_patents_df dataframe:


Unnamed: 0,patent_id,patent_type,patent_date,patent_title,patent_abstract,wipo_kind,num_claims,filename
0,10000000,utility,2018-06-19,Coherent LADAR using intra-pixel quadrature de...,A frequency modulated (coherent) laser detecti...,B2,20,ipg180619.xml
1,10000001,utility,2018-06-19,Injection molding machine and mold thickness c...,The injection molding machine includes a fixed...,B2,12,ipg180619.xml
2,10000002,utility,2018-06-19,Method for manufacturing polymer film and co-e...,The present invention relates to: a method for...,B2,9,ipg180619.xml
3,10000003,utility,2018-06-19,Method for producing a container from a thermo...,The invention relates to a method for producin...,B2,18,ipg180619.xml
4,10000004,utility,2018-06-19,"Process of obtaining a double-oriented film, c...",The present invention relates to provides a do...,B2,6,ipg180619.xml


Saving the g_patent.csv file...
Done!


In [4]:
# Load the g citations tsv in to a pandas dataframe with chunks. Use tqdm to track progress in the jupyter notebook

g_citation_df = pd.DataFrame()
print('Loading the g_us_patent_citation.tsv file...')
for chunk in tqdm(pd.read_csv(g_us_citation_tsv, sep='\t', chunksize=1000000, low_memory=False), total=129):
    g_citation_df = pd.concat([g_citation_df, chunk])

print("Columns of the g_citation_df dataframe:")
display(g_citation_df.columns)

# rename the columns to match the neo4j import database schema
# g_citation_df.rename(columns={'patent_id': ':START_ID', "citation_patent_id": ":END_ID", "citation_sequence":"citation_sequence:str", "citation_date": "date:neo4j.time.Date", "record_name": "record_name:str"}, inplace=True)

# drop the columns that are not needed
g_citation_df.drop(columns=["wipo_kind", "citation_category"], inplace=True)

print("Sample of the g_citation_df dataframe:")
display(g_patents_df.head(5))

# Use the information from the g_citation_df dataframe create a new column in the g_patents_df dataframe that contains a list of each cited patent for each patent
# Create a new column in the g_patents_df dataframe that contains a list of each cited patent for each patent
print('Creating a new column in the g_patents_df dataframe that contains a list of each cited patent for each patent...')
g_patents_df["cited_patents"] = g_patents_df["patent_id"].apply(lambda x: list(g_citation_df[g_citation_df["patent_id"] == x]["citation_patent_id"]))
print('Done!')

# Save the g_patent_citations.csv dataframe to a csv file
print('Saving the g_patent_citations.csv dataframe to a csv file...')
g_patents_df.to_csv(g_patent_csv, index=False)
print('Done!')

Loading the g_us_patent_citation.tsv file...


  0%|          | 0/129 [00:00<?, ?it/s]

Columns of the g_citation_df dataframe:


Index(['patent_id', 'citation_sequence', 'citation_patent_id', 'citation_date',
       'record_name', 'wipo_kind', 'citation_category'],
      dtype='object')

Sample of the g_citation_df dataframe:


Unnamed: 0,patent_id,patent_type,patent_date,patent_title,patent_abstract,wipo_kind,num_claims,filename
0,10000000,utility,2018-06-19,Coherent LADAR using intra-pixel quadrature de...,A frequency modulated (coherent) laser detecti...,B2,20,ipg180619.xml
1,10000001,utility,2018-06-19,Injection molding machine and mold thickness c...,The injection molding machine includes a fixed...,B2,12,ipg180619.xml
2,10000002,utility,2018-06-19,Method for manufacturing polymer film and co-e...,The present invention relates to: a method for...,B2,9,ipg180619.xml
3,10000003,utility,2018-06-19,Method for producing a container from a thermo...,The invention relates to a method for producin...,B2,18,ipg180619.xml
4,10000004,utility,2018-06-19,"Process of obtaining a double-oriented film, c...",The present invention relates to provides a do...,B2,6,ipg180619.xml


Creating a new column in the g_patents_df dataframe that contains a list of each cited patent for each patent...


KeyboardInterrupt: 

In [None]:
cpc_groups = pd.read_csv(cpc_group_tsv, sep='\t', low_memory=False)

# Save the dataframe to a csv file
print('Saving the cpc_group.csv file...')
cpc_groups.to_csv(cpc_group_csv, index=False)
print("Done!")

Saving the cpc_group.csv file...
Done!


In [None]:
g_cpc = pd.DataFrame()
print('Loading the g_cpc_current.tsv file...')
for chunk in tqdm(pd.read_csv(g_cpc_current_tsv, sep='\t', chunksize=1000000, low_memory=False), total=49):
    g_cpc = pd.concat([g_cpc, chunk])

# Use the information from the g_cpc dataframe create a new column in the g_patents_df dataframe that contains a list of each cpc_subclass for each patent
# Create a new column in the g_patents_df dataframe that contains a list of each cpc_subclass for each patent
print('Creating a new column in the g_patents_df dataframe that contains a list of each cpc_subclass for each patent...')
g_patents_df['cpc_subclasses'] = g_patents_df['patent_id'].map(g_cpc.groupby('patent_id')['cpc_subclass_symbol'].apply(list))
print('Done!')

# Save the g_patent_citations.csv dataframe to a csv file
print('Saving the g_patent_citations_cpc.csv dataframe to a csv file...')
g_patents_df.to_csv(g_patent_csv, index=False)
print('Done!')

Loading the g_cpc_current.tsv file...


  0%|          | 0/49 [00:00<?, ?it/s]

Creating a new column in the g_patents_df dataframe that contains a list of each cpc_subclass for each patent...
Saving the g_patent_citations_cpc.csv dataframe to a csv file...
Done!
