In [6]:
# Import essential Python libraries
import pandas as pd

from tqdm.notebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

In [None]:
# All data files live in one directory; only the base names differ.
_data_dir = "../data"

# Raw tab-separated input files.
g_patent_tsv = _data_dir + "/g_patent.tsv"
g_us_citation_tsv = _data_dir + "/g_us_patent_citation.tsv"
cpc_group_tsv = _data_dir + "/cpc_group.tsv"
g_cpc_current_tsv = _data_dir + "/g_cpc_current.tsv"

# Comma-separated files written by the conversion cells below.
g_patent_csv = _data_dir + "/g_patent.csv"
g_us_citation_csv = _data_dir + "/g_us_patent_citation.csv"
cpc_group_csv = _data_dir + "/cpc_group.csv"
g_cpc_current_csv = _data_dir + "/g_cpc_current.csv"

# Destination of the triplets fetched from Neo4j.
triplet_training_csv = _data_dir + "/triplet_training.csv"

## Neo4j database query:
batch_size = 100_000  # number of anchor patents paged through per query
num_batches = 10      # number of paged queries to execute

In [None]:
# Load the g_patent TSV in chunks (tqdm tracks progress in the notebook) and
# convert it into the node CSV with the column headers used for the Neo4j import.

print('Loading the g_patent.tsv file...')
# Collect every chunk first and concatenate once. Calling pd.concat inside the
# loop re-copies all previously loaded rows on every iteration.
patent_chunks = list(tqdm(
    pd.read_csv(g_patent_tsv, sep='\t', chunksize=1000000, low_memory=False),
    total=9,  # only sizes the progress bar
))
g_patents_df = pd.concat(patent_chunks)
del patent_chunks  # drop the per-chunk references once the full frame exists

# rename the columns to match the neo4j import database schema
g_patents_df.rename(columns={'patent_id': 'patentId:ID', "patent_title": "title:str", "patent_abstract":"abstract:str", "num_claims": "num_claims:int", "patent_type":"patent_type:str", "patent_date": "date:neo4j.time.Date", "filename": "filename:str"}, inplace=True)

# Add a column ":LABEL" and fill it with the string "Patent"
g_patents_df[':LABEL'] = 'Patent'

# Add a column "import_date:neo4j.time.DateTime" and fill it with the current date and time
# (a single timestamp, broadcast to every row)
g_patents_df['import_date:neo4j.time.DateTime'] = pd.to_datetime('now')

# drop the columns that are not needed
g_patents_df.drop(columns=['withdrawn'], inplace=True)

print(g_patents_df.head(5))

# Save the dataframe to a csv file
print('Saving the g_patent.csv file...')
g_patents_df.to_csv(g_patent_csv, index=False)

In [None]:
# Load the g_us_patent_citation TSV in chunks (tqdm tracks progress in the notebook)
# and convert it into the relationship CSV used for the Neo4j import.

print('Loading the g_us_patent_citation.tsv file...')
# Collect every chunk first and concatenate once. Calling pd.concat inside the
# loop re-copies all previously loaded rows on each of the many iterations.
citation_chunks = list(tqdm(
    pd.read_csv(g_us_citation_tsv, sep='\t', chunksize=1000000, low_memory=False),
    total=129,  # only sizes the progress bar
))
g_citation_df = pd.concat(citation_chunks)
del citation_chunks  # drop the per-chunk references once the full frame exists

# rename the columns to match the neo4j import database schema
g_citation_df.rename(columns={'patent_id': ':START_ID', "citation_patent_id": ":END_ID", "citation_sequence":"citation_sequence:str", "citation_date": "date:neo4j.time.Date", "record_name": "record_name:str"}, inplace=True)

# Add a column ":TYPE" and fill it with the relationship type "cites"
g_citation_df[':TYPE'] = 'cites'

# Add a column "import_date:neo4j.time.DateTime" and fill it with the current date and time
# (a single timestamp, broadcast to every row)
g_citation_df['import_date:neo4j.time.DateTime'] = pd.to_datetime('now')

# drop the columns that are not needed
g_citation_df.drop(columns=["wipo_kind", "citation_category"], inplace=True)

# Preview the citation frame built in this cell (the original previewed g_patents_df)
print(g_citation_df.head(5))

# Save the dataframe to a csv file (written to the path held in g_us_citation_csv)
print('Saving the g_citation.csv file...')
g_citation_df.to_csv(g_us_citation_csv, index=False)

In [3]:
# Read the CPC group table and shape it into a node CSV in one chain:
# rename the columns to match the neo4j import database schema, then add
# a ":LABEL" column filled with the string "CPCGroup".
cpc_groups = (
    pd.read_csv(cpc_group_tsv, sep='\t', low_memory=False)
    .rename(columns={'id': 'cpcId:ID', 'title': 'title'})
    .assign(**{':LABEL': 'CPCGroup'})
)

# Write the result out as a csv file
print('Saving the cpc_group.csv file...')
cpc_groups.to_csv(cpc_group_csv, index=False)

Saving the cpc_group.csv file...


In [4]:
# Load the g_cpc_current TSV in chunks (tqdm tracks progress in the notebook) and
# convert it into the relationship CSV used for the Neo4j import.

print('Loading the g_cpc_current.tsv file...')
# Collect every chunk first and concatenate once. Calling pd.concat inside the
# loop re-copies all previously loaded rows on every iteration.
cpc_chunks = list(tqdm(
    pd.read_csv(g_cpc_current_tsv, sep='\t', chunksize=1000000, low_memory=False),
    total=49,  # only sizes the progress bar
))
g_cpc = pd.concat(cpc_chunks)
del cpc_chunks  # drop the per-chunk references once the full frame exists

# rename the columns to match the neo4j import database schema:
# the relationship starts at the CPC subclass and ends at the patent
g_cpc.rename(columns={
    'patent_id': ':END_ID',
    "cpc_subclass": ":START_ID",
}, inplace=True)

# Add a column ":TYPE" and fill it with the relationship type "classifies"
g_cpc[':TYPE'] = 'classifies'

# Add a column "import_date" and fill it with the current date and time
# (a single timestamp, broadcast to every row)
g_cpc['import_date'] = pd.to_datetime('now')

# drop the columns that are not needed
g_cpc.drop(columns=["cpc_symbol_position"], inplace=True)

print(g_cpc.head(5))

# Save the dataframe to a csv file (written to the path held in g_cpc_current_csv)
print('Saving the g_cpc.csv file...')
g_cpc.to_csv(g_cpc_current_csv, index=False)

Loading the g_cpc_current.tsv file...


  0%|          | 0/9 [00:00<?, ?it/s]

    :END_ID  cpc_sequence cpc_section cpc_class :START_ID      cpc_group  \
0   4796895             1           F       F16      F16H      F16H61/00   
1  10913199             0           B       B29      B29C      B29C55/08   
2   5208443             0           B       B29      B29C     B29C65/366   
3   7830588             6           G       G09      G09G  G09G2310/0275   
4   7232943             1           A       A01      A01H       A01H5/10   

      cpc_type       :TYPE                import_date  
0  inventional  classifies 2023-06-21 14:37:29.759365  
1  inventional  classifies 2023-06-21 14:37:29.759365  
2  inventional  classifies 2023-06-21 14:37:29.759365  
3   additional  classifies 2023-06-21 14:37:29.759365  
4  inventional  classifies 2023-06-21 14:37:29.759365  
Saving the g_cpc.csv file...


In [13]:
# Header changer

import fileinput


def _strip_neo4j_time_types(header):
    """Return the CSV header line with the neo4j.time type suffixes removed.

    'import_date:neo4j.time.DateTime' becomes 'import_date' and
    'date:neo4j.time.Date' becomes 'date'. The ':str' / ':int' suffixes are
    left untouched.
    """
    # Replace the longer token first: 'date:neo4j.time.Date' is a substring of
    # 'import_date:neo4j.time.DateTime', so doing the short replacement first
    # would leave a mangled 'import_dateTime' column name behind.
    header = header.replace('import_date:neo4j.time.DateTime', 'import_date')
    header = header.replace('date:neo4j.time.Date', 'date')
    return header


# Modify the header row of each CSV file in place.
# NOTE(review): these are bare file names resolved against the current working
# directory, not the "../data/..." paths written by the cells above - confirm
# the files have been copied/renamed before running this cell.
for csv_path in ('g_patent.csv', 'g_citation.csv'):
    # With inplace=True, fileinput redirects print() into the file being read;
    # the context manager restores stdout even if an error occurs mid-file.
    with fileinput.input(csv_path, inplace=True) as stream:
        for line in stream:
            if stream.isfirstline():
                line = _strip_neo4j_time_types(line)
            print(line, end='')

To import the CSV files into Neo4j, use the `neo4j-admin` import tool: https://neo4j.com/docs/operations-manual/current/tutorial/neo4j-admin-import/

Possible and working import command:
<!-- ```
bin/neo4j-admin database import full --nodes=import/g_patent.csv --relationships=import/g_citation.csv --multiline-fields=true --skip-bad-relationships --bad-tolerance=100000000 --skip-duplicate-nodes patentsview
``` -->

```
bin/neo4j-admin database import full --nodes=import/g_patent.csv --nodes=import/cpc_group.csv --relationships=import/g_citation.csv --relationships=import/g_cpc.csv --multiline-fields --skip-bad-relationships --bad-tolerance=100000000 --skip-duplicate-nodes patentsviewcpc
```

Adhere to the following steps:
1. Stop the server
2. Run the `neo4j-admin` import tool with a new database name (e.g. patentsview). You can also target an existing database, but then you must add the flag --overwrite-destination.
3. Start the server
4. In the console switch to the System database using the pulldown control
5. Run the following command at the system prompt: `CREATE DATABASE aDatabaseName` (use the same database name chosen in step 2 above)
6. Switch to the database just created using the console pulldown control

When the import is finished, return here to generate the contrastive learning triplets from neo4j.

In [26]:
import os

import neo4j


def _build_triplet_query(skip, limit):
    """Return the Cypher query that builds triplets for one page of anchor patents.

    For each anchor in the page [skip, skip + limit) the query returns:
      * one cited patent that shares a CPC group with the anchor (positive),
      * one patent for which the cites + shared-CPC pattern does not hold (negative),
    together with the ids, titles and abstracts of all three and the CPC id.
    """
    # NOTE(review): the anchor page uses SKIP/LIMIT without ORDER BY, so page
    # contents rely on the database returning nodes in a stable order.
    return f"""
        CALL {{
            MATCH (anchor:Patent)
            WITH DISTINCT anchor SKIP {skip} LIMIT {limit}
            RETURN DISTINCT anchor
        }}
        CALL {{
            WITH anchor
            MATCH (anchor)-[:cites]->(cited:Patent)<-[:classifies]-(cpc:CPCGroup)-[:classifies]->(anchor)
            WHERE anchor <> cited
            WITH DISTINCT cited, cpc LIMIT 1
            RETURN DISTINCT cited, cpc
        }}
        CALL {{
            WITH anchor, cited, cpc
            MATCH (other:Patent)
            WHERE NOT EXISTS {{
                MATCH (anchor)-[:cites]->(other:Patent)<-[:classifies]-(cpc)-[:classifies]->(anchor)
                WHERE anchor <> other
            }}
            WITH DISTINCT other LIMIT 1
            RETURN DISTINCT other
        }}
        RETURN anchor.patentId AS a_id, anchor.title AS a_title, anchor.abstract AS a_abstract, cited.patentId AS p_id, cited.title AS p_title, cited.abstract AS p_abstract, other.patentId AS n_id, other.title AS n_title, other.abstract AS n_abstract, cpc.cpcId AS cpc_id
    """


# Prefer a password from the environment so it does not have to be typed into
# the notebook; fall back to the original placeholder if the variable is unset.
neo4j_password = os.environ.get("NEO4J_PASSWORD", "your_password")

# Define an empty list to store the per-batch results
results = []

# Connect to the neo4j database. The context managers close the session and the
# driver when the cell finishes or fails, instead of leaking the connection.
with neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", neo4j_password)) as driver, \
        driver.session(database="patentsview") as session:
    # Execute the query in batches
    for i in range(num_batches):
        # Offset of the first anchor patent in this batch
        start_index = i * batch_size

        # Execute the query and append the results to the list
        df = pd.DataFrame(session.run(_build_triplet_query(start_index, batch_size)).data())
        results.append(df)

        print(f"Fetched {len(df)} rows from the database for batch {i+1}")

# Concatenate the results into a single dataframe
df = pd.concat(results)

print(f"Concatenated {len(df)} rows from all batches")

# Reduce the dataframe to remove duplicates in the a_id column
df = df.drop_duplicates(subset=['a_id'])

print(f"Reduced the dataframe to {len(df)} rows after removing duplicates")

# Save the df to a csv file
print('Saving the df to csv...')
df.to_csv(triplet_training_csv, index=False)

Fetched 8217 rows from the database for batch 1
Fetched 8278 rows from the database for batch 2
Fetched 8264 rows from the database for batch 3
Fetched 8316 rows from the database for batch 4
Fetched 8089 rows from the database for batch 5
Fetched 8269 rows from the database for batch 6
Fetched 8261 rows from the database for batch 7
Fetched 8126 rows from the database for batch 8
Fetched 8181 rows from the database for batch 9
Fetched 8324 rows from the database for batch 10
Concatenated 82325 rows from all batches
Reduced the dataframe to 82325 rows after removing duplicates
