In [1]:
import json
import duckdb
from google import genai
from google.genai import types
from datetime import datetime
import re

In [2]:
def get_api_key() -> str:
    """Read the user's Google Gemini API key from the config file.

    The key is looked up under ``gemini_api_key`` in ``../config.json``
    (path relative to the current working directory).

    Args:
        None

    Returns:
        The configured API key, or ``None`` when the config file has no
        ``gemini_api_key`` entry.
    """
    with open("../config.json", "r") as handle:
        settings = json.loads(handle.read())
    api_key = settings.get("gemini_api_key")
    return api_key

def write_log(msg: str, logfile: str):
    """Appends a timestamped message to a log file under ``../logs``.

    Each entry is written as the timestamp line, the message, and a blank
    separator line.

    Args:
        msg: The message to write to the log file
        logfile: The name of the log file (joined onto ``../logs/``)

    Returns:
        None

    Raises:
        OSError: If the log file cannot be opened for appending (for example
            when the ``../logs`` directory does not exist).
    """
    file_path = f"../logs/{logfile}"
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Logged model replies can contain non-ASCII company names; an explicit
    # UTF-8 encoding keeps the write from failing on platforms whose default
    # text encoding cannot represent them.
    with open(file_path, "a", encoding="utf-8") as log_file:
        log_file.write(f"{timestamp}\n{msg}\n\n")

In [3]:
# Gemini client shared by every model call below; the key is read via get_api_key()
client = genai.Client(api_key=get_api_key())

# Connect to the database
# Opened read-write (read_only=False) because the label functions below create and fill the `labels` table
con = duckdb.connect(database='patent_database', read_only=False)

In [4]:
def create_label_table(reset: bool = False):
    """Make sure the ``labels`` table exists in the database.

    The table holds one row per labeled pair: ``han_id``, ``firm_id``
    (a reference to ``firm_names.firm_id``) and the integer ``label``.

    Args:
        reset: When true, drop any existing ``labels`` table before
            creating it, discarding all previously stored labels.
    Returns:
        None
    """
    drop_stmt = "DROP TABLE IF EXISTS labels"
    create_stmt = """
        CREATE TABLE IF NOT EXISTS labels (
            han_id INTEGER,
            firm_id INTEGER REFERENCES firm_names(firm_id),
            label INTEGER
        )
    """
    # The drop (if requested) must run before the create
    statements = [drop_stmt, create_stmt] if reset else [create_stmt]
    for statement in statements:
        con.execute(statement)

def insert_label(han_id: int, firm_id: int, label: int):
    """Inserts a label into the label table

    The values are sent as bound parameters instead of being formatted into
    the SQL text: ``label`` can originate from a model reply, so it must never
    be interpreted as SQL.

    Args:
        han_id: The id of the han record
        firm_id: The id of the firm record
        label: The label of the record
    Returns:
        None
    Raises:
        TypeError, ValueError: If any of the values cannot be converted to an
            integer.
    """
    # int() normalises numeric strings and numpy integers coming from
    # DataFrame rows to plain Python ints before binding.
    con.execute(
        "INSERT INTO labels VALUES (?, ?, ?)",
        [int(han_id), int(firm_id), int(label)],
    )

In [5]:
def _extract_json_object(text: str) -> "dict | None":
    """Returns the first JSON object found in a model reply, or None if there is none."""
    if not text:
        return None

    candidates = []
    # Preferred form: the object following a "json" marker (e.g. a ```json fence)
    fenced = re.search(r"json\s*(\{.*?\})\s*", text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    # Fallback: a bare reply, or a nested object that the non-greedy pattern
    # above cut short -> take the outermost {...} span
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate.strip())
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def call_gemini_api(input_data: str) -> "dict | None":
    """Calls the Google Gemini API to determine if the name is a match to the han_name, person_name and psn_name

    Args:
        input_data: The data to be sent to the API (a JSON string that is
            embedded verbatim in the prompt)
    Returns:
        The JSON object parsed from the model reply, or None when the reply
        contains no parsable JSON object. In the None case the failure is
        printed and the raw reply is written to the API call log.
    """

    # NOTE(review): the text after the output template ('",' and the
    # '"description": ...' line) looks like leftover from a pasted prompt
    # definition. It is kept unchanged here because editing the prompt can
    # change the labels the model returns - confirm before cleaning it up.
    prompt = f""" Your task is to determine if a given company name ('name') matches any of the provided company names from the PATSTAT database ('han_name', 'person_name', 'psn_name'). Consider variations in spelling and abbreviations You must be very thorough in your analysis.

    Input Data (JSON):
    {input_data}

    Output (JSON):
    {{\n\"firm_id\": \"{{firm_id}}\",\n    \"han_id\": \"{{han_id}}\",\n    \"label\": \"{{label}}\" <--- The value of 'label' MUST be either '0' or '1'. '0' indicates no match, and '1' indicates a match.\n}}",

    "description": "This prompt instructs the model to perform company name matching, comparing a given name against PATSTAT names and outputting a JSON object with the firm_id, han_id, and a label indicating a match (1) or no match (0). The model is instructed to be thorough.
    """

    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=prompt,
        config=types.GenerateContentConfig(
            temperature=0.0
        )
    )

    # Preprocess the model response
    data = _extract_json_object(response.text)
    if data is None:
        print("Error: Gemini API did not return valid JSON.")
        write_log(f"Error: Gemini API did not return valid JSON.\n\n{response.text}", "label_training_api_call_log.txt")
    return data


def process_gemini_response(response: dict, row: dict):
    """Processes the response from the Gemini API

    Validates the response against the row it was generated for and stores the
    label. Nothing is inserted (the problem is written to the API call log
    instead) when the response is missing, lacks numeric ``han_id`` /
    ``firm_id`` / ``label`` fields, refers to different ids than ``row``, or
    carries a label other than 0 or 1.

    Args:
        response: The response from the Gemini API (None when the API call
            did not produce parsable JSON)
        row: The row of data that was sent to the API; must support
            ``.get("han_id")`` and ``.get("firm_id")``
    Returns:
        None
    """
    log_name = "label_training_api_call_log.txt"

    # call_gemini_api yields None when the model reply could not be parsed
    if not isinstance(response, dict):
        write_log(f"Error: no usable response for the input data.\n\n{response}", log_name)
        return

    try:
        # The prompt asks for string values, so everything is coerced to int
        han_id = int(response.get("han_id"))
        firm_id = int(response.get("firm_id"))
        label = int(response.get("label"))
    except (TypeError, ValueError):
        write_log(f"Error: response has missing or non-numeric fields.\n\n{response}", log_name)
        return

    # Check if the han_id and firm_id match the input data
    if han_id != row.get("han_id") or firm_id != row.get("firm_id"):
        write_log(f"Error: han_id or firm_id do not match the input data.\n\n{response}", log_name)
        return

    # Only the two documented label values may reach the database
    if label not in (0, 1):
        write_log(f"Error: label must be 0 or 1.\n\n{response}", log_name)
        return

    print(f"han_id: {han_id}, firm_id: {firm_id}, label: {label}")
    insert_label(han_id, firm_id, label)

In [6]:
def process_data(min_similarity: float = 0.9, auto_match_similarity: float = 0.99):
    """Processes the data in the database

    Loads every candidate firm/PATSTAT pair whose similarity is at least
    ``min_similarity`` and labels it: pairs at or above
    ``auto_match_similarity`` are stored as matches directly, all others are
    sent to the Gemini API and the reply is handed to
    ``process_gemini_response``.

    Args:
        min_similarity: Lowest similarity a pair needs to be considered
        auto_match_similarity: Similarity from which a pair is assumed to be
            a match without asking the model
    Returns:
        None
    """
    sql = """
    SELECT DISTINCT firm_id, han_id, similarity, name, han_name, person_name, psn_name FROM patstat_firm_match
    JOIN firm_names USING(firm_id)
    JOIN patstat_data USING(han_id)
    WHERE similarity >= ?
    """
    data = con.execute(sql, [min_similarity]).fetchdf()
    for _, row in data.iterrows():
        # If the name jaro-winkler similarity is high enough, we assume it is a match
        if row['similarity'] >= auto_match_similarity:
            insert_label(int(row['han_id']), int(row['firm_id']), 1)
            continue

        response = call_gemini_api(row.to_json())
        if response is None:
            # The API call has already logged the unparsable reply; skip this
            # pair instead of aborting the whole run.
            continue
        process_gemini_response(response, row)

In [141]:
if __name__ == "__main__":
    # reset=True drops any existing `labels` table first, so each run of this cell starts from an empty table
    create_label_table(True)
    # Label every candidate pair selected by process_data (prints one line per pair labeled by the model)
    process_data()

han_id: 868544, firm_id: 5017, label: 1
han_id: 1384087, firm_id: 6729, label: 1
han_id: 1432953, firm_id: 6881, label: 1
han_id: 4653424, firm_id: 12495, label: 1
han_id: 369580, firm_id: 1933, label: 1
han_id: 10961, firm_id: 9575, label: 1
han_id: 1904196, firm_id: 13308, label: 1
han_id: 2409972, firm_id: 11581, label: 1
han_id: 1514031, firm_id: 13307, label: 1
han_id: 4276287, firm_id: 1384, label: 1
han_id: 2596447, firm_id: 12359, label: 1
han_id: 1904196, firm_id: 13308, label: 1
han_id: 1981073, firm_id: 9114, label: 1
han_id: 1438789, firm_id: 10592, label: 0
han_id: 1614162, firm_id: 12547, label: 1
han_id: 177565, firm_id: 1165, label: 1
han_id: 422812, firm_id: 2553, label: 1
han_id: 2449203, firm_id: 11764, label: 1
han_id: 1166688, firm_id: 5952, label: 1
han_id: 1527459, firm_id: 7221, label: 1
han_id: 1222770, firm_id: 6142, label: 1
han_id: 2853133, firm_id: 13313, label: 1
han_id: 2413635, firm_id: 11595, label: 1
han_id: 3516936, firm_id: 14924, label: 1
han_id: 46

In [154]:
# Load the candidate pairs with similarity >= 0.91 (with their name variants) into `test_data` for ad-hoc checks
sql = """
    SELECT DISTINCT firm_id, han_id, similarity, name, han_name, person_name, psn_name FROM patstat_firm_match
    JOIN firm_names USING(firm_id)
    JOIN patstat_data USING(han_id)
    WHERE similarity >= 0.91
    """
test_data = con.execute(sql).fetchdf()

In [30]:
# Smoke test on the first test row. process_gemini_response needs the source
# row as its second argument so it can check the ids returned by the model.
process_gemini_response(call_gemini_api(test_data.loc[0].to_json()), test_data.loc[0])

han_id: 868544, firm_id: 5017, label: 1


In [26]:
# Show the JSON payload that is passed to call_gemini_api for the first test row
test_data.loc[0].to_json()

'{"firm_id":5017,"han_id":868544,"similarity":1.0,"name":"FMC CORP","han_name":"FMC CORP","person_name":"FMC Corporation","psn_name":"FMC CORPORATION"}'

In [142]:
# Verify that no han_id is labeled as a match (label = 1) for more than one firm_id.
# An empty result means the check passes; the reverse direction is checked by a separate query.
sql = """
    SELECT han_id, COUNT(DISTINCT firm_id) AS distinct_firm_ids
    FROM labels
    WHERE label = 1
    GROUP BY han_id
    HAVING COUNT(DISTINCT firm_id) > 1;
"""

con.execute(sql).fetchdf()


Unnamed: 0,han_id,distinct_firm_ids


In [143]:
# Reverse check: list every firm_id that is labeled as a match (label = 1) for more than one han_id,
# together with the number of distinct han_ids it is matched to.
sql = """
    SELECT firm_id, COUNT(DISTINCT han_id) AS distinct_han_ids
    FROM labels
    WHERE label = 1
    GROUP BY firm_id
    HAVING COUNT(DISTINCT han_id) > 1;
"""

con.execute(sql).fetchdf()

Unnamed: 0,firm_id,distinct_han_ids
0,7221,2
1,9575,2
2,96,2
3,2094,2


In [67]:
# Number of distinct firms that have at least one row in `labels` (regardless of the label value)
con.execute("SELECT count(DISTINCT firm_id) FROM labels").fetchdf()

Unnamed: 0,count(DISTINCT firm_id)
0,251


In [144]:
# Inspect the labeled pairs of a single firm (firm_id = 12622) alongside the name variants that were compared
sql = """
    SELECT DISTINCT han_id, firm_id, label, name, han_name, person_name, psn_name FROM labels
    JOIN firm_names USING(firm_id)
    JOIN patstat_data USING(han_id)
    WHERE firm_id = 12622
"""

con.execute(sql).fetchdf()

Unnamed: 0,han_id,firm_id,label,name,han_name,person_name,psn_name
0,2709504,12622,1,SIEMENS AG,SIEMENS AG,Siemens Aktiengesellschaft,SIEMENS


In [146]:
# Spot-check the pairs that were labeled as a match (label = 1) although their similarity is below 0.92
sql = """
    SELECT DISTINCT han_id, firm_id, similarity, label, name, han_name, person_name, psn_name FROM labels
    JOIN firm_names USING(firm_id)
    JOIN patstat_data USING(han_id)
    JOIN patstat_firm_match USING(han_id, firm_id)
    WHERE label = 1
    AND similarity < 0.92
"""

con.execute(sql).fetchdf()

Unnamed: 0,han_id,firm_id,similarity,label,name,han_name,person_name,psn_name
0,4529991,10432,0.910185,1,ORION ENGINEERED CARBONS SA,ORION ENGINEERED CARBONS IP GMBH & CO KG,ORION ENGINEERED CARBONS IP GMBH & CO. KG,ORION ENGINEERED CARBONS IP & COMPANY
1,348886,2094,0.915714,1,BRITISH AMERICAN TOBACCO PLC,BRITISH AMERICAN TOBACCO INVESTMENTS LTD,BRITISH AMERICAN TOBACCO (INVESTMENTS) LTD,BRITISH AMERICAN TOBACCO (INVESTMENTS)
2,2220996,10616,0.915789,1,PAR TECHNOLOGY CORP,PA TECH,PA TECHNOLOGIES,PA TECHNOLOGY
3,1594010,6967,0.919048,1,INTRA-CELLULAR THERAPIES INC,INTRA CELLULAR THERAPIES INC,"INTRA-CELLULAR THERAPIES, INC.",INTRA-CELLULAR THERAPIES
4,348886,2094,0.915714,1,BRITISH AMERICAN TOBACCO PLC,BRITISH AMERICAN TOBACCO INVESTMENTS LTD,BRITISH AMERICAN TOBACCO (INVESTMENTS)LIMITED,BRITISH AMERICAN TOBACCO (INVESTMENTS)
5,1010737,5194,0.914161,1,FUJI SEAL INTERNATIONAL INC,FUJI SEAL INT INC,"Fuji Seal International, Inc.",FUJI SEAL INTERNATIONAL
6,3516936,14924,0.912,1,VOESTALPINE AG,VOESTALPINE METAL FORMING GMBH,VOESTALPINE METAL FORMING GMBH,VOESTALPINE METAL FORMING
7,2531957,12055,0.911111,1,SAMIL,SAMI LABS LTD,SAMI LABS LIMITED,SAMI LABS
8,153580,1080,0.91413,1,ASAHI KASEI CORP,ASAHI KASEI PHARMA CORP,ASAHI KASEI PHARMA CORPORATION,ASAHI KASEI PHARMA CORPORATION
9,1257687,14455,0.913636,1,UNILEVER PLC,UNILEVER NV,UNILEVER NV,UNILEVER


In [147]:
# List the match candidates whose similarity is below 0.91
con.execute("select * from patstat_firm_match where similarity < 0.91").fetchdf()

Unnamed: 0,han_id,firm_id,similarity
0,4594826,11529,0.835167
1,113103623,3501,0.886555
2,1543981,7033,0.873904
3,4753705,1904,0.847475
4,558024,14039,0.836667
...,...,...,...
608,2795411,12862,0.836963
609,810155,7256,0.858333
610,157358073,2617,0.832407
611,157528900,9524,0.834127


In [157]:
# List the pairs with similarity >= 0.99, i.e. the ones process_data labels as a match without calling the model
sql = """
SELECT DISTINCT firm_id, han_id, similarity, name, han_name, person_name, psn_name FROM patstat_firm_match
JOIN firm_names USING(firm_id)
JOIN patstat_data USING(han_id)
WHERE similarity >= 0.99
"""

con.execute(sql).fetchdf()

Unnamed: 0,firm_id,han_id,similarity,name,han_name,person_name,psn_name
0,5017,868544,1.00000,FMC CORP,FMC CORP,FMC Corporation,FMC CORPORATION
1,6729,1384087,1.00000,INCYTE CORP,INCYTE CORP,INCYTE CORPORATION,INCYTE
2,6881,1432953,1.00000,INTEL CORP,INTEL CORP,Intel Corporation,INTEL CORPORATION
3,1933,369580,1.00000,BLUEPRINT MEDICINES CORP,BLUEPRINT MEDICINES CORP,BLUEPRINT MEDICINES CORPORATION,BLUEPRINT MEDICINES
4,11581,2409972,1.00000,REDX PHARMA PLC,REDX PHARMA PLC,REDX PHARMA PLC,REDX PHARMA
...,...,...,...,...,...,...,...
97,5017,868544,1.00000,FMC CORP,FMC CORP,FMC CORP,FMC CORPORATION
98,6314,6013,1.00000,HONDA MOTOR CO LTD,HONDA MOTOR CO LTD,HONDA MOTOR CO LTD,HONDA MOTOR COMPANY
99,12321,514247,1.00000,SEB SA,SEB SA,SEB SA,SEB
100,8003,1699,0.98022,L'AIR LIQUIDE SOCIETE ANONYME POUR L'ETUDE ET ...,L AIR LIQUIDE SA POUR L ETUDE & L EXPLOITATION...,L`AIR LIQUIDE SOCIETE ANONYME POUR L`ETUDE ET ...,L'AIR LIQUIDE SOCIETE ANONYME POUR L'ETUDE ET ...
