In [42]:
import json
import time

import duckdb
from duckdb import ConstraintException
from duckdb import BinderException
from google import genai
from google.genai import types
from google.genai.errors import ClientError
from google.genai.errors import ServerError
from datetime import datetime
import re
from tqdm import tqdm

In [43]:
def get_api_key() -> str:
    """Gets the users Google Gemini api key from the config file

    Args:
        None

    Returns:
        The Google Gemini api key of the user
    """
    with open("../config.json", "r") as config_file:
        config = json.load(config_file)
    return config.get("gemini_api_key")

def write_log(msg: str, logfile: str):
    """Writes a message to the log file.

    Args:
        msg: The message to write to the log file
        logfile: The name of the log file

    Returns:
        None
    """
    file_path = f"../logs/{logfile}"
    with open(file_path, "a") as log_file:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_file.write(f"{timestamp}\n{msg}\n\n")

In [44]:
client = genai.Client(api_key=get_api_key())

# Connect to the database
con = duckdb.connect(database='patent_database', read_only=False)

In [45]:
def create_label_table(reset: bool = False):
    """Creates a table in the database to store the labels

    Args:
        reset: Whether to reset the table if it already exists
    Returns:
        None
    """
    if reset:
        con.execute("DROP TABLE IF EXISTS labels")

    con.execute("""
        CREATE TABLE IF NOT EXISTS labels (
            han_id INTEGER,
            firm_id INTEGER REFERENCES firm_names(firm_id),
            label INTEGER
        )
    """)

def insert_label(han_id: int, firm_id: int, label: int):
    """Inserts a label into the label table

    Args:
        han_id: The id of the han record
        firm_id: The id of the firm record
        label: The label of the record
    Returns:
        None
    """
    try:
        con.execute(f"""
            INSERT INTO labels
            VALUES ({han_id}, {firm_id}, {label})
        """)
    except ConstraintException as e:
        # Entry already in Database
        pass
    except BinderException as e:
        # None column (faulty gemini response)
        msg = f"""
            Faulty Response for:
            han_id: {han_id}
            firm_id: {firm_id}
            label: {label}
            """
        write_log(msg, "insert_exception_log.txt")

In [46]:
def call_gemini_api(input_data: str) -> dict:
    """Calls the Google Gemini API to determine if the name is a match to the han_name, person_name and psn_name

    Args:
        input_data: The data to be sent to the API
    Returns:
        The response from the API
    """

    prompt = f""" Your task is to determine if a given company name ('name') matches any of the provided company names from the PATSTAT database ('han_name', 'person_name', 'psn_name'). You must be very thorough in your analysis. Assume that the provided names are accurate and free of spelling errors. Focus on identifying exact or near-exact matches, considering only common and accepted abbreviations. Do NOT consider minor variations or potential spelling mistakes as valid matches.

    Input Data (JSON):
    {input_data}

    Output (JSON):
    {{\n\"firm_id\": \"{{firm_id}}\",\n    \"han_id\": \"{{han_id}}\",\n    \"label\": \"{{label}}\" <--- The value of 'label' MUST be either '0' or '1'. '0' indicates no match, and '1' indicates a match.\n}}",

    "description": "This prompt instructs the model to perform company name matching, comparing a given name against PATSTAT names and outputting a JSON object with the firm_id, han_id, and a label indicating a match (1) or no match (0). The model is instructed to be thorough.
    """
    response = None
    successful_api_call = False
    i = 0
    while not successful_api_call:
        try:
            response = client.models.generate_content(
                model='gemini-2.0-flash-thinking-exp-01-21',
                contents=prompt,
                config=types.GenerateContentConfig(
                temperature=0.0
                )
            )
            successful_api_call = True
        except (ClientError, ServerError) as e:
            i += 1
            if i == 5:
                error = f"""Failed to call the gemini api 5 times\n
                            Error: {e}\n
                            Input data: {input_data}"""
                write_log(error, "api_call_error.txt")
                return {} # We return here to avoid a none response
            else:
                time.sleep(30) # Sleep for 12 seconds, to stay in quota limit
                continue

    try:
        # Preprocess the model response
        text = response.text
        pattern = r"json\s*(\{.*?\})\s*"
        match = re.search(pattern, text, re.DOTALL)
        if match:
            json_str = match.group(1).strip()
            data = json.loads(json_str)
            return data
        else:
            print("Error: Gemini API did not return valid JSON.")
            write_log(f"Error: Gemini API did not return valid JSON.\n\n{response.text}", "label_training_api_call_log.txt")
    except json.JSONDecodeError:
        print("Error: Gemini API did not return valid JSON.")
        write_log(f"Error: Gemini API did not return valid JSON.\n\n{response.text}", "label_training_api_call_log.txt")


def process_gemini_response(response: dict, row: dict):
    """Processes the response from the Gemini API

    Args:
        response: The response from the Gemini API
        row: The row of data that was sent to the API
    Returns:
        None
    """
    han_id = int(response.get("han_id"))
    firm_id = int(response.get("firm_id"))
    # Check if the han_id and firm_id match the input data
    if han_id != row.get("han_id") or firm_id != row.get("firm_id"):
        write_log(f"Error: han_id or firm_id do not match the input data.\n\n{response}", "label_training_api_call_log.txt")
        return
    label = response.get("label")
    # print(f"han_id: {han_id}, firm_id: {firm_id}, label: {label}")
    insert_label(han_id, firm_id, label)

In [47]:
def process_data():
    """Processes the data in the database

    Args:
        None
    Returns:
        None
    """
    sql = """
    SELECT DISTINCT firm_id, han_id, similarity, name, han_name, person_name, psn_name FROM patstat_firm_match
    JOIN firm_names USING(firm_id)
    JOIN patstat_data USING(han_id)
    WHERE similarity >= 0.9
    """
    data = con.execute(sql).fetchdf()
    # We sample a set of 30% to later on train our machine learning model on this data
    # data = data.sample(frac=0.3, random_state=42)
    for _, row in tqdm(data.iterrows(), total=len(data), desc="Processing rows"):
        # If the name jaro-winkler similarity is >= .99, we assume it is a match
        han_id = row["han_id"]
        firm_id = row["firm_id"]
        if row['similarity'] >= 0.99:
            insert_label(han_id, firm_id, 1)
        else:
            successful_answer = False
            while not successful_answer:
                try:
                    response = call_gemini_api(row.to_json())
                    successful_answer = True
                    process_gemini_response(response, row)
                except TypeError as e:
                    continue
            

In [48]:
if __name__ == "__main__":
    create_label_table()
    process_data()

Processing rows:   2%|▏         | 724/30668 [1:04:44<44:37:38,  5.37s/it]


TypeError: expected string or bytes-like object

In [41]:
sql = """
    SELECT DISTINCT han_id, firm_id, similarity, label, name, han_name, person_name, psn_name FROM labels
    JOIN firm_names USING(firm_id)
    JOIN patstat_data USING(han_id)
    JOIN patstat_firm_match USING(han_id, firm_id)
    WHERE label = 0
    AND similarity > 0.95
"""

con.execute(sql).fetchdf()

Unnamed: 0,han_id,firm_id,similarity,label,name,han_name,person_name,psn_name
0,2050741,9459,0.985714,0,NANSIN CO LTD,NANSHIN CO LTD,"NANSHIN CO., LTD.",NANSHIN
1,1814772,7702,0.951515,0,KOAS CO LTD,KOWA CO LTD,"KOWA COMPANY, LTD.",KOWA COMPANY
2,3343903,15235,0.951961,0,WISE ITECH CO LTD,WIRE TECH CO LTD,"WIRE TECHNOLOGY CO., LTD.",WIRE TECHNOLOGIES
3,881136,8818,0.955863,0,MEIHO ENTERPRISE CO LTD,EISO ENTERPRISE CO LTD,"EISO ENTERPRISE CO., LTD.",EISO ENTERPRISE COMPANY
4,610060,3503,0.966667,0,DAISEKI CO LTD,DAIKI CO LTD,"DAIKI CO., LTD.",DAIKI
...,...,...,...,...,...,...,...,...
271,1075206,4661,0.974074,0,EXASOL AG,EXALOS AG,EXALOS AG,EXALOS AG
272,3045856,13835,0.957143,0,THERMO FISHER SCIENTIFIC INC,THERMO FISHER SCIENTIFIC SPA,THERMO FISHER SCIENTIFIC SPA,THERMO FISHER SCIENTIFIC
273,4393273,8508,0.977778,0,MAKUS INC,MAKU INC,MAKU INC.,MAKU
274,1166688,5952,0.961713,0,HANA PHARM CO LTD,HANMI PHARMA CO LTD,"HANMI PHARMACEUTICAL CO., LTD.","HANMI PHARMACEUTICAL CO., LTD."


In [12]:
# Verify, that there are no han_ids with multiple firm_ids and vice versa
sql = """
    SELECT han_id, COUNT(DISTINCT firm_id) AS distinct_firm_ids
    FROM labels
    WHERE label = 1
    GROUP BY han_id
    HAVING COUNT(DISTINCT firm_id) > 1;
"""

con.execute(sql).fetchdf()


Unnamed: 0,han_id,distinct_firm_ids


In [13]:
sql = """
    SELECT 
    SELECT firm_id, COUNT(DISTINCT han_id) AS distinct_han_ids
    FROM labels
    WHERE label = 1
    GROUP BY firm_id
    HAVING COUNT(DISTINCT han_id) > 1;
"""

con.execute(sql).fetchdf()

Unnamed: 0,firm_id,distinct_han_ids
0,7675,2
1,15517,2
2,1588,2
3,12622,2
4,7707,2
5,9140,2
6,3490,2
7,9575,2
8,6630,2
9,97,2


In [16]:
sql = """
        SELECT DISTINCT
        l.han_id,
        l.firm_id,
        pm.similarity,
        l.label,
        f.name,
        p.han_name,
        p.person_name,
        p.psn_name
    FROM labels l
    JOIN firm_names f USING (firm_id)
    JOIN patstat_data p USING (han_id)
    JOIN patstat_firm_match pm USING (han_id, firm_id)
    WHERE l.firm_id IN (
        SELECT firm_id
        FROM labels
        WHERE label = 1
        GROUP BY firm_id
        HAVING COUNT(DISTINCT han_id) > 1
    );
"""

con.execute(sql).fetchdf()

Unnamed: 0,han_id,firm_id,similarity,label,name,han_name,person_name,psn_name
0,554748,4255,0.948214,1,ELI LILLY AND CO,ELI LILLY & CO,ELI LILLY & COMPANY,ELI LILLY & COMPANY
1,3553888,15517,1.000000,1,YUSHIN PRECISION EQUIPMENT CO LTD,YUSHIN PRECISION EQUIPMENT CO LTD,"YUSHIN PRECISION EQUIPMENT CO., LTD.",YUSHIN PRECISION EQUIPMENT COMPANY
2,115209503,12622,0.981818,1,SIEMENS AG,"SIEMENS A.G, A BERLIN ET A MUNICH (ALLEMAGNE),","SIEMENS A.G, A BERLIN ET A MUNICH (ALLEMAGNE),",SIEMENS
3,617737,3466,1.000000,1,DAI NIPPON TORYO CO LTD,DAI NIPPON TORYO CO LTD,"DAI NIPPON TORYO CO., LTD",DAINIPPON TORYO COMPANY
4,10961,9575,0.933333,1,NESTLE SA,SOCIETE DES PRODUITS NESTLE SA,SOCIÉTÉ DES PRODUITS NESTLÉ S.A,SOCIÉTÉ DES PRODUITS NESTLÉ S.A
...,...,...,...,...,...,...,...,...
109,3553888,15517,1.000000,1,YUSHIN PRECISION EQUIPMENT CO LTD,YUSHIN PRECISION EQUIPMENT CO LTD,"YUSHIN PRECISION EQUIPMENT CO., LTD",YUSHIN PRECISION EQUIPMENT COMPANY
110,522078,7707,0.942857,1,KOBE STEEL LTD,CO LTD KOBE SEIKO SHO KOBE STEEL LTD,KK KOBE SEIKO SHO (KOBE STEEL LTD),KOBE STEEL
111,554748,4255,0.948214,1,ELI LILLY AND CO,ELI LILLY & CO,ELI LILLY AND CO,ELI LILLY & COMPANY
112,160829727,752,0.911111,1,AMGEN INC,АМДЖЕН ИНК.,АМДЖЕН ИНК.,AMGEN


In [67]:
con.execute("SELECT count(DISTINCT firm_id) FROM labels").fetchdf()

Unnamed: 0,count(DISTINCT firm_id)
0,251


In [144]:
sql = """
    SELECT DISTINCT han_id, firm_id, label, name, han_name, person_name, psn_name FROM labels
    JOIN firm_names USING(firm_id)
    JOIN patstat_data USING(han_id)
    WHERE firm_id = 12622
"""

con.execute(sql).fetchdf()

Unnamed: 0,han_id,firm_id,label,name,han_name,person_name,psn_name
0,2709504,12622,1,SIEMENS AG,SIEMENS AG,Siemens Aktiengesellschaft,SIEMENS


In [14]:
sql = """
    SELECT DISTINCT han_id, firm_id, similarity, label, name, han_name, person_name, psn_name FROM labels
    JOIN firm_names USING(firm_id)
    JOIN patstat_data USING(han_id)
    JOIN patstat_firm_match USING(han_id, firm_id)
    WHERE label = 1
    AND similarity < 0.92
"""

con.execute(sql).fetchdf()

Unnamed: 0,han_id,firm_id,similarity,label,name,han_name,person_name,psn_name
0,147724660,14816,0.9,1,VIEW INC,"VIEW, INC.","VIEW, INC.",VIEW
1,2052883,7771,0.916892,1,KONINKLIJKE PHILIPS NV,KON FILIPS NV,KONINKLEJKE FILIPS N. V.,PHILIPS ELECTRONICS
2,339950,1649,0.901527,1,BEE VECTORING TECHNOLOGIES INTERNATIONAL INC,BEE VECTORING TECH INC,BEE VECTORING TECHNOLOGY INC.,BEE VECTORING TECHNOLOGY INC.
3,192155990,5017,0.906667,1,FMC CORP,FMC CORPORATION,FMC CORPORATION,FMC CORPORATION
4,1646006,6416,0.908333,1,HUNTSMAN CORP,HUNTSMAN CORP AU PTY LTD,HUNTSMAN CORPORATION AUSTRALIA PTY LIMITED.,HUNTSMAN CORPORATION AUSTRALIA PTY
5,175297341,8407,0.9,1,LYFT INC,'LYFT,'LYFT,LYFT
6,249577,1588,0.914286,1,BASF SE,BASF,BASF,BASF (BADISCHE ANILIN & SODA FABRIK)
7,112929843,11604,0.9,1,REGIS CORP,REGIS GESELLSCHAFT M.B.H.,REGIS GESELLSCHAFT M.B.H.,REGIS
8,1547035,7419,0.919688,1,KAKEN PHARMACEUTICAL CO LTD,KAKEN PHARMA CO LTD,KAKEN PHARMACEUTICAL CO. LTD.,KAKEN PHARMACEUTICAL CO. LTD.
9,192143660,12572,0.918462,1,SHINWA CO LTD,"SHINWA CO., LTD.","SHINWA CO., LTD.","SHINWA CO., LTD."


In [147]:
con.execute("select * from patstat_firm_match where similarity < 0.91").fetchdf()

Unnamed: 0,han_id,firm_id,similarity
0,4594826,11529,0.835167
1,113103623,3501,0.886555
2,1543981,7033,0.873904
3,4753705,1904,0.847475
4,558024,14039,0.836667
...,...,...,...
608,2795411,12862,0.836963
609,810155,7256,0.858333
610,157358073,2617,0.832407
611,157528900,9524,0.834127


In [10]:
sql = """
SELECT DISTINCT firm_id, han_id, similarity, name, han_name, person_name, psn_name FROM patstat_firm_match
JOIN firm_names USING(firm_id)
JOIN patstat_data USING(han_id)
WHERE similarity >= 0.99
"""

con.execute(sql).fetchdf()

Unnamed: 0,firm_id,han_id,similarity,name,han_name,person_name,psn_name
0,7723,1683687,1.000000,KOITO MANUFACTURING CO LTD,KOITO MANUFACTURING CO LTD,"Koito Manufacturing Co., Ltd.",KOITO MANUFACTURING COMPANY
1,12719,2707040,1.000000,SINTOKOGIO LTD,SINTOKOGIO LTD,"Sintokogio, Ltd.",SINTOKOGIO
2,13849,1528192,1.000000,THK CO LTD,THK CO LTD,"THK Co., Ltd.",THK COMPANY
3,6239,721954,1.000000,HITACHI LTD,HITACHI LTD,"Hitachi, Ltd.",HITACHI
4,7070,1414787,1.000000,ISHIHARA SANGYO KAISHA LTD,ISHIHARA SANGYO KAISHA LTD,"Ishihara Sangyo Kaisha, Ltd.",ISHIHARA SANGYO KAISHA
...,...,...,...,...,...,...,...
4382,9825,2178666,1.000000,NIPPON FILCON CO LTD,NIPPON FILCON CO LTD,"NIPPON FILCON CO., LTD",NIPPON FILCON COMPANY
4383,5396,115238747,0.990909,GENERAL DYNAMICS CORP,"GENERAL DYNAMICS CORP., ONE ROCKEFELLER PLAZA,...","GENERAL DYNAMICS CORP., ONE ROCKEFELLER PLAZA,...","GENERAL DYNAMICS CORP., ONE ROCKEFELLER PLAZA,..."
4384,772,119480,1.000000,AMPHASTAR PHARMACEUTICALS INC,AMPHASTAR PHARMACEUTICALS INC,"AMPHASTAR PHARMACEUTICALS, INC",AMPHASTAR PHARMACEUTICALS
4385,8776,191510847,1.000000,MEDINCELL SA,MEDINCELL SA,MEDINCELL SA,MEDINCELL SA
