In [2]:
import pandas as pd
import duckdb as db

# We're using the google style guide for python (https://google.github.io/styleguide/pyguide.html)

In [3]:
# Open the DuckDB database 'patent_database' in read-write mode (read_only=False).
# `con` is the module-level connection that every function and query cell below uses.
con = db.connect(database='patent_database', read_only=False)

In [8]:
# Read the firm names and ISINs from the Stata (.dta) file.
# The variable name matters: the SQL below refers to the DataFrame as `firm_names_data`.
firm_names_data = pd.read_stata('../../files_maris/katharina_patents_name_matching/all_names_incl_isin.dta').reset_index()

In [9]:
# List the tables that currently exist in the database.
con.sql("show tables")

┌─────────┐
│  name   │
│ varchar │
├─────────┤
│ 0 rows  │
└─────────┘

In [14]:
# con.sql("DROP TABLE IF EXISTS patstat_firm_match;")
# con.sql("DROP TABLE IF EXISTS firm_isin;")
# con.sql("DROP TABLE IF EXISTS firm_names;")
# con.sql("DROP TABLE IF EXISTS firm_data_complete;")
# con.sql("DROP TABLE IF EXISTS patstat_data;")
# con.sql("DROP TABLE IF EXISTS patstat_test_data;")

In [10]:
def create_base_tables(firm_names_data: pd.DataFrame, reset: bool = False):
    """Creates the firm base tables used by the name matching process.

    Three tables are built from ``firm_names_data``:

    * ``firm_names``: one row per distinct upper-cased firm name, with a generated
      surrogate key ``firm_id``.
    * ``firm_isin``: maps every ISIN to the ``firm_id`` of its upper-cased name.
    * ``firm_data_complete``: ISIN, subsidiarybvdid and upper-cased name of every input row.

    The function can be re-run with ``reset=False``: ``firm_names`` and ``firm_isin`` are
    only populated while they are empty, so existing rows are neither duplicated nor
    re-inserted (which would violate their primary keys).

    NOTE(review): ``patstat_firm_match`` references ``firm_names``; the notebook drops it
    before calling this function with ``reset=True`` — presumably DuckDB refuses to drop
    ``firm_names`` while a referencing table exists. Confirm and keep that order.

    Args:
        firm_names_data: The data containing the names and ISINs of the firms. The SQL
            statements refer to this DataFrame by its variable name, so the parameter
            must keep the name ``firm_names_data``.
        reset: If True, firm_isin, firm_names and firm_data_complete are dropped (if
            they exist) and rebuilt from scratch.

    Returns:
        None
    """
    if reset:
        # -- 1. Drop old tables if they exist.
        # firm_isin references firm_names, so it has to go first.
        for table in ("firm_isin", "firm_names", "firm_data_complete"):
            con.sql(f"DROP TABLE IF EXISTS {table};")

    # -- 2. Create firm_names table with a generated surrogate key for each distinct name
    # We do this to get rid of duplicates for name matching
    sql = """
        CREATE TABLE IF NOT EXISTS firm_names (
            firm_id INTEGER PRIMARY KEY,
            name VARCHAR(255) NOT NULL
        );
    """
    con.sql(sql)

    # Only fill the table when it is empty: on a re-run without reset the generated
    # firm_ids would collide with the rows that are already stored.
    if con.execute("SELECT COUNT(*) FROM firm_names").fetchone()[0] == 0:
        sql = """
            INSERT INTO firm_names (firm_id, name)
            SELECT ROW_NUMBER() OVER (ORDER BY name) AS firm_id,
                   name
            FROM (
                SELECT DISTINCT UPPER(name) AS name
                FROM firm_names_data
            ) AS sub;
        """
        con.sql(sql)

    # -- 3. Create firm_isin table, referencing the firm_id via a JOIN on the name
    # We do this, since some firms have multiple ISINs (e.g. SHINWA CO LTD). This could be
    # due to different classes of shares / or subsidiaries?
    sql = """
        CREATE TABLE IF NOT EXISTS firm_isin (
           ISIN VARCHAR(255) PRIMARY KEY NOT NULL,
           firm_id INTEGER REFERENCES firm_names(firm_id)
        );
    """
    con.sql(sql)

    # Same guard as above: re-inserting the ISINs would violate the primary key.
    if con.execute("SELECT COUNT(*) FROM firm_isin").fetchone()[0] == 0:
        sql = """
            INSERT INTO firm_isin (ISIN, firm_id)
            SELECT i.ISIN, f.firm_id
            FROM (
               SELECT DISTINCT ISIN, UPPER(name) AS name
               FROM firm_names_data
            ) AS i
            JOIN firm_names AS f
            ON i.name = f.name;
        """
        con.sql(sql)

    # -- 4. This table can be used to extract the bdvids of the subsidiaries via ISIN or name
    # (CREATE TABLE IF NOT EXISTS ... AS is already a no-op when the table exists.)
    con.sql("CREATE TABLE IF NOT EXISTS firm_data_complete AS SELECT ISIN, subsidiarybvdid, UPPER(name) AS name FROM firm_names_data")

def create_match_table(reset: bool = False):
    """Ensures the patstat_firm_match table exists.

    The table holds one row per attempted match: the patstat publication id, the
    matched firm_id (nullable), the similarity score (nullable) and the name of the
    patstat table the publication came from.

    Args:
        reset: When True, an existing patstat_firm_match table is dropped first.

    Returns:
        None
    """
    if reset:
        con.sql("DROP TABLE IF EXISTS patstat_firm_match;")

    # TODO (translated from the note inside the SQL): make pat_publn_id an
    # INTEGER PRIMARY KEY later, once everything is clarified.
    con.sql("""
    CREATE TABLE IF NOT EXISTS patstat_firm_match (
        pat_publn_id INTEGER NOT NULL, -- pat_publn_id from patstat_data tables -> @todo später hier INTEGER PRIMARY KEY, wenn alles geklärt
        firm_id INTEGER REFERENCES firm_names(firm_id), -- firm_id from firm_names table
        similarity FLOAT, -- similarity between the patstat name and the firm name
        pat_table VARCHAR(255) NOT NULL-- name of the patstat_data table, e.g. patstat_data_A_B
    );
    """)

def create_patstat_database(patstat_data: pd.DataFrame, name: str, reset: bool = True):
    """Stores the name columns of a patstat DataFrame in the table ``patstat_<name>``.

    Only pat_publn_id and the upper-cased han_name, person_name and psn_name columns
    are kept. The SQL refers to the DataFrame by its variable name ``patstat_data``.

    Args:
        patstat_data: The patstat data to store.
        name: Suffix of the table name; the table is called ``patstat_<name>``.
        reset: When True, an existing table with that name is dropped first.

    Returns:
        None
    """
    if reset:
        con.sql(f"DROP TABLE IF EXISTS patstat_{name}")

    con.sql(f"""
        CREATE TABLE IF NOT EXISTS patstat_{name} AS
        SELECT pat_publn_id,
               UPPER(han_name) AS han_name,
               UPPER(person_name) AS person_name,
               UPPER(psn_name) AS psn_name
        FROM patstat_data""")


In [11]:
def insert_match(pat_publn_id: int, firm_id: int, similarity: float, pat_table: str):
    """Appends one row to the patstat_firm_match table.

    Args:
        pat_publn_id: The patstat publication id.
        firm_id: The id of the matched firm.
        similarity: The similarity between the patstat name and the firm name.
        pat_table: The name of the patstat table the publication belongs to.

    Returns:
        None
    """
    statement = """
        INSERT INTO patstat_firm_match (pat_publn_id, firm_id, similarity, pat_table)
        VALUES (?, ?, ?, ?);
    """
    # Values are bound as parameters, in the column order of the statement.
    row = (pat_publn_id, firm_id, similarity, pat_table)
    con.execute(statement, row)

def get_matching_firm(patstat_name: str, min_similarity: float = 0.8) -> pd.DataFrame:
    """Finds the firm in firm_names whose name is most similar to a patstat name.

    Similarity is measured with DuckDB's ``jaro_winkler_similarity``.

    The name is passed to DuckDB as a bound query parameter instead of being pasted
    into the SQL text. Names containing quotes therefore cannot break (or inject into)
    the statement, and apostrophes no longer have to be stripped from the name before
    it is compared.

    Args:
        patstat_name: The (upper-cased) applicant name from the patstat data.
        min_similarity: Candidates must score strictly above this value to be returned.

    Returns:
        A DataFrame with the columns firm_id, name and similarity. It holds the single
        best-scoring firm, or no rows when no firm scores above ``min_similarity``.
    """
    # The score is computed in a subquery so the outer WHERE can filter on it by name.
    sql = """
        SELECT firm_id, name, similarity
        FROM (
            SELECT
                firm_id,
                name,
                jaro_winkler_similarity(name, CAST(? AS VARCHAR)) AS similarity
            FROM firm_names
        ) AS scored
        WHERE similarity > ?
        ORDER BY similarity DESC
        LIMIT 1;
    """
    return con.execute(sql, [patstat_name, min_similarity]).fetchdf()


In [13]:
def extract_before_first_comma(s: str) -> str:
    """Returns the part of a string that precedes its first comma.

    Args:
        s: The input string.

    Returns:
        Everything before the first comma, or the whole string if it has no comma.
    """
    # partition() yields (head, separator, tail); head is the full string when
    # the separator does not occur.
    head, _, _ = s.partition(',')
    return head

def get_best_match(patstat_entry: pd.Series, accept_threshold: float = 0.92) -> pd.DataFrame:
    """Calculates the best name match for a patstat entry against our firm_names table.

    The name fields are tried in order of subjective importance
    (han_name > psn_name > person_name). The first field whose best candidate scores
    above ``accept_threshold`` is returned immediately, so the remaining fields are not
    queried at all. Otherwise the highest-scoring candidate is returned; on a tie the
    earlier field wins.

    Missing names (None / NaN) are skipped instead of raising, and a name that was
    already looked up for an earlier field is not queried again, since it would yield
    the same candidate.

    Args:
        patstat_entry: A single entry from the patstat data; must provide the fields
            han_name, psn_name and person_name.
        accept_threshold: Similarity above which a candidate is accepted right away.

    Returns:
        A one-row DataFrame (firm_id, name, similarity) with the best match, or None
        when no field produced a candidate.
    """
    # @todo check if the subjective order is correct
    fields_by_priority = ('han_name', 'psn_name', 'person_name')

    best_match = None
    best_similarity = -1
    tried_names = set()

    for field in fields_by_priority:
        raw_name = patstat_entry[field]
        if not isinstance(raw_name, str):
            # NULL names arrive as None/NaN from the database; there is nothing to match.
            continue

        # Everything after the first comma is location information, not part of the name.
        query_name = extract_before_first_comma(raw_name)
        if query_name in tried_names:
            continue
        tried_names.add(query_name)

        candidate = get_matching_firm(query_name)
        if len(candidate) == 0:
            continue

        similarity = candidate['similarity'].iloc[0]
        if similarity > accept_threshold:
            return candidate
        if similarity > best_similarity:
            best_similarity = similarity
            best_match = candidate

    # No field was good enough to accept outright: fall back to the best one seen.
    return best_match


def process_patstat_entry(patstat_entry: pd.Series, table_name: str):
    """Looks up the best firm match for one patstat entry and records the outcome.

    A row is written to patstat_firm_match in every case; when no firm matches,
    firm_id and similarity are stored as NULL.

    Args:
        patstat_entry: A single entry from the patstat data.
        table_name: The name of the patstat table.

    Returns:
        None
    """
    best = get_best_match(patstat_entry)

    if best is None:
        insert_match(patstat_entry['pat_publn_id'], None, None, table_name)
        return

    top_row = best.iloc[0]
    # Cast to built-in int/float: the values come out of pandas as numpy scalars,
    # which duckdb doesn't like as bound parameters.
    matched_firm_id = int(top_row['firm_id'])
    score = float(top_row['similarity'])
    insert_match(patstat_entry['pat_publn_id'], matched_firm_id, score, table_name)

In [14]:
def process_patstat_file(path: str, table_name: str):
    """Loads one patstat CSV file and matches every entry against the firm names.

    The file is stored as table ``patstat_<table_name>``, read back from the database
    and processed row by row; progress is printed every 1000 rows.

    Args:
        path: The path to the patstat file (semicolon-separated CSV).
        table_name: The name of the patstat table.

    Returns:
        None
    """
    print(f"Processing table {table_name}. Path: {path}")
    patstat_data = pd.read_csv(path, sep=';')
    create_patstat_database(patstat_data, table_name)

    # Work on the stored table so the names are the upper-cased ones.
    entries = con.sql(f"SELECT * FROM patstat_{table_name}").fetchdf()
    total = len(entries)
    for position, entry in entries.iterrows():
        if not position % 1000:
            print(f"Processing entry {position}/{total}")
        process_patstat_entry(entry, table_name)


In [33]:
# Rebuild the base tables and the match table, then match the 'patents_P' download.
# Everything runs inside an explicit transaction that this cell never commits, so the
# work stays pending until a later COMMIT or ROLLBACK.
con.sql("START TRANSACTION;")
create_base_tables(firm_names_data, True)
create_match_table(True)
process_patstat_file("../../files_maris/katharina_patents_name_matching/patent_download_Oct_2024/patents_P.csv", "patents_P")

Processing table patents_P. Path: ../../files_maris/katharina_patents_name_matching/patent_download_Oct_2024/patents_P.csv


  patstat_data = pd.read_csv(path, sep=';')


Processing entry 0/113729
Processing entry 1000/113729
Processing entry 2000/113729
Processing entry 3000/113729
Processing entry 4000/113729
Processing entry 5000/113729
Processing entry 6000/113729
Processing entry 7000/113729
Processing entry 8000/113729
Processing entry 9000/113729
Processing entry 10000/113729
Processing entry 11000/113729
Processing entry 12000/113729
Processing entry 13000/113729
Processing entry 14000/113729
Processing entry 15000/113729
Processing entry 16000/113729
Processing entry 17000/113729
Processing entry 18000/113729
Processing entry 19000/113729
Processing entry 20000/113729
Processing entry 21000/113729
Processing entry 22000/113729
Processing entry 23000/113729
Processing entry 24000/113729
Processing entry 25000/113729
Processing entry 26000/113729
Processing entry 27000/113729
Processing entry 28000/113729
Processing entry 29000/113729
Processing entry 30000/113729
Processing entry 31000/113729
Processing entry 32000/113729
Processing entry 33000/

In [26]:
# Inspect the matches with a similarity above 0.9, joined with the matched firm name
# and the patstat name columns of the patents_P table.
sql = """
    SELECT * FROM patstat_firm_match
    JOIN firm_names USING(firm_id)
    JOIN patstat_patents_P USING(pat_publn_id)
    WHERE similarity > 0.9
    """
con.execute(sql).fetchdf()

Unnamed: 0,pat_publn_id,firm_id,similarity,pat_table,name,han_name,person_name,psn_name
0,478133428,6368,0.940741,patents_P,HOYA CORP,HOWA CORP,HOWA CORPORATION,HOWA
1,478133506,8360,0.905072,patents_P,LSI INDUSTRIES INC,LIMKACO INDUSTRIES INC.,LIMKACO INDUSTRIES INC.,LIMKACO INDUSTRIES
2,479911269,13691,0.917647,patents_P,TEIJIN LTD,TEIJIN PHARMA LTD,TEIJIN PHARMA LIMITED,TEIJIN PHARMA
3,479911271,11372,1.000000,patents_P,QUALCOMM INC,QUALCOMM INC,QUALCOMM INCORPORATED,QUALCOMM
4,479911279,2187,0.915455,patents_P,BYC CO LTD,BBHC CO LTD,"BBHC CO., LTD.",BBHC
...,...,...,...,...,...,...,...,...
15311,605806299,10114,0.901587,patents_P,NUVOTEC CO LTD,NUCTECH CO LTD,NUCTECH COMPANY LIMITED,NUCTECH COMPANY
15312,605806299,10114,0.901587,patents_P,NUVOTEC CO LTD,NUCTECH CO LTD,NUCTECH COMPANY LIMITED,NUCTECH COMPANY
15313,605812599,5387,0.902273,patents_P,GENEHCO INC,CNRS,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE,CNRS (CENTRE NATIONAL DE LA RECHERCHE SCIENTIF...
15314,605812599,5387,0.902273,patents_P,GENEHCO INC,INSERM INSTITUT NAL DE LA SANTE & DE LA RECHER...,INSERM - INSTITUT NATIONAL DE LA SANTÉ ET DE L...,INSERM (INSTITUT NATIONAL DE LA SANTE ET DE LA...


In [28]:
# Spot check the matcher on a single applicant name.
get_matching_firm('NATIONAL UNIVERSITY CORP TOTTORI UNIVERSITY')

Unnamed: 0,firm_id,name,similarity
0,9493,NATIONAL INSTRUMENTS CORP,0.853628
1,9495,NATIONAL PRESTO INDUSTRIES INC,0.852614
2,9502,NATIONAL WESTERN LIFE GROUP INC,0.850211
3,9486,NATIONAL ENERGY SERVICES REUNITED CORP,0.849279
4,9480,NATIONAL ASSET RECOVERY CORP,0.84139
5,9497,NATIONAL RETAIL PROPERTIES INC,0.832725
6,9500,NATIONAL VISION HOLDINGS INC,0.827784
7,9491,NATIONAL HEALTH INVESTORS INC,0.824693
8,4603,EUROMONEY INSTITUTIONAL INVESTOR PLC,0.753289
9,6920,INTERNATIONAL DISTRIBUTIONS SERVICES PLC,0.732834


In [32]:
# Discard the open transaction, then list the tables that remain.
con.sql('ROLLBACK;')
con.sql('show tables')

┌─────────┐
│  name   │
│ varchar │
├─────────┤
│ 0 rows  │
└─────────┘

In [12]:
# NOTE(review): the recorded output of this cell is a CatalogException, since the table
# patstat_data does not exist (DuckDB suggested "patstat_test_data"). Confirm the
# intended table name or remove the cell.
con.sql("SELECT * from patstat_data").fetchdf()

CatalogException: Catalog Error: Table with name patstat_data does not exist!
Did you mean "patstat_test_data"?

In [191]:
# Discard the currently open transaction.
con.sql("ROLLBACK;")

In [138]:
# Look at the rows of patstat_data_test whose person_name contains 'KODAK'.
con.sql("select * from patstat_data_test WHERE person_name LIKE '%KODAK%' ").fetchdf()

Unnamed: 0,pat_publn_id,publn_date,publn_first_grant,appln_auth,granted,nb_applicants,nb_inventors,person_id,han_id,han_name,han_harmonized,person_name,psn_name,psn_id,applt_seq_nr,invt_seq_nr,person_ctry_code,person_address
0,314089889,1973-03-29,Y,BE,Y,1,2,15104528,115104528,"EASTMAN KODAK CY, 343 STATE STREET, ROCHESTER,...",0,"EASTMAN KODAK CY, 343 STATE STREET, ROCHESTER,...",KODAK,17494318,1,0,,
1,314089929,1973-04-02,Y,BE,Y,1,1,15104528,115104528,"EASTMAN KODAK CY, 343 STATE STREET, ROCHESTER,...",0,"EASTMAN KODAK CY, 343 STATE STREET, ROCHESTER,...",KODAK,17494318,1,0,,
2,314089955,1973-04-04,Y,BE,Y,1,2,15103494,115103494,"EASTMAN KODAK CY, 343, STATE STREET, ROCHESTER...",0,"EASTMAN KODAK CY, 343, STATE STREET, ROCHESTER...",KODAK,17494318,1,0,,
3,314095708,1973-05-03,Y,BE,Y,1,2,15104528,115104528,"EASTMAN KODAK CY, 343 STATE STREET, ROCHESTER,...",0,"EASTMAN KODAK CY, 343 STATE STREET, ROCHESTER,...",KODAK,17494318,1,0,,
4,314100892,1973-06-04,Y,BE,Y,1,1,15103494,115103494,"EASTMAN KODAK CY, 343, STATE STREET, ROCHESTER...",0,"EASTMAN KODAK CY, 343, STATE STREET, ROCHESTER...",KODAK,17494318,1,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,383135741,1973-08-23,Y,BE,Y,1,1,15108189,115108189,"EASTMAN KODAK CY, 343 STATE STREET, ROCHESTER,...",0,"EASTMAN KODAK CY, 343 STATE STREET, ROCHESTER,...",KODAK,17494318,1,0,,
117,530281747,2020-04-23,N,AU,Y,1,6,47660493,536862,KODAK ALARIS INC,2,KODAK ALARIS INC.,KODAK ALARIS,17494327,1,0,US,
118,592738119,2023-05-18,N,AU,N,1,3,47660493,536862,KODAK ALARIS INC,2,KODAK ALARIS INC.,KODAK ALARIS,17494327,1,0,US,
119,598314528,2023-09-07,Y,AU,Y,1,6,47660493,536862,KODAK ALARIS INC,2,KODAK ALARIS INC.,KODAK ALARIS,17494327,1,0,US,


In [85]:
# Spot check the matcher on a name with the location part (after the first comma) removed.
get_matching_firm('EASTMAN KODAK CY')

Unnamed: 0,firm_id,name,similarity
0,4070,EASTMAN KODAK CO,0.975
1,4069,EASTMAN CHEMICAL CO,0.835108
2,1171,ASTMAX CO LTD,0.764744
3,15403,YAMANAKA CO LTD,0.746338
4,4135,EDAP TMS SA,0.730682
5,4208,ELASTRON SA,0.730682
6,7442,KANAME KOGYO CO LTD,0.729673
7,13131,STAMEN CO LTD,0.723339
8,8892,METABANK AT,0.719487
9,12804,SLOGA AD KAC,0.719444


In [97]:
# Rebuild all tables from scratch. patstat_firm_match is dropped before the base tables
# are reset — presumably because it references firm_names(firm_id); confirm.
con.sql("DROP TABLE IF EXISTS patstat_firm_match;")
create_base_tables(firm_names_data, True)
create_match_table(True)

In [99]:
# Show the column definitions of patstat_firm_match.
con.sql("describe patstat_firm_match;")

┌──────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name  │ column_type │  null   │   key   │ default │  extra  │
│   varchar    │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ pat_publn_id │ INTEGER     │ NO      │ PRI     │ NULL    │ NULL    │
│ firm_id      │ INTEGER     │ YES     │ NULL    │ NULL    │ NULL    │
│ similarity   │ FLOAT       │ YES     │ NULL    │ NULL    │ NULL    │
│ pat_table    │ VARCHAR     │ NO      │ NULL    │ NULL    │ NULL    │
└──────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [41]:
# Look up the distinct rows of the firm input data whose name contains 'BASF'.
con.sql("SELECT DISTINCT * from firm_names_data where name like '%BASF%'").fetchdf()

Unnamed: 0,name,isin
0,BASF SE,DE000BASF111


In [52]:
# Count the distinct (ISIN, name) pairs in the input data.
# The closing parenthesis of COUNT(...) was missing, which made the statement a syntax error.
con.sql("SELECT COUNT(DISTINCT (ISIN, name)) FROM firm_names_data").fetchdf()

Unnamed: 0,"count(DISTINCT main.row(ISIN, ""name""))"
0,15643


In [53]:
# Count the distinct firm names in the input data.
con.sql("SELECT COUNT(DISTINCT name) FROM firm_names_data").fetchdf()

Unnamed: 0,"count(DISTINCT ""name"")"
0,15638


In [21]:
#create_base_tables(firm_names_data, False)
# List the tables that currently exist in the database.
con.sql("show tables;")

┌────────────────────┐
│        name        │
│      varchar       │
├────────────────────┤
│ firm_data_complete │
│ firm_isin          │
│ firm_names         │
└────────────────────┘

In [30]:
# List the firm names that occur with more than one distinct ISIN, together with each
# of their ISINs. These are the cases the separate firm_isin table exists for.
sql = """
    SELECT DISTINCT name, ISIN
    FROM  firm_data_complete
    WHERE name IN(
        SELECT
            name
        FROM firm_data_complete
        GROUP BY name
        HAVING COUNT(DISTINCT ISIN) > 1
    );
"""

con.execute(sql).fetchdf()

Unnamed: 0,name,isin
0,PEOPLES BANCORP INC,US70978T1079
1,PEOPLES BANCORP INC,US7097891011
2,CITIZENS FINANCIAL CORP,US1746132083
3,JTEC CORP,JP3386260008
4,SHOEI CO LTD,JP3360850006
5,CITIZENS FINANCIAL CORP,US17461K1016
6,SHOEI CO LTD,JP3360900009
7,SHINWA CO LTD,JP3384730002
8,SHINWA CO LTD,JP3384710004
9,JTEC CORP,JP3386660009
