In [2]:
import pandas as pd
import duckdb as db
from pandas import read_csv

# We follow the Google Python Style Guide (https://google.github.io/styleguide/pyguide.html).

In [3]:
# Establish a read-write connection to the DuckDB database 'patent_database'.
# Every function in this notebook uses this module-level connection `con`.
con = db.connect(database='patent_database', read_only=False)

In [8]:
# Read the firm names (incl. ISINs) from the Stata (.dta) file.
# reset_index() replaces the index with a fresh 0..n-1 range and keeps the
# previous index as an additional column.
firm_names_data = pd.read_stata('../../files_maris/katharina_patents_name_matching/all_names_incl_isin.dta').reset_index()

In [9]:
# List the tables that currently exist in the connected database.
con.sql("show tables")

┌─────────┐
│  name   │
│ varchar │
├─────────┤
│ 0 rows  │
└─────────┘

In [14]:
# con.sql("DROP TABLE IF EXISTS patstat_firm_match;")
# con.sql("DROP TABLE IF EXISTS firm_isin;")
# con.sql("DROP TABLE IF EXISTS firm_names;")
# con.sql("DROP TABLE IF EXISTS firm_data_complete;")
# con.sql("DROP TABLE IF EXISTS patstat_data;")
# con.sql("DROP TABLE IF EXISTS patstat_test_data;")

In [10]:
def create_base_tables(firm_names_data: pd.DataFrame, reset: bool = False):
    """Creates and populates the base tables for the name matching process.

    Three tables are created on the module-level connection ``con``:

    * ``firm_names``: one row per distinct upper-cased firm name, keyed by a
      generated surrogate key ``firm_id``.
    * ``firm_isin``: maps every ISIN to the ``firm_id`` of its firm name.
    * ``firm_data_complete``: ISIN, subsidiarybvdid and upper-cased name of
      every input row.

    All three tables follow "create if missing" semantics: an existing,
    already populated table is left untouched, so calling this function again
    with ``reset=False`` is a no-op instead of failing with a primary-key
    violation on the repeated INSERT.

    Args:
        firm_names_data: The data containing the names and ISINs of the firms.
            The SQL below reads the columns ``name``, ``ISIN`` and
            ``subsidiarybvdid``. The frame is referenced by its variable name
            inside the SQL statements, so this parameter must keep its name.
        reset: If True, the tables will be dropped if they already exist and
            are rebuilt from ``firm_names_data``.

    Returns:
        None
    """
    if reset:
        # -- 1. Drop old tables if they exist.
        # firm_isin references firm_names, so it is dropped first.
        # NOTE(review): patstat_firm_match also references firm_names(firm_id);
        # DuckDB may refuse to drop firm_names while that table exists — confirm
        # and, if so, reset the match table before calling this function.
        con.sql("DROP TABLE IF EXISTS firm_isin;")
        con.sql("DROP TABLE IF EXISTS firm_names;")
        con.sql("DROP TABLE IF EXISTS firm_data_complete;")

    # -- 2. Create firm_names table with a generated surrogate key for each distinct name
    # We do this to get rid of duplicates for name matching
    sql = """
        CREATE TABLE IF NOT EXISTS firm_names (
            firm_id INTEGER PRIMARY KEY,
            name VARCHAR(255) NOT NULL
        );
    """
    con.sql(sql)

    # Only populate a fresh table: ROW_NUMBER() always restarts at 1, so a
    # second INSERT into a filled table would collide with existing firm_ids.
    firm_names_is_empty = con.execute("SELECT COUNT(*) FROM firm_names").fetchone()[0] == 0
    if firm_names_is_empty:
        sql = """
            INSERT INTO firm_names (firm_id, name)
            SELECT ROW_NUMBER() OVER (ORDER BY name) AS firm_id,
                   name
            FROM (
                SELECT DISTINCT UPPER(name) AS name
                FROM firm_names_data
            ) AS sub;
        """
        con.sql(sql)

    # -- 3. Create firm_isin table, referencing the firm_id via a JOIN on the name
    # We do this, since some firms have multiple ISINs (e.g. SHINWA CO LTD). This could be due to different classes of shares / or subsidiaries?
    sql = """
        CREATE TABLE IF NOT EXISTS firm_isin (
           ISIN VARCHAR(255) PRIMARY KEY NOT NULL,
           firm_id INTEGER REFERENCES firm_names(firm_id)
        );
    """
    con.sql(sql)

    # Same guard as above: ISIN is the primary key, re-inserting would fail.
    firm_isin_is_empty = con.execute("SELECT COUNT(*) FROM firm_isin").fetchone()[0] == 0
    if firm_isin_is_empty:
        sql = """
            INSERT INTO firm_isin (ISIN, firm_id)
            SELECT i.ISIN, f.firm_id
            FROM (
               SELECT DISTINCT ISIN, UPPER(name) AS name
               FROM firm_names_data
            ) AS i
            JOIN firm_names AS f
            ON i.name = f.name;
        """
        con.sql(sql)

    # -- 4. This table can be used to extract the bdvids of the subsidiaries via ISIN or name
    con.sql("CREATE TABLE IF NOT EXISTS firm_data_complete AS SELECT ISIN, subsidiarybvdid, UPPER(name) AS name FROM firm_names_data")

def create_match_table(reset: bool = False):
    """Creates the patstat_firm_match table if it does not exist yet.

    The table stores one row per processed patstat entry: the publication id,
    the matched firm id (referencing firm_names), the similarity of the match
    and the name of the patstat table the entry came from.

    Args:
        reset: If True, an existing patstat_firm_match table is dropped first.

    Returns:
        None
    """
    if reset:
        con.sql("DROP TABLE IF EXISTS patstat_firm_match;")

    # The German @todo inside the SQL text reads: "later make this an
    # INTEGER PRIMARY KEY once everything is clarified".
    con.sql("""
    CREATE TABLE IF NOT EXISTS patstat_firm_match (
        pat_publn_id INTEGER NOT NULL, -- pat_publn_id from patstat_data tables -> @todo später hier INTEGER PRIMARY KEY, wenn alles geklärt
        firm_id INTEGER REFERENCES firm_names(firm_id), -- firm_id from firm_names table
        similarity FLOAT, -- similarity between the patstat name and the firm name
        pat_table VARCHAR(255) NOT NULL-- name of the patstat_data table, e.g. patstat_data_A_B
    );
    """)

def create_patstat_database(patstat_data: pd.DataFrame, name: str, reset: bool = True):
    """Stores the patstat data as table ``patstat_<name>`` in the database.

    The table keeps ``pat_publn_id`` and the upper-cased columns ``han_name``,
    ``person_name`` and ``psn_name``.

    Args:
        patstat_data: The data containing the patstat data. It is referenced
            by its variable name in the SQL statement, so the parameter must
            keep this name.
        name: The name (extension) of the table.
        reset: If True, the table will be dropped if it already exists.

    Returns:
        None
    """
    if reset:
        con.sql(f"DROP TABLE IF EXISTS patstat_{name}")

    create_stmt = f"""
        CREATE TABLE IF NOT EXISTS patstat_{name} AS
        SELECT pat_publn_id,
               UPPER(han_name) AS han_name,
               UPPER(person_name) AS person_name,
               UPPER(psn_name) AS psn_name
        FROM patstat_data"""
    con.sql(create_stmt)


In [36]:
def insert_match(pat_publn_id: int, firm_id: int, similarity: float, pat_table: str):
    """Stores one match result in the patstat_firm_match table.

    Args:
        pat_publn_id: The patstat publication id.
        firm_id: The id of the matched firm.
        similarity: The similarity between the patstat name and the firm name.
        pat_table: The name of the patstat table the entry belongs to.

    Returns:
        None
    """
    insert_stmt = """
        INSERT INTO patstat_firm_match (pat_publn_id, firm_id, similarity, pat_table)
        VALUES (?, ?, ?, ?);
    """
    # Values are bound as parameters in column order.
    params = (pat_publn_id, firm_id, similarity, pat_table)
    con.execute(insert_stmt, params)

def get_matching_firm(patstat_name: str) -> pd.DataFrame:
    """Finds the firms whose name is most similar to a patstat name.

    The name is compared against every row of the firm_names table using the
    Jaro-Winkler similarity.

    The name is passed to the database as a bound parameter instead of being
    pasted into the SQL text. Quotes in a name therefore can neither break
    the statement nor inject SQL, and names containing an apostrophe are
    compared unmodified.

    Args:
        patstat_name: The name from the patstat data to look up.

    Returns:
        A DataFrame with the columns firm_id, name and similarity. It contains
        at most 10 rows with a similarity greater than 0.8, sorted by
        similarity in descending order. It is empty if no firm qualifies.
    """
    sql = """
        SELECT
            firm_id,
            name,
            jaro_winkler_similarity(name, CAST(? AS VARCHAR)) AS similarity
        FROM firm_names
        WHERE similarity > 0.8
        ORDER BY similarity DESC
        LIMIT 10;
    """
    return con.execute(sql, [patstat_name]).fetchdf()


In [37]:
def extract_before_first_comma(s: str) -> str:
    """Returns the part of a string that precedes its first comma.

    Args:
        s: The input string.

    Returns:
        Everything before the first comma, or the whole string if it does not
        contain a comma.
    """
    # partition() yields the full string as head when no comma is present.
    head, _separator, _rest = s.partition(',')
    return head

def get_best_match(patstat_entry: pd.Series) -> pd.DataFrame:
    """Calculates the best name match for a patstat entry against our firm_names table.

    The name fields are tried in the order of their subjective importance
    (han_name > psn_name > person_name). The first field whose top candidate
    has a similarity > 0.9 wins immediately; the remaining fields are then not
    queried at all. Otherwise the candidates of the field with the highest top
    similarity are returned.

    Name fields that are missing (None / NaN, i.e. not a string) are skipped
    instead of raising an AttributeError.

    Args:
        patstat_entry: A single entry from the patstat data. Must provide the
            keys 'han_name', 'psn_name' and 'person_name'.

    Returns:
        The candidate DataFrame (as returned by get_matching_firm, best match
        in the first row) of the winning name field, or None if no field
        produced any candidate.
    """
    best_candidates = None
    best_similarity = -1

    # @todo check if the subjective order is correct
    for column in ('han_name', 'psn_name', 'person_name'):
        raw_name = patstat_entry[column]
        if not isinstance(raw_name, str):
            # Missing value in the source data; nothing to match on.
            continue

        # We have to use the extract_before_first_comma function to get rid of the location information in the name
        candidates = get_matching_firm(extract_before_first_comma(raw_name))
        if len(candidates) == 0:
            continue

        # Candidates are sorted by similarity, so the first row is the best one.
        similarity = candidates['similarity'].iloc[0]
        if similarity > 0.9:
            return candidates
        if similarity > best_similarity:
            best_similarity = similarity
            best_candidates = candidates

    # If we don't have a match with a similarity > 0.9, we return the best match
    return best_candidates


def process_patstat_entry(patstat_entry: pd.Series, table_name: str):
    """Processes a single entry from the patstat data and tries to find a matching firm.

    Exactly one row is written to patstat_firm_match per entry: the best match
    if there is one, otherwise a row with firm_id and similarity set to NULL.

    Args:
        patstat_entry: A single entry from the patstat data. Must provide
            'pat_publn_id' and the name fields used by get_best_match.
        table_name: The name of the patstat table.

    Returns:
        None
    """
    # Values taken out of a pandas row can be numpy scalars. Convert them to
    # plain Python numbers before binding them as SQL parameters.
    pat_publn_id = int(patstat_entry['pat_publn_id'])

    candidates = get_best_match(patstat_entry)
    if candidates is None:
        # Record the entry as processed but unmatched.
        insert_match(pat_publn_id, None, None, table_name)
        return

    best = candidates.iloc[0]
    insert_match(pat_publn_id, int(best['firm_id']), float(best['similarity']), table_name)

In [38]:
def process_patstat_file(path: str, table_name: str):
    """Loads a patstat CSV file into the database and matches every entry.

    The file is read with ';' as separator, stored as table
    patstat_<table_name>, and each row of that table is then passed to
    process_patstat_entry. Progress is printed every 1000 entries.

    Args:
        path: The path to the patstat file.
        table_name: The name of the patstat table.

    Returns:
        None
    """
    print(f"Processing table {table_name}. Path: {path}")

    patstat_data = pd.read_csv(path, sep=';')
    create_patstat_database(patstat_data, table_name)

    entries = con.sql(f"SELECT * FROM patstat_{table_name}").fetchdf()
    total = len(entries)
    for position, entry in entries.iterrows():
        if not position % 1000:
            print(f"Processing entry {position}/{total}")
        process_patstat_entry(entry, table_name)


In [32]:
# Open a transaction, then rebuild the base tables and the match table from
# scratch (existing tables are dropped first).
con.sql("START TRANSACTION;")
create_base_tables(firm_names_data, reset=True)
create_match_table(reset=True)
# process_patstat_file("../../files_maris/katharina_patents_name_matching/patent_download_Oct_2024/patents_P.csv", "patents_P")

In [31]:
# All matches with a similarity above 0.9, joined with the matched firm name
# and the patstat entries of table patstat_patents_P.
sql = """
    SELECT * FROM patstat_firm_match
    JOIN firm_names USING(firm_id)
    JOIN patstat_patents_P USING(pat_publn_id)
    WHERE similarity > 0.9
    """
# con.execute(sql).fetchdf()

In [47]:
# Matches with a similarity above 0.9 whose firm name contains PFIZER.
# NOTE(review): the output below contains rows whose han_name is not Pfizer
# although the match has similarity 1.0. Presumably pat_publn_id is not unique
# in patstat_patents_P (several applicants per publication), so joining on
# pat_publn_id alone fans one match out to all co-applicant rows — confirm.
sql = """
    SELECT * FROM patstat_firm_match
    JOIN firm_names USING(firm_id)
    JOIN patstat_patents_P USING(pat_publn_id)
    WHERE similarity > 0.9
    AND name LIKE '%PFIZER%'
    """

# con.execute(sql).fetchdf()

Unnamed: 0,pat_publn_id,firm_id,similarity,pat_table,name,han_name,person_name,psn_name
0,605812521,10836,1.0,patents_P,PFIZER INC,PFIZER INC,PFIZER INC.,PFIZER
1,603509561,10836,1.0,patents_P,PFIZER INC,PFIZER INC,PFIZER INC.,PFIZER
2,602233505,10836,1.0,patents_P,PFIZER INC,PFIZER INC,PFIZER INC.,PFIZER
3,602233495,10836,1.0,patents_P,PFIZER INC,PFIZER INC,PFIZER INC.,PFIZER
4,601626819,10836,1.0,patents_P,PFIZER INC,PFIZER INC,PFIZER INC.,PFIZER
...,...,...,...,...,...,...,...,...
175,490807634,10836,1.0,patents_P,PFIZER INC,PFIZER INC,"PFIZER, INC.",PFIZER
176,576027687,10836,1.0,patents_P,PFIZER INC,INSERM INSTITUT NAL DE LA SANTE & DE LA RECHER...,INSERM (INSTITUT NATIONAL DE LA SANTÉ ET DE LA...,INSERM (INSTITUT NATIONAL DE LA SANTE ET DE LA...
177,573735084,10836,1.0,patents_P,PFIZER INC,INSERM INSTITUT NAL DE LA SANTE & DE LA RECHER...,INSERM (INSTITUT NATIONAL DE LA SANTÉ ET DE LA...,INSERM (INSTITUT NATIONAL DE LA SANTE ET DE LA...
178,547986664,10836,1.0,patents_P,PFIZER INC,INSERM INSTITUT NAL DE LA SANTE & DE LA RECHER...,INSERM (INSTITUT NATIONAL DE LA SANTE ET DE LA...,INSERM (INSTITUT NATIONAL DE LA SANTE ET DE LA...


In [40]:
# Load the saved sample of suspicious Pfizer matches for debugging.
buggy_data = pd.read_csv('../../files_maris/debug_data/pfizer_weird.csv')

In [45]:
# Inspect one suspicious row (index label 1) of the debug sample.
buggy_data.loc[1]

Unnamed: 0                                    102
pat_publn_id                            540273784
firm_id                                     10836
similarity                                    1.0
pat_table                               patents_P
name                                   PFIZER INC
han_name               BOSTON MEDICAL CENTER CORP
person_name     BOSTON MEDICAL CENTER CORPORATION
psn_name        BOSTON MEDICAL CENTER CORPORATION
Name: 1, dtype: object

In [44]:
# Re-run the matching for the suspicious row to compare it with the stored match.
get_best_match(buggy_data.loc[1])

Unnamed: 0,firm_id,name,similarity
0,2008,BOSTON BEER COMPANY INC,0.877759
1,2010,BOSTON SCIENTIFIC CORP,0.857653
2,2009,BOSTON PROPERTIES INC,0.825458
3,2022,BOWLIN TRAVEL CENTERS INC,0.808065
4,15154,WHEATON PRECIOUS METALS CORP,0.801532


In [None]:
# Roll back the open transaction (see START TRANSACTION above), then list the
# tables that remain.
con.sql('ROLLBACK;')
con.sql('show tables')