# **PROCESS FIRM-LEVEL FEATURES**

In [406]:
import os
import dotenv
import geopandas as gpd
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim, GoogleV3

from pocketknife.database import (
    connect_database, read_from_database)

from success_prediction.config import (
    PROJ_ROOT, RAW_DATA_DIR, EXTERNAL_DATA_DIR, PROCESSED_DATA_DIR)

# Load environment variables from the .env file in the project root
# (os.getenv('GOOGLE_GEOCODE_API_KEY') is read further down in this notebook).
dotenv_path = os.path.join(PROJ_ROOT, '.env')
dotenv.load_dotenv(dotenv_path)

True

In [2]:
# Legal-form id -> English label. The trailing comment on each entry gives the German name and
# whether the form belongs to the growth-oriented sample (INCLUDE) or not (EXCLUDE).
# NOTE(review): the English labels of ids 15 and 16 say 'Limited Partnership ...' while the German
# names are 'Investmentgesellschaft ...' (investment company) — confirm the labels are intended.
id2legalform = {
    1: 'Sole proprietorship',  # Einzelunternehmen  ->  EXCLUDE
    2: 'General Partnership',  # Kollektivgesellschaft  ->  INCLUDE
    3: 'Corporation',  # Aktiengesellschaft  ->  INCLUDE
    4: 'Limited Liability Company',  # Gesellschaft mit beschränkter Haftung  ->  INCLUDE
    5: 'Cooperative',  # Genossenschaft  ->  EXCLUDE
    6: 'Association',  # Verein  ->  EXCLUDE
    7: 'Foundation',  # Stiftung  ->  EXCLUDE
    8: 'Public sector institution',  # Institut des öffentlichen Rechts  ->  EXCLUDE
    9: 'Branch',  # Zweigniederlassung  ->  EXCLUDE
    10: 'Limited Partnership',  # Kommanditgesellschaft  ->  INCLUDE
    11: 'Foreign branch',  # Zweigniederlassung einer ausl. Gesellschaft  ->  EXCLUDE
    12: 'Corporation with unlimited partners',  # Kommanditaktiengesellschaft  ->  INCLUDE
    13: 'Special legal form',  # Besondere Rechtsform  ->  EXCLUDE
    14: 'Ownership in undivided shares',  # Gemeinderschaft  ->  EXCLUDE
    15: 'Limited Partnership for collective investment schemes with a fixed capital',  # Investmentgesellschaft mit festem Kapital  ->  INCLUDE
    16: 'Limited Partnership for collective investment schemes with a variable capital',  # Investmentgesellschaft mit variablem Kapital  ->  INCLUDE
    17: 'Limited Partnership for collective investment schemes',  # Kommanditgesellschaft für kollektive Kapitalanlagen  ->  INCLUDE
    18: 'Non commercial power of attorney',  # Nichtkaufmännische Prokura  ->  EXCLUDE
    19: '(unknown)',  # (unbekannt)  ->  EXCLUDE
}

# Ids of the legal forms marked INCLUDE above (the growth-oriented firm sample)
growth_oriented_legal_forms = [2, 3, 4, 10, 12, 15, 16, 17]

In [None]:
# Prediction sample: growth-oriented firms (legal forms in `growth_oriented_legal_forms`) whose
# founding SHAB entry ('status.neu') has a shab_date before 2024-01-01. Branches are excluded.
# NOTE(review): the query applies no lower date bound — the sample presumably starts in 2016
# because of the coverage of the underlying tables; confirm.
if not growth_oriented_legal_forms:
    raise ValueError("growth_oriented_legal_forms must contain at least one legal form id")

# Render the id list once so the SQL filter cannot drift from the Python definition.
# int() guarantees that only plain integers are interpolated into the statement.
legal_form_ids_sql = ', '.join(str(int(form_id)) for form_id in growth_oriented_legal_forms)

query_founded_firms = f"""
    SELECT
        base.ehraid,
        base.uid,

        -- Dissolution information
        base.delete_date,
        dissolution.reason_for_dissolution,
        dissolution.liquidation,
        dissolution.bankruptcy,
        dissolution.other_exit,

        -- Names
        base.name AS current_name,
        founding_name.firm_name AS founding_name,

        -- Legal forms
        base.legal_form_id AS current_legal_form,
        legal_form.legal_form_id AS founding_legal_form,

        -- Purpose
        base.purpose_raw AS current_purpose,
        founding_purpose.purpose_raw AS founding_purpose,

        -- Current address
        COALESCE(address.street, '') || ' ' || COALESCE(address.house_number, '') AS current_street,
        address.town AS current_town,
        address.swiss_zip_code AS current_zip_code,
        address.country AS current_country,

        -- Founding address
        founding_address.street AS founding_street,
        founding_address.town AS founding_town,
        founding_address.postal_code AS founding_zip_code,
        founding_address.town_bfs_gmde_code_latest AS founding_bfs_code,

        -- Founding SHAB entry
        shab.shab_id,
        shab.shab_date AS founding_date,
        shab.message AS founding_message

    FROM zefix_release_159.base base

    -- Founding SHAB messages
    INNER JOIN (
        SELECT s.ehraid, s.shab_id, s.shab_date, s.message
        FROM zefix_release_159.shab s
        INNER JOIN zefix_release_159.shab_mutation sm ON s.shab_id = sm.shab_id
        WHERE sm.description = 'status.neu'
    ) AS shab ON base.ehraid = shab.ehraid

    -- Current address
    LEFT JOIN zefix_release_159.address address ON base.ehraid = address.ehraid

    -- Founding address
    LEFT JOIN (
        SELECT DISTINCT hfa.ehraid, hfa.street, hfa.postal_code, hfa.town, hfa.town_bfs_gmde_code_latest
        FROM zefix.history_firm_addresses hfa
        WHERE hfa.founding = TRUE
    ) AS founding_address ON base.ehraid = founding_address.ehraid

    -- Founding name
    LEFT JOIN (
        SELECT DISTINCT hfn.ehraid, hfn.firm_name
        FROM zefix.history_firm_names hfn
        WHERE hfn.founding = TRUE
    ) AS founding_name ON base.ehraid = founding_name.ehraid

    -- Founding purpose
    LEFT JOIN (
        SELECT DISTINCT hp.ehraid, hp.purpose_raw
        FROM zefix.history_purpose hp
        WHERE hp.founding_purpose = TRUE
    ) AS founding_purpose ON base.ehraid = founding_purpose.ehraid

    -- Founding legal form
    LEFT JOIN (
        SELECT DISTINCT hlf.ehraid, hlf.legal_form_id
        FROM zefix.history_founding_legal_form hlf
    ) AS legal_form ON base.ehraid = legal_form.ehraid

    -- Dissolution information
    LEFT JOIN (
        -- Only keep the last dissolution message as the final dissolution
        SELECT hd.ehraid, hd.shab_date, hd.reason_for_dissolution, hd.liquidation, hd.bankruptcy, hd.other_exit
        FROM (
            SELECT *,
                ROW_NUMBER() OVER (PARTITION BY ehraid ORDER BY shab_date DESC) AS rn
            FROM zefix.history_dissolutions
        ) hd
        WHERE hd.rn = 1
    ) AS dissolution ON base.ehraid = dissolution.ehraid

    -- Filter out irrelevant records
    WHERE
        NOT base.is_branch
        AND shab.shab_date < '2024-01-01'
        AND base.legal_form_id IN ({legal_form_ids_sql})
        AND LOWER(base.name) NOT LIKE '%zweigniederlassung%'
        AND LOWER(base.name) NOT LIKE '%succursale%';
"""

In [None]:
# Fetch the founding-firm sample and turn the founding SHAB date into a pandas timestamp.
with connect_database() as db_connection:
    df_startups = read_from_database(query=query_founded_firms, connection=db_connection)

df_startups = df_startups.assign(founding_date=pd.to_datetime(df_startups['founding_date']))

In [481]:
# Duplicates come from firms with more than one "new inscription" entry in Zefix. Their history
# appears to contain errors, so every firm whose ehraid occurs more than once is removed entirely.
is_repeated_entry = df_startups.duplicated(subset=['ehraid', 'founding_town'], keep=False)
display(df_startups.loc[is_repeated_entry, 'uid'].unique())
df_startups = df_startups.drop_duplicates(keep=False, subset=['ehraid'])

array(['CHE305485209', 'CHE410995706', 'CHE187002757', 'CHE264500565',
       'CHE150817825', 'CHE442953971', 'CHE166625800', 'CHE464334776'],
      dtype=object)

### GEOCODE ADDRESS INFORMATION

In [None]:
# Fall back to the current address wherever a founding address field is missing
address_fallbacks = {
    'founding_street': 'current_street',
    'founding_zip_code': 'current_zip_code',
    'founding_town': 'current_town',
}
for founding_col, current_col in address_fallbacks.items():
    df_startups[founding_col] = df_startups[founding_col].fillna(df_startups[current_col])

In [116]:
# Sanity check: after the fallback no founding address field may be missing
for required_col in ('founding_street', 'founding_zip_code', 'founding_town'):
    assert not df_startups[required_col].isna().any()

In [None]:
# Two geocoders: Google (API key taken from the environment) and a Nominatim endpoint
# addressed as http://localhost:8080.
google_geolocator = GoogleV3(api_key=os.environ.get('GOOGLE_GEOCODE_API_KEY'))
nominatim_geolocator = Nominatim(scheme="http", domain="localhost:8080", user_agent="local_geocoder")


def geocode_address(nominatim_geolocator, google_geolocator, row):
    """Geocode a firm's founding address, trying Nominatim first and Google as fallback.

    Args:
        nominatim_geolocator: Geocoder whose ``geocode`` accepts a structured (dict) query.
        google_geolocator: Geocoder whose ``geocode`` accepts a free-text address string.
        row: Mapping or Series providing 'founding_street', 'founding_town' and
            'founding_zip_code' (the zip code must be convertible with ``int``).

    Returns:
        pd.Series of [address, latitude, longitude]. All three entries are None when neither
        geocoder finds the address or when any error occurs (the error is printed, not raised,
        so that a single bad row does not abort a long geocoding run).
    """
    try:
        street = row['founding_street']
        town = row['founding_town']
        zip_code = int(row['founding_zip_code'])

        location = nominatim_geolocator.geocode({
            'street': street,
            'city': town,
            'postalcode': zip_code,
            'country': 'Schweiz'
        }, timeout=2)

        if not location:
            # Structured dict queries are a Nominatim feature; GoogleV3 expects the address as
            # one free-text string, so build it instead of passing the dict through.
            location = google_geolocator.geocode(f"{street}, {zip_code} {town}, Schweiz", timeout=1)

        if location:
            return pd.Series([location.address, location.latitude, location.longitude])
        return pd.Series([None, None, None])
    except Exception as e:
        # Deliberately best-effort: report the problem and mark the row as not geocoded
        print(f"Error: {e}")
        return pd.Series([None, None, None])

In [None]:
# Geocode firm by firm; each call yields (address, latitude, longitude) for one row
geocoding_result = df_startups.apply(
    lambda firm: geocode_address(nominatim_geolocator, google_geolocator, firm),
    axis=1,
)
df_startups[['geocoded_address', 'latitude', 'longitude']] = geocoding_result

In [285]:
# Store the municipality (BFS) code and the zip code as integers
df_startups = df_startups.astype({'founding_bfs_code': int, 'founding_zip_code': int})

### DETERMINE BFS MUNICIPALITY CODE BY COORDINATES WHERE MISSING

In [372]:
# Municipality polygons with BFS number and population, reprojected to EPSG:4326 (lon/lat)
boundaries_path = EXTERNAL_DATA_DIR / 'geo_data' / 'swissBOUNDARIES3D_1_5_LV95_LN02.gdb'
municipality_cols = ['geometry', 'BFS_NUMMER', 'EINWOHNERZAHL']
gdf = gpd.read_file(boundaries_path, layer="TLM_HOHEITSGEBIET").to_crs("EPSG:4326")[municipality_cols]

# Turn the geocoded coordinates into points and look up the polygon each firm lies within
firm_points = gpd.points_from_xy(df_startups['longitude'], df_startups['latitude'])
df_startups = gpd.sjoin(
    gpd.GeoDataFrame(df_startups, geometry=firm_points, crs="EPSG:4326"),
    gdf,
    how="left",
    predicate="within",
)

# Blank the registry code where it is 0 (unmatched) ...
registry_code_unmatched = df_startups['founding_bfs_code'] == 0
df_startups.loc[registry_code_unmatched, 'founding_bfs_code'] = pd.NA

# ... or where a polygon was found and its code differs from the registry code
polygon_code = df_startups['BFS_NUMMER']
disagrees_with_polygon = polygon_code.notna() & (df_startups['founding_bfs_code'].astype(float) != polygon_code)
df_startups.loc[disagrees_with_polygon, 'founding_bfs_code'] = pd.NA

# Fill every blanked code with the code of the polygon the coordinates fall into
df_startups['founding_bfs_code'] = df_startups['founding_bfs_code'].fillna(polygon_code)

In [374]:
# Cast the reconciled code back to int. astype(int) raises if any firm still has a missing code,
# i.e. neither a usable registry code nor a polygon match.
df_startups['founding_bfs_code'] = df_startups['founding_bfs_code'].astype(int)

In [375]:
df_startups[df_startups.founding_bfs_code.astype(float) != df_startups.BFS_NUMMER][['founding_town', 'combined_address', 'founding_bfs_code', 'BFS_NUMMER']]

Unnamed: 0,founding_town,combined_address,founding_bfs_code,BFS_NUMMER
223603,Chiasso,"Via Henry Dunant 1, 6830 Chiasso",5250,
223613,Morcote,"Via Isella 11, 6922 Morcote",5203,
223639,Brusino Arsizio,"Via Lungolago 83, 6827 Brusino Arsizio",5160,
224506,San Bernardino,"Residenza Mons Avium , appartamento 25, 6565 S...",3822,
224785,La Tène,"route de Bellevue 7, 2074 La Tène",6513,
225829,Roggwil TG,"Im Pünst 1, 9325 Roggwil TG",4431,
225873,Bassins,"Chemin de Raulan 24, 1269 Bassins",5703,
226046,Warth,"Kartause Ittingen, 8532 Warth",4621,


In [None]:
# Remove the helper columns left over from the spatial join. errors='ignore' keeps the cell
# runnable when a listed column is absent: 'Unnamed: 0' is not created anywhere in this notebook
# (presumably a CSV-index artefact of an earlier session), and a re-run has no 'index_right'.
df_startups.drop(
    columns=['geometry', 'index_right', 'Unnamed: 0', 'BFS_NUMMER'],
    inplace=True,
    errors='ignore',
)

### ADD MUNICIPALITY TYPOLOGY

In [None]:
# Municipality typology table: drop the name columns and switch to English column names
typology_column_names = {
    'BFS Gde-nummer': 'founding_bfs_code',
    'Bezirks-nummer': 'district_id',
    'Kantons-nummer': 'canton_id',
    'Stadt/Land-Typologie': 'urban_rural',
    'Gemeindetypologie (9 Typen)': 'typology_9c',
    'Gemeindetypologie (25 Typen)': 'typology_25c',
}
df_typology = (
    pd.read_excel(EXTERNAL_DATA_DIR / 'geo_data' / 'Raumgliederungen.xlsx')
    .drop(columns=['Gemeindename', 'Bezirksname', 'Kanton'])
    .rename(columns=typology_column_names)
)

In [384]:
# Left join on the BFS code: firms whose code has no typology entry are kept, with NaN in the
# typology columns. Re-running this cell merges the typology columns a second time.
df_startups = df_startups.merge(df_typology, on='founding_bfs_code', how='left')

In [None]:
# EINWOHNERZAHL comes from the municipality polygons joined above; give it an English name
df_startups.rename(columns={'EINWOHNERZAHL': 'population'}, inplace=True)

In [389]:
df_startups[df_startups.canton_id.isna()]

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,combined_address,geocoded_address,latitude,longitude,population,canton_id,Bezirks-nummer,urban_rural,typology_9c,typology_25c
2469,1255845,CHE395917849,2020-06-19,['Nachdem kein begründeter Einspruch gegen die...,False,False,True,Same Same GmbH in Liquidation,Same Same GmbH,4,...,"Seeplatz 1, 8820 Wädenswil","1, Seeplatz, Wädenswil, Bezirk Horgen, Zürich,...",47.228758,8.676404,0.0,,,,,
80542,1389236,CHE291873431,,,,,,MS Glärnisch AG,MS Glärnisch AG,3,...,"Seeplatz 1, 8820 Wädenswil","1, Seeplatz, Wädenswil, Bezirk Horgen, Zürich,...",47.228758,8.676404,0.0,,,,,
223147,1310452,CHE338654358,,['Mit Entscheid vom 07.01.2025 hat der Einzelr...,False,True,False,Peter Jegen GmbH in Liquidation,Peter Jegen GmbH,4,...,"Sagastrasse 3, 7214 Grüsch","Sägastrasse 3, 9495 Triesen, Liechtenstein",47.088149,9.522204,5532.0,,,,,


In [392]:
df_startups.head()

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,combined_address,geocoded_address,latitude,longitude,population,canton_id,district_id,urban_rural,typology_9c,typology_25c
0,1251325,CHE153193257,,['Mit Urteil des Gerichtspräsidenten des Zivil...,True,False,False,Arlez Carrosserie GmbH in Liquidation,Arlez Carrosserie GmbH,4,...,"Bleichiweg 4, 4460 Gelterkinden","4, Bleichiweg, Gelterkinden, Bezirk Sissach, B...",47.460137,7.86118,6296.0,13.0,1304.0,2.0,21.0,217.0
1,1251326,CHE392024369,2020-11-11,[],False,False,True,Vista Coaching GmbH in Liquidation,Vista Coaching GmbH,4,...,"Ergolzstrasse 13, 4414 Füllinsdorf","13, Ergolzstrasse, Füllinsdorf, Bezirk Liestal...",47.504062,7.724207,4700.0,13.0,1303.0,1.0,11.0,113.0
2,1251327,CHE473646370,,,,,,Wissler Consulting GmbH,Wissler Consulting GmbH,4,...,"Wintersingerstrasse 18a, 4464 Maisprach","18a, Wintersingerstrasse, Maisprach, Bezirk Si...",47.523075,7.845789,941.0,13.0,1304.0,3.0,23.0,236.0
3,1251328,CHE205344235,,,,,,Wolf Regio GmbH,Wolf Regio GmbH,4,...,"Grellingerstrasse 32, 4142 Münchenstein","32, Grellingerstrasse, Münchenstein, Bezirk Ar...",47.518085,7.603328,12304.0,13.0,1301.0,1.0,11.0,112.0
4,1251329,CHE190527339,,"['Mit Entscheid vom 27.09.2022 , 9.15 Uhr , ha...",False,True,False,AHAS GmbH in Liquidation,AHAS GmbH,4,...,"Luzernstrasse 60, 6102 Malters","60, Luzernstrasse, Bühl, Malters, Luzern, 6102...",47.0363,8.177812,7771.0,3.0,312.0,2.0,21.0,216.0


In [655]:
# Persist the geocoded sample (without the index) so the geocoding does not have to be repeated
geo_coded_sample_path = RAW_DATA_DIR / 'company_sample' / 'geo_coded_company_sample.csv'
df_startups.to_csv(geo_coded_sample_path, index=False)

In [487]:
# Reload the geocoded sample. A CSV stores dates as text, so founding_date is parsed again here:
# it is later used as a merge key against the datetime founding_date of the registered capital,
# and pandas refuses to merge a text column with a datetime column.
df_startups = pd.read_csv(
    RAW_DATA_DIR / 'company_sample' / 'geo_coded_company_sample.csv',
    parse_dates=['founding_date'],
)

In [496]:
# Manual correction: set the founding legal form of firm 1285556 to 3 ('Corporation' in id2legalform).
# NOTE(review): the reason for this override is not recorded in the notebook — confirm and document it.
df_startups.loc[df_startups.ehraid == 1285556, 'founding_legal_form'] = 3.0

### ADD STARTING CAPITAL TO COMPANY DATA

In [None]:
# Download the daily price history of every foreign currency against CHF from Yahoo Finance
import yfinance as yf

exchange_rate_dfs = [
    yf.Ticker(f'{currency}CHF=X')
    .history(start='2016-01-01', end='2024-01-01')
    .assign(symbol=currency)
    for currency in ('EUR', 'GBP', 'USD')
]

In [None]:
# Build a gap-free table with one opening rate per calendar day and currency.
df_exchange_rates = pd.concat(exchange_rate_dfs).reset_index()[['Date', 'symbol', 'Open']]

# Reduce the timestamps to plain dates so they line up with the founding dates
df_exchange_rates = df_exchange_rates.rename(columns={'Date': 'founding_date'})
df_exchange_rates['founding_date'] = pd.to_datetime(df_exchange_rates['founding_date']).dt.date
df_exchange_rates['founding_date'] = pd.to_datetime(df_exchange_rates['founding_date'])

# Full calendar grid: every day of the period, once per currency (EUR block, GBP block, USD block)
df_temp = pd.DataFrame({'founding_date': pd.date_range(start='2016-01-01', end='2024-01-01').tolist() * 3})
df_temp['symbol'] = ['EUR'] * int(len(df_temp) / 3) + ['GBP'] * int(len(df_temp) / 3) + ['USD'] * int(len(df_temp) / 3)

df_exchange_rates = df_temp.merge(df_exchange_rates, on=['founding_date', 'symbol'], how='left')

# Fill days without a quote *within each currency*. A plain ffill over the stacked frame would
# carry the last EUR rate into the first GBP rows (and the last GBP rate into USD). Days before a
# currency's first quote are back-filled with that first quote instead of staying empty.
# (`symbol` comes from the calendar grid and is never missing, so it needs no filling.)
df_exchange_rates['Open'] = (
    df_exchange_rates
    .groupby('symbol')['Open']
    .transform(lambda rates: rates.ffill().bfill())
)

df_exchange_rates.to_csv(RAW_DATA_DIR / 'company_sample' / 'exchange_rates.csv', index=False)

In [439]:
# All registered-capital history entries with a shab_date before 2024-01-01
query_capital = """ 
    SELECT * FROM zefix.history_registered_capital WHERE shab_date < '2024-01-01';
"""

In [644]:
# Read the registered-capital history from the database
with connect_database() as db_connection:
    df_capital = read_from_database(query=query_capital, connection=db_connection)

In [None]:
# Harmonise the registered-capital history and convert every entry to CHF.
# Column names are aligned with df_startups / df_exchange_rates so they can be used as merge keys.
df_capital = df_capital.rename(columns={'shab_date': 'founding_date', 'currency_new': 'symbol'})
df_capital['founding_date'] = pd.to_datetime(df_capital['founding_date'])

# Map the currency spellings found in the data to ISO codes
mapping = {
    'Euro': 'EUR',
    'Eur': 'EUR',
    'EURO': 'EUR',
    '€': 'EUR',
    'fr': 'CHF',
    'Fr.': 'CHF',
    'CHE': 'CHF',
    '£': 'GBP',
    'US': 'USD'
}
df_capital['symbol'] = df_capital['symbol'].replace(mapping)

# When a firm has several capital entries on the same date, drop those carrying liberation
# (paid-in) information so the capital is not counted twice in the aggregation below.
# na=False: entries without a keyword count as "not a liberation entry" and are kept, instead of
# putting missing values into the boolean mask.
same_day_duplicate = df_capital.duplicated(subset=['ehraid', 'founding_date'], keep=False)
liberation_entry = df_capital['keyword'].str.contains('liberierung|liberato|libéré', regex=True, na=False)
df_capital = df_capital[~(same_day_duplicate & liberation_entry)]

# Drop entries where the currency is not a common currency
df_capital = df_capital[df_capital.symbol.isin(['CHF', 'EUR', 'USD', 'GBP'])]

# Add exchange rates and convert registered capital.
# Only CHF has an implicit rate of 1.0. A foreign-currency entry without a rate keeps a missing
# rate (and thus a missing CHF value) rather than silently being treated as if it were CHF.
df_capital = df_capital.merge(df_exchange_rates, on=['symbol', 'founding_date'], how='left')
df_capital.loc[df_capital['symbol'] == 'CHF', 'Open'] = 1.0
df_capital['capital_chf'] = df_capital['capital_new'].astype(float) * df_capital['Open'].astype(float)

# Aggregate capital into one value per firm and date. min_count=1 yields NaN (not 0) when none
# of a group's entries could be converted.
df_capital = (
    df_capital
    .groupby(['ehraid', 'founding_date'])['capital_chf']
    .sum(min_count=1)
    .reset_index()
)

In [None]:
# Attach the capital registered on the founding date. Left join: firms without a capital entry
# on exactly that date keep a missing capital_chf.
df_startups = df_startups.merge(df_capital[['ehraid', 'founding_date', 'capital_chf']], on=['ehraid', 'founding_date'], how='left')

In [654]:
df_startups

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,geocoded_address,latitude,longitude,population,canton_id,district_id,urban_rural,typology_9c,typology_25c,capital_chf
0,1251325,CHE153193257,,['Mit Urteil des Gerichtspräsidenten des Zivil...,True,False,False,Arlez Carrosserie GmbH in Liquidation,Arlez Carrosserie GmbH,4,...,"4, Bleichiweg, Gelterkinden, Bezirk Sissach, B...",47.460137,7.861180,6296.0,13.0,1304.0,2.0,21.0,217.0,20000.0
1,1251326,CHE392024369,2020-11-11,[],False,False,True,Vista Coaching GmbH in Liquidation,Vista Coaching GmbH,4,...,"13, Ergolzstrasse, Füllinsdorf, Bezirk Liestal...",47.504062,7.724207,4700.0,13.0,1303.0,1.0,11.0,113.0,20000.0
2,1251327,CHE473646370,,,,,,Wissler Consulting GmbH,Wissler Consulting GmbH,4,...,"18a, Wintersingerstrasse, Maisprach, Bezirk Si...",47.523075,7.845789,941.0,13.0,1304.0,3.0,23.0,236.0,20000.0
3,1251328,CHE205344235,,,,,,Wolf Regio GmbH,Wolf Regio GmbH,4,...,"32, Grellingerstrasse, Münchenstein, Bezirk Ar...",47.518085,7.603328,12304.0,13.0,1301.0,1.0,11.0,112.0,20000.0
4,1251329,CHE190527339,,"['Mit Entscheid vom 27.09.2022 , 9.15 Uhr , ha...",False,True,False,AHAS GmbH in Liquidation,AHAS GmbH,4,...,"60, Luzernstrasse, Bühl, Malters, Luzern, 6102...",47.036300,8.177812,7771.0,3.0,312.0,2.0,21.0,216.0,20000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226553,1618578,CHE349470093,,,,,,SIRAP Holding SA,SIRAP Holding SA,3,...,"Z.I. Moulin du Choc E, 1122 Romanel-sur-Morges...",46.560951,6.521709,462.0,22.0,2227.0,3.0,23.0,236.0,100000.0
226554,1618579,CHE306057827,,,,,,SIRAP Immobilier SA,SIRAP Immobilier SA,3,...,"Z.I. Moulin du Choc E, 1122 Romanel-sur-Morges...",46.560951,6.521709,462.0,22.0,2227.0,3.0,23.0,236.0,102000.0
226555,1618768,CHE130017661,,,,,,Ambiens Estates SA,Ambiens Estates SA,3,...,"Crans-Montana, Switzerland",46.311858,7.482353,10488.0,23.0,2311.0,1.0,13.0,134.0,300000.0
226556,1618773,CHE275237254,,,,,,Heritage Estates SA,Heritage Estates SA,3,...,"Crans-Montana, Switzerland",46.311858,7.482353,10488.0,23.0,2311.0,1.0,13.0,134.0,300000.0


### ADD WEBSITE URLS TO COMPANY DATA

In [23]:
# Scraped company URLs, reduced to the first row per uid
scraped_urls_path = RAW_DATA_DIR / 'company_urls' / 'scraped_company_urls.csv'
websites = pd.read_csv(scraped_urls_path).drop_duplicates(keep='first', subset=['uid'])

In [27]:
websites

Unnamed: 0,uid,noga,company_url
0,CHE395937898,749000,no website available
1,CHE142825231,812100,no website available
2,CHE171766547,464700,no website available
3,CHE430662484,257300,no website available
4,CHE156125157,829900,no website available
...,...,...,...
618169,CHE452592018,855904,no website available
618170,CHE104902451,855904,no website available
618171,CHE103332769,855904,http://www.commercants-lausannois.ch/
618172,CHE390937135,561001,no website available


### ENCODE FIRM-LEVEL FEATURES

# **PROCESS INSCRIBED PEOPLE/FIRMS FEATURES**

In [396]:
from success_prediction.zefix_processing.clustering import PersonClustering

pd.set_option('future.no_silent_downcasting', True)

In [397]:
# Inscribed people flagged as founders (founders = TRUE) with a shab_date before 2024-01-01
query_inscribed_people = """ 
    SELECT * FROM zefix.history_inscribed_people WHERE founders = TRUE AND shab_date < '2024-01-01';
"""

# All inscribed firms with a shab_date before 2024-01-01 (no founder filter is applied here)
query_inscribed_firms = """ 
    SELECT * FROM zefix.history_inscribed_firms WHERE shab_date < '2024-01-01';
"""

In [398]:
# Load inscribed people and inscribed firms over a single database connection
with connect_database() as db_connection:
    df_insc_people = read_from_database(query=query_inscribed_people, connection=db_connection)
    df_insc_firms = read_from_database(query=query_inscribed_firms, connection=db_connection)

In [400]:
df_insc_firms

Unnamed: 0,ehraid,shab_date,shab_id,keyword,firm_name,firm_uid,firm_seat,firm_type,firm_shares
0,256,2017-03-02,3380321,nouvel organe de révision,Fiprom S.A. Fiduciaire de Prométerre,CHE-108.474.342,Lausanne,,
1,256,2017-03-02,3380321,personnes inscrites special,Hervest Fiduciaire SA,CHE-107.877.252,,organe de révision,
2,283,2018-07-12,4353769,nouvel organe de révision,KPMG AG,CHE-106.084.881,Zurich,,
3,283,2020-07-14,1004936962,nouvel organe de révision,Ernst & Young AG,CHE-491.907.686,Zurich,succursale,
4,371,2020-03-13,1004852619,personnes inscrites special,CO1 LLC,7553319,"Lewes, USA",associée,50 parts de CHF 1'000
...,...,...,...,...,...,...,...,...,...
172971,1619464,2023-12-29,1005922598,eingetragene personen,Aircon Holding AG,CHE-385.982.271,Otelfingen,Gesellschafterin,mit 10 Stammanteilen zu je CHF 1000.00
172972,1619490,2023-12-29,1005922617,eingetragene personen,Mäder + Baumgartner Treuhand AG,CHE-103.815.364,Neuhausen am Rheinfall,Revisionsstelle,
172973,1619494,2023-12-29,1005922621,eingetragene personen,KBT Revisions AG,CHE-102.663.608,Zürich,Revisionsstelle,
172974,1619495,2023-12-29,1005922622,eingetragene personen,KPMG AG,CHE-106.084.881,Zürich,Revisionsstelle,


In [399]:
df_insc_people

Unnamed: 0,ehraid,shab_date,shab_id,keyword,first_name,first_name_norm,last_name,last_name_norm,job_title,signing_rights,...,place_of_residence_1_bfs_stand_origin,place_of_residence_2_bfs_gmde_code_origin,place_of_residence_2_bfs_stand_origin,hometown_1_bfs_gmde_code_latest,hometown_2_bfs_gmde_code_latest,hometown_3_bfs_gmde_code_latest,hometown_4_bfs_gmde_code_latest,hometown_5_bfs_gmde_code_latest,place_of_residence_1_bfs_gmde_code_latest,place_of_residence_2_bfs_gmde_code_latest
0,1549301,2022-08-17,1005542691,eingetragene personen,Jeannine Michelle,jeannine michelle,Ita,ita,Vorstandsmitglied,ohne Unterschrift,...,01-01-2025,0,,351,,,,,,
1,1550516,2022-08-29,1005549619,eingetragene personen,Jo-Ann,jo-ann,Coronel,coronel,Mitglied,ohne Zeichnungsberechtigung,...,31-12-1995,0,,261,,,,,,
2,1555037,2022-10-03,1005573889,eingetragene personen,Cédric Mathieu,cedric mathieu,Schneider,schneider,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,,0,,2701,,,,,,
3,1562339,2022-11-25,1005612516,eingetragene personen,Robert Andrew,robert andrew,Graf,graf,Mitglied des Verwaltungsrates,mit Einzelunterschrift,...,01-01-2025,0,,2972,,,,,,
4,1564573,2022-12-09,1005624920,eingetragene personen,Mark Andrea,mark andrea,Hoffmann,hoffmann,,mit Kollektivunterschrift zu zweien,...,01-01-2025,0,,230,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581649,1285731,2016-12-06,3204915,personnes inscrites special,Yael,yael,Levy,levy,,,...,01-01-2025,0,,0,,,,,,
581650,1289516,2017-01-06,3265647,personnes inscrites special,Jonathan,jonathan,da Silva Pombo,da silva pombo,,avec signature collective à deux,...,01-01-2025,0,,0,,,,,,
581651,1305383,2017-05-16,3524955,personnes inscrites special,Jonathan,jonathan,Defrancesco,defrancesco,,,...,01-01-2025,0,,0,,,,,,
581652,1321872,2017-10-06,3795639,personnes inscrites special,Sabine,sabine,Nemec-Piguet,nemec-piguet,,avec signature collective à deux avec le prési...,...,01-01-2025,0,,0,,,,,,


In [403]:
# Pre-process dataframe: store all BFS municipality codes as text and treat the code '0' as missing
# NOTE(review): astype(str) may also turn values that were already missing into text such as
# 'nan' / 'None' — confirm that this is what PersonClustering expects.
bfs_code_cols = list(filter(lambda name: 'bfs_gmde_code_' in name, df_insc_people.columns))
bfs_codes_as_text = df_insc_people[bfs_code_cols].astype(str)
df_insc_people[bfs_code_cols] = bfs_codes_as_text.replace('0', np.nan)

In [None]:
# test_df = people_df[people_df.ehraid.isin([1600448, 1251490, 1260743, 1328630])].reset_index(drop=True).copy()

In [404]:
# Run the project's PersonClustering on the founder records.
# NOTE(review): presumably the `fid` column of the result identifies the clustered person —
# confirm against the PersonClustering implementation.
clustering = PersonClustering(df_insc_people)
clustered_df = clustering.cluster()

Cluster people within company: 100%|██████████| 357357/357357 [05:53<00:00, 1011.56it/s]


In [405]:
clustered_df

Unnamed: 0,ehraid,shab_date,shab_id,keyword,first_name,first_name_norm,last_name,last_name_norm,job_title,signing_rights,...,place_of_residence_2_bfs_stand_origin,hometown_1_bfs_gmde_code_latest,hometown_2_bfs_gmde_code_latest,hometown_3_bfs_gmde_code_latest,hometown_4_bfs_gmde_code_latest,hometown_5_bfs_gmde_code_latest,place_of_residence_1_bfs_gmde_code_latest,place_of_residence_2_bfs_gmde_code_latest,heuristic,fid
178772,1251436,2016-02-03,2637391,eingetragene personen,Hans-Peter Gunnar,hans-peter gunnar,Lennhag,lennhag,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,...,,4195,,,,,,,,1
178798,1251492,2016-02-03,2637449,eingetragene personen,Roman,roman,Schleiss,schleiss,Mitglied des Stiftungsrates,mit Kollektivunterschrift zu zweien,...,,1402,,,,,,,,2
178799,1251456,2016-02-03,2636785,eingetragene personen,Peter,peter,von Gunten,von gunten,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,...,,938,,,,,,,,3
178800,1251457,2016-02-03,2636787,eingetragene personen,Roland,roland,Kalt,kalt,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,...,,261,,,,,,,,4
178801,1251459,2016-02-03,2636791,eingetragene personen,Raffaele,raffaele,Nardone,nardone,Präsident des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,,3395,,,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576773,1619418,2023-12-29,1005923336,eingetragene personen,Sabrina,sabrina,Marbet,marbet,Mitglied des Vorstandes,mit Einzelunterschrift,...,,612,,,,,,,,384812
576772,1619418,2023-12-29,1005923336,eingetragene personen,Martin,martin,Marbet,marbet,Präsident des Vorstandes,mit Einzelunterschrift,...,,2404,,,,,,,,463418
576771,1619416,2023-12-29,1005923335,eingetragene personen,Robyn Brayan,robyn brayan,Nobs,nobs,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,...,,360,,,,,,,,463419
576769,1619396,2023-12-29,1005923331,eingetragene personen,Marco Andreas,marco andreas,Zühlke,zuehlke,Gesellschafter,Einzelunterschrift,...,,2939,,,,,,,,225405


# **PROCESS ADDITIONAL OUTPUT FEATURES**

### INVOLUNTARY EXIT TARGET

### ACQUISITION TARGET

In [None]:
# All merger history entries with a shab_date before 2024-01-01
query_merger = """ 
    SELECT * FROM zefix.history_merger WHERE shab_date < '2024-01-01';
"""

In [None]:
# Read the merger history from the database
with connect_database() as db_connection:
    df_merger = read_from_database(query=query_merger, connection=db_connection)

### FUNDING TARGET

### NEW PATENT TARGET